# Charging Analysis v2 - 基于本地轨迹数据的充电行为分析

本 notebook 基于已下载到本地的车辆轨迹数据（parquet格式），重新实现充电事件检测、排队模拟、能耗和SOC计算等分析流程。

## 主要改进
- 直接读取本地 parquet 轨迹数据（按 taxiid 分目录存储）
- 基于站点半径 + 连续停留时间的充电检测逻辑
- 考虑数据缺失情况下的能耗计算
- 完整的排队模拟和SOC轨迹追踪


## 1. 导入库和工具函数


In [1]:
import cudf
import cupy as cp
import pandas as pd
import numpy as np
import os
from pathlib import Path
from scipy.spatial import cKDTree
from heapq import heappush, heappop
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster

# 配置日志
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")


In [2]:
# Haversine距离计算（GPU加速）
def haversine_distance_gpu(lon1, lat1, lon2, lat2):
    """
    计算两点间的大圆距离（米），使用cupy进行GPU加速
    """

    # lon1 = cp.asarray(lon1.values if hasattr(lon1, 'values') else lon1)
    # lat1 = cp.asarray(lat1.values if hasattr(lat1, 'values') else lat1)
    # lon2 = cp.asarray(lon2.values if hasattr(lon2, 'values') else lon2)
    # lat2 = cp.asarray(lat2.values if hasattr(lat2, 'values') else lat2)

    lon1_rad = cp.radians(lon1.astype(cp.float64))
    lat1_rad = cp.radians(lat1.astype(cp.float64))
    lon2_rad = cp.radians(lon2.astype(cp.float64))
    lat2_rad = cp.radians(lat2.astype(cp.float64))

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad
    a = cp.sin(dlat / 2)**2 + cp.cos(lat1_rad) * cp.cos(lat2_rad) * cp.sin(dlon / 2)**2
    c = 2 * cp.arcsin(cp.sqrt(a))
    R = 6371000  # 地球半径（米）
    return c * R


## 2. 参数设置


In [3]:
# ========== 充电检测参数 ==========
R_STATION = 200          # 充电站匹配半径（米）
T_STAY = 600             # 最小停留时间（秒），10分钟
T_GAP = 300              # 同一站点内连续点的最大时间间隔（秒），超过则断开
PASSENGER_THR = 0        # 空载判断：passenger == 0

# ========== 能耗和电池参数 ==========
CAP_KWH = 80             # 电池容量（kWh）
CONS_KWH_PER_KM = 0.2   # 能耗率（kWh/km）
AVG_SPEED_KMH = 20       # 平均速度（km/h），用于缺数据时的估算
CHG_EFFICIENCY = 1.0     # 充电效率

# ========== 数据缺失处理 ==========
MAX_GAP_SECONDS = 1800   # 最大允许数据缺失时间（秒）30分钟，超过则不计能耗,视为断开GPS

# ========== 路径设置 ==========
TRACKS_DIR = '../data/taxi_trajectories.parquet'  # 轨迹数据根目录
STATION_CSV = 'station_information.csv'           # 充电站信息
PILE_CSV = 'pile_rated_power.csv'                 # 充电桩功率信息

# ========== 输出文件 ==========
OUTPUT_DIR = 'output_v2'
os.makedirs(OUTPUT_DIR, exist_ok=True)
CHARGING_EVENTS_FILE = f'{OUTPUT_DIR}/charging_events.parquet'
CHAR_QUEUE_FILE = f'{OUTPUT_DIR}/char_queue.parquet'
ENERGY_GAP_FILE = f'{OUTPUT_DIR}/energy_gap.parquet'
BATTERY_TRACE_FILE = f'{OUTPUT_DIR}/battery_trace.parquet'
PRIVATE_GAP_FILE = f'{OUTPUT_DIR}/private_gap_charging_events.parquet'

# ========== 并行处理参数 ==========
MAX_WORKERS = 8          # 并行处理的线程数


## 3. 充电站数据预处理


In [4]:
# 读取充电站和充电桩数据
stations_df = pd.read_csv(STATION_CSV, dtype={'station_id': np.int32})
piles_df = pd.read_csv(PILE_CSV, dtype={'station_id': np.int32, 'pile_id': np.int32})

# 过滤有效充电桩（功率在合理范围内）
max_power = 60
valid_piles_df = piles_df[(piles_df['power'] > 0) & (piles_df['power'] <= max_power)].copy()

# 统计每个站的充电桩信息
stat_pile_df = valid_piles_df.groupby('station_id').agg(
    num_piles=('power', 'count'),
    avg_power=('power', 'mean'),
    max_power=('power', 'max'),
    min_power=('power', 'min')
).reset_index()

# 合并站点信息和桩统计
stations_df = stations_df.merge(stat_pile_df, on='station_id', how='inner')

# 只保留有充电桩的站点
stations_df = stations_df[stations_df['num_piles'] > 0].copy()

print(f"加载了 {len(stations_df)} 个有效充电站")
print(f"总充电桩数: {stations_df['num_piles'].sum()}")
print(f"平均每站桩数: {stations_df['num_piles'].mean():.1f}")


加载了 1478 个有效充电站
总充电桩数: 19988
平均每站桩数: 13.5


In [5]:
# 构建KDTree用于快速最近邻查询（CPU）
station_coords = stations_df[['longitude', 'latitude']].to_numpy()
station_tree = cKDTree(station_coords)
station_id_lookup = stations_df['station_id'].to_numpy()

print("充电站KDTree构建完成")


充电站KDTree构建完成


## 4. 充电事件检测

### 4.1 单车辆充电检测函数


In [6]:
def detect_charging_for_one_taxi(taxiid, tracks_dir, station_tree, station_id_lookup, stations_df):
    """
    检测单辆车的充电事件

    逻辑：
    1. 读取该车的轨迹数据
    2. 筛选空载点（passenger == 0）
    3. 对每个点查询最近充电站
    4. 筛选距离 <= R_STATION 的点
    5. 按站点和时间连续性分段
    6. 保留持续时间 >= T_STAY 的段作为充电事件

    返回：DataFrame with columns: taxiid, nearest_station_id, start_time, end_time,
          duration_s, stay_lon, stay_lat, num_points
    """
    try:
        # 读取轨迹数据
        track_path = os.path.join(tracks_dir, f'taxiid={taxiid}')
        if not os.path.exists(track_path):
            return pd.DataFrame()

        # 读取parquet文件（可能有多个分区文件）
        track_files = list(Path(track_path).glob('*.parquet'))
        if not track_files:
            return pd.DataFrame()

        # 读取并合并所有分区
        track = pd.concat([pd.read_parquet(f) for f in track_files], ignore_index=True)

        if track.empty:
            return pd.DataFrame()

        # 按时间排序
        track = track.sort_values('time').reset_index(drop=True)

        # 筛选空载点
        track = track[track['passenger'] == PASSENGER_THR].copy()

        if track.empty:
            return pd.DataFrame()

        # 查询最近充电站（CPU）
        track_coords = track[['lon', 'lat']].to_numpy()
        distances_deg, indices = station_tree.query(track_coords, k=1)
        distances_m = distances_deg * 111320  # 转换为米

        # 筛选在充电站半径内的点
        mask_in_station = distances_m <= R_STATION
        track_in_station = track[mask_in_station].copy()

        if track_in_station.empty:
            return pd.DataFrame()

        # 添加站点ID和距离信息
        track_in_station['nearest_station_id'] = station_id_lookup[indices[mask_in_station]]
        track_in_station['distance_to_station'] = distances_m[mask_in_station]

        # 按站点ID和时间排序
        track_in_station = track_in_station.sort_values(['nearest_station_id', 'time']).reset_index(drop=True)

        # 分段：同一站点内连续的点为一组
        # 如果时间间隔 > T_GAP 或站点ID变化，则断开
        track_in_station['prev_time'] = track_in_station.groupby('nearest_station_id')['time'].shift()
        track_in_station['prev_station'] = track_in_station['nearest_station_id'].shift()

        track_in_station['dt'] = (track_in_station['time'] - track_in_station['prev_time']).dt.total_seconds()
        track_in_station['station_changed'] = (track_in_station['nearest_station_id'] != track_in_station['prev_station'])

        # 标记段的开始（第一个点，或时间间隔过大，或站点变化）
        track_in_station['segment_start'] = (
            track_in_station['prev_time'].isna() |
            (track_in_station['dt'] > T_GAP) |
            track_in_station['station_changed']
        )

        # 分配段ID
        track_in_station['segment_id'] = track_in_station['segment_start'].cumsum()

        # 聚合每个段
        segments = track_in_station.groupby(['nearest_station_id', 'segment_id']).agg(
            start_time=('time', 'min'),
            end_time=('time', 'max'),
            stay_lon=('lon', 'mean'),
            stay_lat=('lat', 'mean'),
            num_points=('time', 'count'),
            avg_distance=('distance_to_station', 'mean')
        ).reset_index()

        # 计算持续时间
        segments['duration_s'] = (segments['end_time'] - segments['start_time']).dt.total_seconds()

        # 筛选满足最小停留时间的段
        charging_events = segments[segments['duration_s'] >= T_STAY].copy()

        # 添加taxiid
        charging_events['taxiid'] = taxiid

        # 选择输出列
        result = charging_events[['taxiid', 'nearest_station_id', 'start_time', 'end_time',
                                   'duration_s', 'stay_lon', 'stay_lat', 'num_points']].copy()

        return result

    except Exception as e:
        logging.error(f"处理车辆 {taxiid} 时出错: {e}")
        return pd.DataFrame()


### 4.2 批量处理所有车辆


In [7]:
# 获取所有车辆ID（从目录结构）
def get_all_taxiids(tracks_dir):
    """从目录结构中提取所有taxiid"""
    tracks_path = Path(tracks_dir)
    if not tracks_path.exists():
        logging.error(f"轨迹数据目录不存在: {tracks_dir}")
        return []

    taxiids = []
    for item in tracks_path.iterdir():
        if item.is_dir() and item.name.startswith('taxiid='):
            taxiid = item.name.replace('taxiid=', '')
            taxiids.append(taxiid)

    return sorted(taxiids)


In [8]:
# 获取所有车辆ID
all_taxiids = get_all_taxiids(TRACKS_DIR)
logging.info(f"找到 {len(all_taxiids)} 辆车")

2025-11-13 17:11:33,490 | INFO | 找到 19495 辆车


In [9]:
# 如果文件已存在，可以选择跳过或重新计算
if os.path.exists(CHARGING_EVENTS_FILE):
    logging.info(f"充电事件文件已存在: {CHARGING_EVENTS_FILE}")
    logging.info("如需重新计算，请删除该文件后重新运行")
else:
    # 批量处理
    all_charging_events = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(detect_charging_for_one_taxi, tid, TRACKS_DIR,
                          station_tree, station_id_lookup, stations_df): tid
            for tid in all_taxiids
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="检测充电事件"):
            taxiid = futures[future]
            try:
                result = future.result()
                if not result.empty:
                    all_charging_events.append(result)
            except Exception as e:
                logging.error(f"车辆 {taxiid} 处理失败: {e}")

    # 合并所有结果
    if all_charging_events:
        charging_events = pd.concat(all_charging_events, ignore_index=True)
        charging_events = charging_events.sort_values(['taxiid', 'start_time']).reset_index(drop=True)

        # 保存结果
        charging_events.to_parquet(CHARGING_EVENTS_FILE, index=False)
        logging.info(f"检测到 {len(charging_events)} 个充电事件，已保存到 {CHARGING_EVENTS_FILE}")
        logging.info(f"涉及 {charging_events['taxiid'].nunique()} 辆车")
    else:
        logging.warning("未检测到任何充电事件")
        charging_events = pd.DataFrame()

检测充电事件: 100%|██████████| 19495/19495 [13:56<00:00, 23.31it/s] 
2025-11-13 17:25:37,742 | INFO | 检测到 186097 个充电事件，已保存到 output_v2/charging_events.parquet
2025-11-13 17:25:37,755 | INFO | 涉及 19323 辆车


In [6]:
# 加载充电事件数据（如果已保存）
if os.path.exists(CHARGING_EVENTS_FILE):
    charging_events = pd.read_parquet(CHARGING_EVENTS_FILE)
    logging.info(f"已加载 {len(charging_events)} 个充电事件")
    logging.info(f"涉及 {charging_events['taxiid'].nunique()} 辆车")
    print(charging_events.head())
else:
    logging.warning("充电事件文件不存在，请先运行检测步骤")


2025-11-14 15:15:33,797 | INFO | 已加载 186097 个充电事件
2025-11-14 15:15:33,810 | INFO | 涉及 19323 辆车


      taxiid  nearest_station_id          start_time            end_time  \
0  UUUB0C0M7                2266 2020-01-02 01:21:09 2020-01-02 01:35:24   
1  UUUB0C0M7                2205 2020-01-03 02:02:20 2020-01-03 02:13:40   
2  UUUB0C0M7                2330 2020-01-04 04:33:45 2020-01-04 04:57:53   
3  UUUB0C0M7                2159 2020-01-05 02:24:03 2020-01-05 02:36:17   
4  UUUB0C0M7                2161 2020-01-07 12:00:35 2020-01-07 12:40:05   

   duration_s    stay_lon   stay_lat  num_points  
0       855.0  114.129456  22.547155          17  
1       680.0  114.121628  22.543396          20  
2      1448.0  114.142906  22.556120          57  
3       734.0  114.115372  22.558201          20  
4      2370.0  114.116241  22.580210          51  


# 4.3 recharge at private stations

In [10]:
def detect_private_gap_charging(taxiid, tracks_dir, station_tree, station_id_lookup,
                                 dt_threshold=3000, night_start=22, night_end=6,
                                 min_station_distance=200, max_dist=200, up_dt_threshold=7200):
    """
    检测单辆车的私人充电事件（基于深夜长缺失）

    逻辑：
    1. 只保留 zoneid > 0 的轨迹点
    2. 计算相邻点时间间隔 dt
    3. 筛选 dt >= dt_threshold（默认50分钟）的缺失
    4. 判断缺失开始时间在夜间窗口（22:00-06:00）
    5. 检查缺失开始位置距离最近公共充电站 > min_station_distance（默认1000米）

    参数：
    - taxiid: 车辆ID
    - tracks_dir: 轨迹数据目录
    - station_tree: 充电站KDTree（用于快速查询）
    - station_id_lookup: 充电站ID查找数组
    - dt_threshold: 最小缺失时长（秒），默认3000秒（50分钟）
    - night_start: 夜间开始时间（小时），默认22
    - night_end: 夜间结束时间（小时），默认6
    - min_station_distance: 距离最近公共充电站的最小距离（米），默认1000米

    返回：DataFrame with columns: taxiid, start_time, end_time, gap_s, charge_type
    """
    try:
        # 读取轨迹数据
        track_path = os.path.join(tracks_dir, f'taxiid={taxiid}')
        if not os.path.exists(track_path):
            return pd.DataFrame()

        track_files = list(Path(track_path).glob('*.parquet'))
        if not track_files:
            return pd.DataFrame()

        # 读取轨迹数据（使用pandas）
        track = pd.concat([pd.read_parquet(f) for f in track_files], ignore_index=True)
        track = track.sort_values('time').reset_index(drop=True)

        if track.empty:
            return pd.DataFrame()

        # 1. 只保留 zoneid > 0 的点
        if 'zoneid' in track.columns:
            track = track[track['zoneid'] > 0].copy()
        else:
            # 如果没有zoneid列，记录警告但继续处理
            logging.warning(f"车辆 {taxiid} 没有zoneid列，跳过zoneid过滤")
            # 可以选择直接返回空DataFrame
            # return pd.DataFrame()

        if track.empty or len(track) < 2:
            return pd.DataFrame()

        # 2. 计算相邻点的时间间隔
        track['prev_time'] = track['time'].shift()
        track['prev_lon'] = track['lon'].shift()
        track['prev_lat'] = track['lat'].shift()
        track['dt'] = (track['time'] - track['prev_time']).dt.total_seconds()

        # 使用简单的haversine函数计算距离（CPU版本，避免GPU数组问题）
        def haversine_simple(lon1, lat1, lon2, lat2):
            """简单的haversine距离计算（CPU版本）"""
            lon1_rad = np.radians(lon1)
            lat1_rad = np.radians(lat1)
            lon2_rad = np.radians(lon2)
            lat2_rad = np.radians(lat2)
            dlon = lon2_rad - lon1_rad
            dlat = lat2_rad - lat1_rad
            a = np.sin(dlat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon/2)**2
            c = 2 * np.arcsin(np.sqrt(a))
            R = 6371000  # 地球半径（米）
            return c * R

        # 填充NaN值
        prev_lon_filled = track['prev_lon'].fillna(track['lon'])
        prev_lat_filled = track['prev_lat'].fillna(track['lat'])

        # 计算距离（使用.values转换为numpy数组）
        track['dist'] = haversine_simple(
            track['lon'].values, track['lat'].values,
            prev_lon_filled.values, prev_lat_filled.values
        )


        # 3. 筛选 dt >= dt_threshold 的缺失
        long_gaps = track[((track['dt'] >= dt_threshold) & (track['dist'] <= max_dist)) | (track['dt']>=up_dt_threshold)].copy()

        if long_gaps.empty:
            return pd.DataFrame()

        # 4. 判断缺失开始时间是否在夜间窗口（22:00-06:00）
        long_gaps['gap_start_hour'] = long_gaps['prev_time'].dt.hour

        # 夜间窗口：22:00-23:59 或 00:00-05:59
        # night_mask = (
        #     (long_gaps['gap_start_hour'] >= night_start) |  # 22:00-23:59
        #     (long_gaps['gap_start_hour'] < night_end)      # 00:00-05:59
        # )

        #night_gaps = long_gaps[night_mask].copy()
        night_gaps = long_gaps.copy()
        if night_gaps.empty:
            return pd.DataFrame()

        # 5. 检查缺失开始位置距离最近公共充电站的距离
        # 使用KDTree查询最近充电站
        gap_start_coords = night_gaps[['prev_lon', 'prev_lat']].to_numpy()
        distances_deg, indices = station_tree.query(gap_start_coords, k=1)
        distances_m = distances_deg * 111320  # 转换为米（粗略转换，更精确可以用haversine）

        # 筛选距离最近充电站 > min_station_distance 的缺失
        far_from_station_mask = distances_m > min_station_distance
        private_gaps = night_gaps[far_from_station_mask].copy()

        if private_gaps.empty:
            return pd.DataFrame()

        # 构建结果
        private_charges = []
        for idx, row in private_gaps.iterrows():
            private_charges.append({
                'taxiid': taxiid,
                'start_time': row['prev_time'],  # 缺失开始时间
                'end_time': row['time'],         # 缺失结束时间（数据重新出现）
                'gap_s': row['dt'],              # 缺失时长（秒）
                'start_lon': row['prev_lon'],   # 缺失开始位置经度
                'start_lat': row['prev_lat'],   # 缺失开始位置纬度
                'charge_type': 'private_gap'     # 标记为私人充电（缺失型）
            })

        if private_charges:
            return pd.DataFrame(private_charges)
        else:
            return pd.DataFrame()

    except Exception as e:
        logging.error(f"检测车辆 {taxiid} 私人充电事件时出错: {e}")
        import traceback
        logging.error(traceback.format_exc())
        return pd.DataFrame()


def batch_detect_private_gap_charging(all_taxiids, tracks_dir, station_tree, station_id_lookup,
                                       output_file, dt_threshold=3000, night_start=22, night_end=6,
                                       min_station_distance=200,max_dist=200, up_dt_threshold=7200):
    """
    批量检测所有车辆的私人充电事件（深夜长缺失型）

    参数：
    - all_taxiids: 所有车辆ID列表
    - tracks_dir: 轨迹数据目录
    - station_tree: 充电站KDTree（用于快速查询）
    - station_id_lookup: 充电站ID查找数组
    - output_file: 输出文件路径
    - dt_threshold: 最小缺失时长（秒），默认3000秒（50分钟）
    - night_start: 夜间开始时间（小时），默认22
    - night_end: 夜间结束时间（小时），默认6
    - min_station_distance: 距离最近公共充电站的最小距离（米），默认1000米
    """
    if os.path.exists(output_file):
        logging.info(f"私人充电事件文件已存在: {output_file}")
        logging.info("如需重新计算，请删除该文件后重新运行")
        return pd.read_parquet(output_file)

    all_private_charges = []

    for taxiid in tqdm(all_taxiids, desc="检测私人充电事件（深夜长缺失）"):
        result = detect_private_gap_charging(
            taxiid, tracks_dir, station_tree, station_id_lookup,
            dt_threshold=dt_threshold,
            night_start=night_start,
            night_end=night_end,
            min_station_distance=min_station_distance,
            max_dist=max_dist,
            up_dt_threshold=up_dt_threshold
        )
        if not result.empty:
            all_private_charges.append(result)

    if all_private_charges:
        private_charges_df = pd.concat(all_private_charges, ignore_index=True)
        private_charges_df = private_charges_df.sort_values(['taxiid', 'start_time']).reset_index(drop=True)

        # 保存结果
        private_charges_df.to_parquet(output_file, index=False)
        logging.info(f"检测到 {len(private_charges_df)} 个私人充电事件（深夜长缺失），已保存到 {output_file}")
        logging.info(f"涉及 {private_charges_df['taxiid'].nunique()} 辆车")

        # 基本统计
        avg_gap_hours = private_charges_df['gap_s'].mean() / 3600
        median_gap_hours = private_charges_df['gap_s'].median() / 3600
        logging.info(f"平均缺失时长: {avg_gap_hours:.2f} 小时")
        logging.info(f"中位数缺失时长: {median_gap_hours:.2f} 小时")

        return private_charges_df
    else:
        logging.warning("未检测到任何私人充电事件")
        return pd.DataFrame()




In [11]:
# ========== 使用示例 ==========
# 设置参数

DT_THRESHOLD = 1800  # 30分钟
NIGHT_START = 22     # 晚上10点
NIGHT_END = 6        # 早上6点
MIN_STATION_DISTANCE = 200
MAX_DISTANCE = 500
UP_DT_THRESHOLD = 3600  # 60分钟

# 批量检测（需要传入station_tree和station_id_lookup）
private_gap_charges_df = batch_detect_private_gap_charging(
    all_taxiids,
    TRACKS_DIR,
    station_tree,      # 从之前的代码中获取
    station_id_lookup, # 从之前的代码中获取
    PRIVATE_GAP_FILE,
    dt_threshold=DT_THRESHOLD,
    night_start=NIGHT_START,
    night_end=NIGHT_END,
    min_station_distance=MIN_STATION_DISTANCE,
    max_dist= MAX_DISTANCE,
    up_dt_threshold=UP_DT_THRESHOLD
)

# 查看结果
if not private_gap_charges_df.empty:
    print("\n私人充电事件示例：")
    print(private_gap_charges_df.head(10))

    print(f"\n总事件数: {len(private_gap_charges_df)}")
    print(f"涉及车辆数: {private_gap_charges_df['taxiid'].nunique()}")

检测私人充电事件（深夜长缺失）: 100%|██████████| 19495/19495 [05:18<00:00, 61.22it/s]
2025-11-13 18:26:02,833 | INFO | 检测到 372251 个私人充电事件（深夜长缺失），已保存到 output_v2/private_gap_charging_events.parquet
2025-11-13 18:26:02,864 | INFO | 涉及 19316 辆车
2025-11-13 18:26:02,879 | INFO | 平均缺失时长: 2.21 小时
2025-11-13 18:26:02,880 | INFO | 中位数缺失时长: 1.24 小时



私人充电事件示例：
      taxiid          start_time            end_time   gap_s   start_lon  \
0  UUUB0C0M7 2020-01-01 04:17:29 2020-01-01 05:48:13  5444.0  114.143974   
1  UUUB0C0M7 2020-01-01 16:18:31 2020-01-01 17:47:47  5356.0  114.143753   
2  UUUB0C0M7 2020-01-02 04:16:24 2020-01-02 05:41:39  5115.0  114.143974   
3  UUUB0C0M7 2020-01-02 16:09:34 2020-01-02 17:32:36  4982.0  114.143990   
4  UUUB0C0M7 2020-01-03 04:33:40 2020-01-03 05:44:59  4279.0  114.143890   
5  UUUB0C0M7 2020-01-03 16:01:51 2020-01-03 17:08:24  3993.0  114.143898   
6  UUUB0C0M7 2020-01-04 02:11:38 2020-01-04 02:52:55  2477.0  113.813133   
7  UUUB0C0M7 2020-01-04 05:04:38 2020-01-04 06:45:40  6062.0  114.143761   
8  UUUB0C0M7 2020-01-04 16:20:01 2020-01-04 17:24:12  3851.0  114.143723   
9  UUUB0C0M7 2020-01-05 05:03:00 2020-01-05 06:27:21  5061.0  114.143776   

   start_lat  charge_type  
0  22.557758  private_gap  
1  22.557638  private_gap  
2  22.557781  private_gap  
3  22.557774  private_gap  
4  22.557579

In [7]:
if os.path.exists(PRIVATE_GAP_FILE):
    private_gap_charges_df = pd.read_parquet(PRIVATE_GAP_FILE)
    logging.info(f"已加载 {len(private_gap_charges_df)} 个充电事件")
    logging.info(f"涉及 {private_gap_charges_df['taxiid'].nunique()} 辆车")
    print(private_gap_charges_df.head())
else:
    logging.warning("私人充电事件文件不存在，请先运行检测步骤")

2025-11-14 15:18:02,959 | INFO | 已加载 372251 个充电事件
2025-11-14 15:18:02,976 | INFO | 涉及 19316 辆车


      taxiid          start_time            end_time   gap_s   start_lon  \
0  UUUB0C0M7 2020-01-01 04:17:29 2020-01-01 05:48:13  5444.0  114.143974   
1  UUUB0C0M7 2020-01-01 16:18:31 2020-01-01 17:47:47  5356.0  114.143753   
2  UUUB0C0M7 2020-01-02 04:16:24 2020-01-02 05:41:39  5115.0  114.143974   
3  UUUB0C0M7 2020-01-02 16:09:34 2020-01-02 17:32:36  4982.0  114.143990   
4  UUUB0C0M7 2020-01-03 04:33:40 2020-01-03 05:44:59  4279.0  114.143890   

   start_lat  charge_type  
0  22.557758  private_gap  
1  22.557638  private_gap  
2  22.557781  private_gap  
3  22.557774  private_gap  
4  22.557579  private_gap  


In [9]:
# 1. 给公共充电事件添加 charge_type
charging_events['charge_type'] = 'public'

# 2. 准备公共充电事件的字段（作为标准格式）
public_charges_selected = charging_events[[
    'taxiid',
    'nearest_station_id',
    'start_time',
    'end_time',
    'duration_s',
    'stay_lon',
    'stay_lat',
    'charge_type'
]].copy()

# 3. 准备私人充电事件的字段（统一到public的格式）
private_charges_formatted = private_gap_charges_df.copy()

# 添加charge_type
private_charges_formatted['charge_type'] = 'private_gap'

# 统一字段格式
private_charges_selected = pd.DataFrame({
    'taxiid': private_charges_formatted['taxiid'],
    'nearest_station_id': -1,  # 私人充电标记为-1
    'start_time': private_charges_formatted['start_time'],
    'end_time': private_charges_formatted['end_time'],
    'duration_s': private_charges_formatted['gap_s'],  # gap_s -> duration_s
    'stay_lon': private_charges_formatted.get('start_lon', private_charges_formatted.get('lon', np.nan)),  # start_lon -> stay_lon
    'stay_lat': private_charges_formatted.get('start_lat', private_charges_formatted.get('lat', np.nan)),  # start_lat -> stay_lat
    'charge_type': 'private_gap'
})

# 4. 合并所有充电事件
all_charging_events = pd.concat([
    public_charges_selected,
    private_charges_selected
], ignore_index=True)
# 5. 按车辆和时间排序
all_charging_events = all_charging_events.sort_values(['taxiid', 'start_time']).reset_index(drop=True)

In [None]:
# tracktest = private_gap_charges_df.copy()
# idex= tracktest['gap_s'] >=3000
# tracktest=tracktest[idex]
# tracktest['prev_lon'] = tracktest['start_lon'].shift().fillna(tracktest['start_lon'])
# tracktest['prev_lat'] =tracktest['start_lat'].shift().fillna(tracktest['start_lat'])
# tracktest['prev_time'] = tracktest['start_time'].shift()
# tracktest['dt'] = (tracktest['start_time'] - tracktest['prev_time']).dt.total_seconds().fillna(0)
# start_lon_gpu = cp.asarray(tracktest['start_lon'].values)
# start_lat_gpu = cp.asarray(tracktest['start_lat'].values)
# prev_lon_gpu  = cp.asarray(tracktest['prev_lon'].values)
# prev_lat_gpu  = cp.asarray(tracktest['prev_lat'].values)
#
# # 2. 调用函数：传入 GPU 数组
# # 此时函数内部接收到的是 cupy.ndarray，原本报错的 .astype(cp.float64) 就能正常运行了
# dist_gpu = haversine_distance_gpu(
#     start_lon_gpu,
#     start_lat_gpu,
#     prev_lon_gpu,
#     prev_lat_gpu
# )
#
# # 3. 接收结果：将 CuPy 数组转回 NumPy 数组 (下载回 CPU)
# tracktest['dist'] = dist_gpu.get()

In [None]:
# tracktest[10:32]

In [None]:
# import folium
# from folium import plugins
# import pandas as pd
# from pathlib import Path
# import numpy as np
# from datetime import datetime
#
# def visualize_taxi_trajectory_with_charging(taxiid, tracks_dir, charging_events_df,
#                                            private_charges_df, stations_df,
#                                            output_html=None):
#     """
#     可视化单辆车的轨迹和充电事件
#
#     参数：
#     - taxiid: 车辆ID
#     - tracks_dir: 轨迹数据目录
#     - charging_events_df: 公共充电事件DataFrame（包含taxiid, start_time, end_time, nearest_station_id等）
#     - private_charges_df: 私人充电事件DataFrame（包含taxiid, start_time, end_time等）
#     - stations_df: 充电站信息DataFrame（包含station_id, longitude, latitude等）
#     - output_html: 输出HTML文件路径（可选）
#
#     返回：folium地图对象
#     """
#     # 1. 读取轨迹数据
#     track_path = os.path.join(tracks_dir, f'taxiid={taxiid}')
#     if not os.path.exists(track_path):
#         print(f"车辆 {taxiid} 的轨迹数据不存在")
#         return None
#
#     track_files = list(Path(track_path).glob('*.parquet'))
#     if not track_files:
#         print(f"车辆 {taxiid} 没有轨迹文件")
#         return None
#
#     track = pd.concat([pd.read_parquet(f) for f in track_files], ignore_index=True)
#     track = track.sort_values('time').reset_index(drop=True)
#
#     if track.empty:
#         print(f"车辆 {taxiid} 的轨迹数据为空")
#         return None
#
#     # 2. 筛选该车的充电事件
#     public_charges = charging_events_df[charging_events_df['taxiid'] == taxiid].copy()
#     private_charges = private_charges_df[private_charges_df['taxiid'] == taxiid].copy()
#
#     # 合并所有充电事件
#     all_charges = []
#
#     # 公共充电事件
#     if not public_charges.empty:
#         for _, row in public_charges.iterrows():
#             duration_min = (row['end_time'] - row['start_time']).total_seconds() / 60
#             all_charges.append({
#                 'start_time': row['start_time'],
#                 'end_time': row['end_time'],
#                 'lon': row['stay_lon'],
#                 'lat': row['stay_lat'],
#                 'station_id': row['nearest_station_id'],
#                 'duration_min': duration_min,
#                 'charge_type': 'public'
#             })
#
#     # 私人充电事件
#     if not private_charges.empty:
#         for _, row in private_charges.iterrows():
#             duration_min = row['gap_s'] / 60 if 'gap_s' in row else 0
#             all_charges.append({
#                 'start_time': row['start_time'],
#                 'end_time': row['end_time'],
#                 'lon': row.get('start_lon', row.get('lon', None)),
#                 'lat': row.get('start_lat', row.get('lat', None)),
#                 'station_id': None,
#                 'duration_min': duration_min,
#                 'charge_type': 'private'
#             })
#
#     if not all_charges:
#         print(f"车辆 {taxiid} 没有充电事件")
#         return None
#
#     charges_df = pd.DataFrame(all_charges)
#     charges_df = charges_df.sort_values('start_time').reset_index(drop=True)
#
#     # 3. 计算地图中心（使用轨迹中心或充电事件中心）
#     center_lat = track['lat'].mean()
#     center_lon = track['lon'].mean()
#
#     # 4. 创建地图（深圳）
#     m = folium.Map(
#         location=[center_lat, center_lon],
#         zoom_start=12,
#         tiles='OpenStreetMap'
#     )
#
#     # 5. 按日期分组轨迹，用不同颜色绘制
#     track['date'] = track['time'].dt.date
#     unique_dates = track['date'].unique()
#
#     # 生成颜色列表（使用不同颜色区分不同天）
#     colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred',
#               'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue',
#               'darkpurple', 'white', 'pink', 'lightblue', 'lightgreen',
#               'gray', 'black', 'lightgray']
#
#     # 为每天绘制轨迹
#     for idx, date in enumerate(sorted(unique_dates)):
#         date_track = track[track['date'] == date]
#         color = colors[idx % len(colors)]
#
#         # 提取坐标点
#         coords = [[row['lat'], row['lon']] for _, row in date_track.iterrows()]
#
#         # 绘制轨迹线
#         folium.PolyLine(
#             coords,
#             color=color,
#             weight=2,
#             opacity=0.6,
#             popup=f"日期: {date}",
#             tooltip=f"日期: {date}"
#         ).add_to(m)
#
#         # 添加日期标记（在轨迹起点）
#         if len(coords) > 0:
#             folium.Marker(
#                 coords[0],
#                 icon=folium.Icon(color=color, icon='info-sign'),
#                 popup=f"日期: {date}<br>起点",
#                 tooltip=f"{date} 起点"
#             ).add_to(m)
#
#     # 6. 标记充电事件
#     for _, charge in charges_df.iterrows():
#         if pd.isna(charge['lon']) or pd.isna(charge['lat']):
#             continue
#
#         if charge['charge_type'] == 'public':
#             # 公共充电站
#             station_id = int(charge['station_id'])
#             duration_str = f"{charge['duration_min']:.1f}分钟"
#
#             # 获取充电站信息
#             station_info = stations_df[stations_df['station_id'] == station_id]
#             if not station_info.empty:
#                 station_lon = station_info.iloc[0]['longitude']
#                 station_lat = station_info.iloc[0]['latitude']
#             else:
#                 station_lon = charge['lon']
#                 station_lat = charge['lat']
#
#             # 创建弹出信息
#             popup_html = f"""
#             <div style="font-size: 12px;">
#                 <b>公共充电站</b><br>
#                 站ID: {station_id}<br>
#                 开始时间: {charge['start_time']}<br>
#                 结束时间: {charge['end_time']}<br>
#                 充电时长: {duration_str}
#             </div>
#             """
#
#             folium.Marker(
#                 [station_lat, station_lon],
#                 icon=folium.Icon(color='green', icon='plug', prefix='fa'),
#                 popup=folium.Popup(popup_html, max_width=300),
#                 tooltip=f"充电站 {station_id}<br>{duration_str}"
#             ).add_to(m)
#
#         else:
#             # 私人充电
#             duration_str = f"{charge['duration_min']:.1f}分钟"
#
#             popup_html = f"""
#             <div style="font-size: 12px;">
#                 <b>私人充电</b><br>
#                 开始时间: {charge['start_time']}<br>
#                 结束时间: {charge['end_time']}<br>
#                 充电时长: {duration_str}
#             </div>
#             """
#
#             folium.Marker(
#                 [charge['lat'], charge['lon']],
#                 icon=folium.Icon(color='red', icon='home', prefix='fa'),
#                 popup=folium.Popup(popup_html, max_width=300),
#                 tooltip=f"私人充电<br>{duration_str}"
#             ).add_to(m)
#
#     # 7. 添加图例
#     legend_html = '''
#     <div style="position: fixed;
#                 bottom: 50px; right: 50px; width: 200px; height: auto;
#                 background-color: white; z-index:9999; font-size:14px;
#                 border:2px solid grey; border-radius: 5px; padding: 10px">
#     <h4>图例</h4>
#     <p><i class="fa fa-plug" style="color:green"></i> 公共充电站</p>
#     <p><i class="fa fa-home" style="color:red"></i> 私人充电</p>
#     <p><span style="color:blue">━━━</span> 轨迹（不同颜色=不同日期）</p>
#     </div>
#     '''
#     m.get_root().html.add_child(folium.Element(legend_html))
#
#     # 8. 添加标题
#     title_html = f'''
#     <h3 align="center" style="font-size:20px"><b>车辆 {taxiid} 轨迹与充电事件</b></h3>
#     '''
#     m.get_root().html.add_child(folium.Element(title_html))
#
#     # 9. 保存地图
#     if output_html:
#         m.save(output_html)
#         print(f"地图已保存到: {output_html}")
#
#     return m
#



In [None]:
# # ========== 使用示例 ==========
# # 假设你已经加载了以下数据：
# # - charging_events_df: 公共充电事件
# # - private_gap_charges_df: 私人充电事件
# # - stations_df: 充电站信息
#
# # 选择一个车辆ID
# TID = 'UUUB0C0M7'  # 替换为你想查看的车辆ID
#
# # 生成可视化地图
# output_path = f'{OUTPUT_DIR}/trajectory_{TID}.html'
#
# m = visualize_taxi_trajectory_with_charging(
#     taxiid=TID,
#     tracks_dir=TRACKS_DIR,
#     charging_events_df=charging_events,  # 你的公共充电事件DataFrame
#     private_charges_df=private_gap_charges_df,  # 你的私人充电事件DataFrame
#     stations_df=stations_df,  # 充电站信息DataFrame
#     output_html=output_path
# )
#
# # 在notebook中显示地图
# if m:
#     m

In [None]:


# def plot_private_charging_locations(taxiid, private_charges_df, tracks_dir=None,
#                                     stations_df=None, show_trajectory=False):
#     """
#     绘制单辆车的私人充电位置
#
#     参数：
#     - taxiid: 车辆ID
#     - private_charges_df: 私人充电事件DataFrame
#     - tracks_dir: 轨迹数据目录（可选，如果show_trajectory=True）
#     - stations_df: 充电站信息DataFrame（可选，用于显示充电站位置作为参考）
#     - show_trajectory: 是否显示轨迹（默认False）
#
#     返回：folium地图对象
#     """
#     # 1. 筛选该车的私人充电事件
#     private_charges = private_charges_df[private_charges_df['taxiid'] == taxiid].copy()
#
#     if private_charges.empty:
#         print(f"车辆 {taxiid} 没有私人充电事件")
#         return None
#
#     # 确保有位置信息
#     if 'start_lon' in private_charges.columns:
#         private_charges['lon'] = private_charges['start_lon']
#         private_charges['lat'] = private_charges['start_lat']
#     elif 'lon' not in private_charges.columns or 'lat' not in private_charges.columns:
#         print(f"车辆 {taxiid} 的私人充电事件缺少位置信息")
#         return None
#
#     # 过滤掉没有位置信息的记录
#     private_charges = private_charges[
#         private_charges['lon'].notna() & private_charges['lat'].notna()
#     ].copy()
#
#     if private_charges.empty:
#         print(f"车辆 {taxiid} 没有有效的私人充电位置信息")
#         return None
#
#     # 2. 计算地图中心
#     center_lat = private_charges['lat'].mean()
#     center_lon = private_charges['lon'].mean()
#
#     # 3. 创建地图
#     m = folium.Map(
#         location=[center_lat, center_lon],
#         zoom_start=13,
#         tiles='OpenStreetMap'
#     )
#
#     # 4. 标记每个私人充电位置
#     for idx, row in private_charges.iterrows():
#         # 计算充电时长
#         if 'gap_s' in row:
#             duration_hours = row['gap_s'] / 3600
#             duration_str = f"{duration_hours:.2f}小时"
#         elif 'end_time' in row and 'start_time' in row:
#             duration_seconds = (row['end_time'] - row['start_time']).total_seconds()
#             duration_hours = duration_seconds / 3600
#             duration_str = f"{duration_hours:.2f}小时"
#         else:
#             duration_str = "未知"
#
#         # 创建弹出信息
#         popup_html = f"""
#         <div style="font-size: 12px;">
#             <b>私人充电位置</b><br>
#             开始时间: {row['start_time']}<br>
#             结束时间: {row.get('end_time', 'N/A')}<br>
#             充电时长: {duration_str}<br>
#             位置: ({row['lat']:.6f}, {row['lon']:.6f})
#         </div>
#         """
#
#         # 根据充电时长设置不同的颜色深度
#         if duration_hours >= 6:
#             color = 'darkred'  # 长时间充电
#         elif duration_hours >= 4:
#             color = 'red'
#         elif duration_hours >= 2:
#             color = 'orange'
#         else:
#             color = 'lightred'  # 短时间充电
#
#         folium.Marker(
#             [row['lat'], row['lon']],
#             icon=folium.Icon(color=color, icon='home', prefix='fa'),
#             popup=folium.Popup(popup_html, max_width=300),
#             tooltip=f"私人充电 #{idx+1}<br>{duration_str}"
#         ).add_to(m)
#
#         # 添加编号标签
#         folium.CircleMarker(
#             [row['lat'], row['lon']],
#             radius=8,
#             popup=f"#{idx+1}",
#             tooltip=f"#{idx+1}",
#             color='black',
#             fill=True,
#             fillColor='white',
#             fillOpacity=0.8,
#             weight=1
#         ).add_to(m)
#
#         # 在标记旁边添加文本标签
#         folium.Marker(
#             [row['lat'] + 0.001, row['lon'] + 0.001],  # 稍微偏移位置
#             icon=folium.DivIcon(
#                 html=f'<div style="font-size: 10px; color: black; font-weight: bold;">#{idx+1}</div>',
#                 icon_size=(20, 20),
#                 icon_anchor=(0, 0)
#             )
#         ).add_to(m)
#
#     # 5. 可选：显示轨迹（如果提供tracks_dir）
#     if show_trajectory and tracks_dir:
#         track_path = os.path.join(tracks_dir, f'taxiid={taxiid}')
#         if os.path.exists(track_path):
#             track_files = list(Path(track_path).glob('*.parquet'))
#             if track_files:
#                 track = pd.concat([pd.read_parquet(f) for f in track_files], ignore_index=True)
#                 track = track.sort_values('time').reset_index(drop=True)
#
#                 if not track.empty:
#                     # 绘制轨迹线（简化版，采样显示）
#                     coords = [[row['lat'], row['lon']]
#                              for _, row in track[::max(1, len(track)//1000)].iterrows()
#                              if pd.notna(row['lat']) and pd.notna(row['lon'])]
#
#                     if coords:
#                         folium.PolyLine(
#                             coords,
#                             color='gray',
#                             weight=1,
#                             opacity=0.3,
#                             tooltip='轨迹'
#                         ).add_to(m)
#
#     # 6. 可选：显示附近充电站作为参考
#     if stations_df is not None:
#         # 只显示距离私人充电位置较近的充电站（5km内）
#         from scipy.spatial import cKDTree
#         station_coords = stations_df[['longitude', 'latitude']].to_numpy()
#         station_tree = cKDTree(station_coords)
#
#         private_coords = private_charges[['lon', 'lat']].to_numpy()
#         distances, indices = station_tree.query(private_coords, k=1, distance_upper_bound=0.05)  # 约5km
#
#         nearby_stations = set()
#         for dist, idx in zip(distances, indices):
#             if dist < 0.05:  # 约5km
#                 nearby_stations.add(idx)
#
#         for idx in nearby_stations:
#             station = stations_df.iloc[idx]
#             folium.Marker(
#                 [station['latitude'], station['longitude']],
#                 icon=folium.Icon(color='blue', icon='plug', prefix='fa'),
#                 popup=f"充电站 {station['station_id']}",
#                 tooltip=f"充电站 {station['station_id']}"
#             ).add_to(m)
#
#     # 7. 添加图例
#     legend_html = '''
#     <div style="position: fixed;
#                 bottom: 50px; right: 50px; width: 180px; height: auto;
#                 background-color: white; z-index:9999; font-size:12px;
#                 border:2px solid grey; border-radius: 5px; padding: 10px">
#     <h4>图例</h4>
#     <p><i class="fa fa-home" style="color:darkred"></i> 私人充电 (≥6h)</p>
#     <p><i class="fa fa-home" style="color:red"></i> 私人充电 (4-6h)</p>
#     <p><i class="fa fa-home" style="color:orange"></i> 私人充电 (2-4h)</p>
#     <p><i class="fa fa-home" style="color:lightred"></i> 私人充电 (<2h)</p>
#     <p><i class="fa fa-plug" style="color:blue"></i> 附近公共充电站</p>
#     </div>
#     '''
#     m.get_root().html.add_child(folium.Element(legend_html))
#
#     # 8. 添加标题
#     title_html = f'''
#     <h3 align="center" style="font-size:18px">
#         <b>车辆 {taxiid} 私人充电位置</b><br>
#         <span style="font-size:14px">共 {len(private_charges)} 个私人充电事件</span>
#     </h3>
#     '''
#     m.get_root().html.add_child(folium.Element(title_html))
#
#     return m
#
#
# # ========== 使用示例 ==========
# # 选择一个车辆ID
# TID = 'UUUB0C0P5'  # 替换为你想查看的车辆ID
#
# # 绘制私人充电位置
# m = plot_private_charging_locations(
#     taxiid=TID,
#     private_charges_df=private_gap_charges_df,  # 你的私人充电事件DataFrame
#     stations_df=stations_df,  # 可选：显示附近充电站作为参考
#     show_trajectory=False  # 可选：是否显示轨迹
# )
#
# # 在notebook中显示地图
# if m:
#     m

In [None]:
# def find_nearest_station(lon, lat, stations_df):
#     """
#     查找给定位置最近的充电站
#
#     参数：
#     - lon: 经度
#     - lat: 纬度
#     - stations_df: 充电站信息DataFrame（包含longitude和latitude列）
#
#     返回：最近充电站的信息和距离（米）
#     """
#     if stations_df.empty:
#         return None, None
#
#     # 构建KDTree
#     station_coords = stations_df[['longitude', 'latitude']].to_numpy()
#     station_tree = cKDTree(station_coords)
#
#     # 查询最近充电站
#     query_point = np.array([[lon, lat]])
#     distances_deg, indices = station_tree.query(query_point, k=1)
#
#     # 转换为米（粗略转换）
#     # 更精确的方法是用haversine公式
#     distance_m_approx = distances_deg[0] * 111320  # 粗略转换
#
#     # 使用haversine公式精确计算距离
#     def haversine_distance(lon1, lat1, lon2, lat2):
#         """计算两点间的大圆距离（米）"""
#         from math import radians, sin, cos, sqrt, atan2
#
#         R = 6371000  # 地球半径（米）
#         lon1_rad = radians(lon1)
#         lat1_rad = radians(lat1)
#         lon2_rad = radians(lon2)
#         lat2_rad = radians(lat2)
#
#         dlon = lon2_rad - lon1_rad
#         dlat = lat2_rad - lat1_rad
#
#         a = sin(dlat/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon/2)**2
#         c = 2 * atan2(sqrt(a), sqrt(1-a))
#
#         return R * c
#
#     nearest_idx = indices[0]
#     nearest_station = stations_df.iloc[nearest_idx]
#
#     # 精确计算距离
#     distance_m = haversine_distance(
#         lon, lat,
#         nearest_station['longitude'],
#         nearest_station['latitude']
#     )
#
#     return nearest_station, distance_m


In [None]:
# # ========== 查询代码 ==========
# # 假设 tracktest 是你的DataFrame变量名
# # 假设你有未过滤的充电站信息（可能是从CSV文件读取的原始数据）
# # 1. 获取 tracktest 中索引5和7的数据
# if 'tracktest' in locals() or 'tracktest' in globals():
#     point_5 = tracktest.iloc[20]
#     point_7 = tracktest.iloc[21]
#
#     print("=" * 60)
#     print("查询结果")
#     print("=" * 60)
#
#     # 2. 读取未过滤的充电站信息（如果还没有加载）
#     # 假设原始充电站文件是 'station_information.csv'
#     stations_df_raw = pd.read_csv(STATION_CSV, dtype={'station_id': np.int32})
#
#     # 3. 查询索引5的数据
#     print("\n【索引5的数据】")
#     print(f"经度: {point_5.get('start_lon', point_5.get('longitude', 'N/A'))}")
#     print(f"纬度: {point_5.get('start_lat', point_5.get('latitude', 'N/A'))}")
#
#     lon_5 = point_5.get('start_lon', point_5.get('longitude', None))
#     lat_5 = point_5.get('start_lat', point_5.get('latitude', None))
#
#     if lon_5 is not None and lat_5 is not None:
#         nearest_station_5, distance_5 = find_nearest_station(
#             lon_5, lat_5, stations_df_raw
#         )
#
#         if nearest_station_5 is not None:
#             print(f"\n最近的充电站:")
#             print(f"  站ID: {nearest_station_5.get('station_id', 'N/A')}")
#             print(f"  经度: {nearest_station_5['longitude']:.6f}")
#             print(f"  纬度: {nearest_station_5['latitude']:.6f}")
#             print(f"  距离: {distance_5:.2f} 米 ({distance_5/1000:.3f} 公里)")
#
#             # 显示更多充电站信息（如果有）
#             if 'station_name' in nearest_station_5:
#                 print(f"  站名: {nearest_station_5['station_name']}")
#         else:
#             print("未找到最近的充电站")
#     else:
#         print("错误：无法获取经纬度信息")
#
#     # 4. 查询索引7的数据
#     print("\n" + "-" * 60)
#     print("【索引7的数据】")
#     print(f"经度: {point_7.get('start_lon', point_7.get('longitude', 'N/A'))}")
#     print(f"纬度: {point_7.get('start_lat', point_7.get('latitude', 'N/A'))}")
#
#     lon_7 = point_7.get('start_lon', point_7.get('longitude', None))
#     lat_7 = point_7.get('start_lat', point_7.get('latitude', None))
#
#     if lon_7 is not None and lat_7 is not None:
#         nearest_station_7, distance_7 = find_nearest_station(
#             lon_7, lat_7, stations_df_raw
#         )
#
#         if nearest_station_7 is not None:
#             print(f"\n最近的充电站:")
#             print(f"  站ID: {nearest_station_7.get('station_id', 'N/A')}")
#             print(f"  经度: {nearest_station_7['longitude']:.6f}")
#             print(f"  纬度: {nearest_station_7['latitude']:.6f}")
#             print(f"  距离: {distance_7:.2f} 米 ({distance_7/1000:.3f} 公里)")
#
#             # 显示更多充电站信息（如果有）
#             if 'station_name' in nearest_station_7:
#                 print(f"  站名: {nearest_station_7['station_name']}")
#         else:
#             print("未找到最近的充电站")
#     else:
#         print("错误：无法获取经纬度信息")
#
#     print("\n" + "=" * 60)
#
# else:
#     print("错误：未找到 tracktest 变量")
#     print("请确保 tracktest DataFrame 已经定义")

In [None]:
import folium

# 深圳中心坐标（大约）
shenzhen_center = [22.5431, 114.0579]

# 创建地图
m = folium.Map(
    location=shenzhen_center,
    zoom_start=12,
    tiles='OpenStreetMap'
)

# 给定经纬度（示例，你可以修改）
given_lon = 114.1731  # 经度
given_lat = 22.6067   # 纬度

# 在地图上标记点
folium.Marker(
    [given_lat, given_lon],
    popup=f'位置: ({given_lat}, {given_lon})',
    tooltip='点击查看坐标',
    icon=folium.Icon(color='red', icon='info-sign')
).add_to(m)

# 显示地图
m

## 5. 排队模拟和等待时间计算


In [8]:
def simulate_queue(df):
    """
    模拟单个充电站的排队情况

    输入：DataFrame，包含一个站的所有充电事件，按start_time排序
    输出：DataFrame，包含等待时间、实际充电时间、是否放弃等信息
    """
    if df.empty:
        return pd.DataFrame()

    k = int(df['num_piles'].iloc[0])  # 充电桩数量
    busy_chargers_heap = []  # 最小堆，存储正在充电的车辆结束时间
    results = []

    for _, row in df.iterrows():
        arrival_time = row['start_time']
        departure_time = row['end_time']

        # 移除在当前车辆到达前已空闲的充电桩
        while busy_chargers_heap and busy_chargers_heap[0] <= arrival_time:
            heappop(busy_chargers_heap)

        # 检查是否有可用充电桩
        if len(busy_chargers_heap) < k:
            # 有空闲充电桩，无需等待
            wait_seconds = 0
            charge_start_time = arrival_time
        else:
            # 所有充电桩都忙，需要等待
            earliest_free_time = heappop(busy_chargers_heap)
            wait_seconds = (earliest_free_time - arrival_time).total_seconds()
            charge_start_time = earliest_free_time

        # 判断是否放弃充电
        stay_duration = (departure_time - arrival_time).total_seconds()
        gave_up = (wait_seconds >= stay_duration)

        # 计算实际充电时长
        if gave_up:
            actual_charge_seconds = 0
            # 如果放弃，把之前弹出的时间放回去
            if 'earliest_free_time' in locals() and wait_seconds > 0:
                heappush(busy_chargers_heap, earliest_free_time)
        else:
            actual_charge_seconds = (departure_time - charge_start_time).total_seconds()
            # 该车辆占用充电桩直到离开时间
            heappush(busy_chargers_heap, departure_time)

        results.append({
            'taxiid': row['taxiid'],
            'nearest_station_id': row['nearest_station_id'],
            'arrive_time': arrival_time,
            'leave_time': departure_time,
            'wait_dur': wait_seconds,
            'giveup': gave_up,
            'charge_dur': actual_charge_seconds
        })

    return pd.DataFrame(results)


In [None]:
# 执行排队模拟
if 'charging_events' in locals() and not charging_events.empty:
    # 合并站点信息（获取num_piles）
    queue_input = charging_events.merge(
        stations_df[['station_id', 'num_piles']],
        left_on='nearest_station_id',
        right_on='station_id',
        how='left'
    )

    # 按站点和时间排序
    queue_input = queue_input.sort_values(['nearest_station_id', 'start_time'], kind='mergesort')

    # 对每个站点进行排队模拟
    all_queue_results = []
    for station_id, group in tqdm(queue_input.groupby('nearest_station_id'), desc="排队模拟"):
        result = simulate_queue(group)
        if not result.empty:
            all_queue_results.append(result)

    # 合并结果
    if all_queue_results:
        char_queue_df = pd.concat(all_queue_results, ignore_index=True)
        char_queue_df = char_queue_df.sort_values(['taxiid', 'arrive_time']).reset_index(drop=True)

        # 保存结果
        char_queue_df.to_parquet(CHAR_QUEUE_FILE, index=False)
        logging.info(f"排队模拟完成，结果已保存到 {CHAR_QUEUE_FILE}")

        # 基本统计
        avg_wait = char_queue_df['wait_dur'].mean()
        giveup_rate = char_queue_df['giveup'].mean()
        logging.info(f"平均等待时间: {avg_wait:.2f} 秒 ({avg_wait/60:.2f} 分钟)")
        logging.info(f"放弃率: {giveup_rate:.2%}")
    else:
        logging.warning("排队模拟未产生结果")
        char_queue_df = pd.DataFrame()
else:
    logging.warning("充电事件数据不存在，无法进行排队模拟")


In [None]:
# 加载排队结果（如果已保存）
if os.path.exists(CHAR_QUEUE_FILE):
    char_queue_df = pd.read_parquet(CHAR_QUEUE_FILE)
    logging.info(f"已加载排队模拟结果: {len(char_queue_df)} 条记录")
    print(char_queue_df.head())
else:
    logging.warning("排队模拟结果文件不存在")


## 6. 能耗和SOC计算

### 6.1 计算两次充电之间的能耗


In [None]:
check = (all_charging_events['taxiid'] == 'UUUB0C1P1')
all_charging_events[check]

In [None]:
# ========== 第二部分：基于完整充电事件DF计算能耗 ==========

def calculate_energy_between_charges_v2(taxiid, tracks_dir, charging_events_taxi):
    """
    计算单辆车在两次充电之间的能耗（支持公共和私人充电混合）

    输入：
    - taxiid: 车辆ID
    - tracks_dir: 轨迹数据目录
    - charging_events_taxi: 该车的充电事件DataFrame（已按时间排序，包含公共和私人充电）

    输出：
    DataFrame with columns: taxiid, charge_start, prev_end, gap_s, drive_s,
                            idle_s, distance_km, energy_used_kWh, missing_s, charge_type
    """
    try:
        # 读取轨迹数据
        track_path = os.path.join(tracks_dir, f'taxiid={taxiid}')
        if not os.path.exists(track_path):
            return pd.DataFrame()

        track_files = list(Path(track_path).glob('*.parquet'))
        if not track_files:
            return pd.DataFrame()

        # 读取轨迹（使用cudf加速）
        track = cudf.concat([cudf.read_parquet(f) for f in track_files], ignore_index=True)

        # 过滤zoneid < 0的点
        unvalid_idx = track['zoneid'] < 0
        track = track[~unvalid_idx]

        track = track.sort_values('time').reset_index(drop=True)

        if track.empty or len(charging_events_taxi) < 2:
            return pd.DataFrame()

        # 计算相邻点之间的距离和时间差
        track['prev_lon'] = track['lon'].shift().fillna(track['lon'])
        track['prev_lat'] = track['lat'].shift().fillna(track['lat'])
        track['prev_time'] = track['time'].shift()
        track = track.reset_index(drop=True)
        track['dt'] = (track['time'] - track['prev_time']).dt.total_seconds().fillna(0)
        track['dist'] = haversine_distance_gpu(
            track['lon'], track['lat'],
            track['prev_lon'], track['prev_lat']
        )

        # 过滤异常跳点（短时间内距离过大）
        mask_jump = (track['dist'] / track['dt'] > 20)
        track = track[~mask_jump]

        # 标记怠速点
        track['idle'] = (track['velocity'] <= 5).astype('int8')

        # 转换为pandas以便后续处理
        track_pd = track[['time', 'lon', 'lat', 'dt', 'dist', 'idle']].to_pandas()

        # 对每对相邻充电事件计算能耗
        results = []
        charging_events_sorted = charging_events_taxi.sort_values('start_time').reset_index(drop=True)

        for i in range(1, len(charging_events_sorted)):
            prev_end = charging_events_sorted.iloc[i-1]['end_time']
            curr_start = charging_events_sorted.iloc[i]['start_time']
            curr_charge_type = charging_events_sorted.iloc[i]['charge_type']  # 当前充电事件的类型

            # 提取两次充电之间的轨迹段
            segment = track_pd[
                (track_pd['time'] > prev_end) &
                (track_pd['time'] <= curr_start)
            ].copy()

            if segment.empty:
                continue

            # 检查数据缺失（时间间隔过大）
            segment['time_gap'] = segment['dt']
            large_gaps = segment[segment['time_gap'] > MAX_GAP_SECONDS]
            missing_s = large_gaps['time_gap'].sum() if not large_gaps.empty else 0

            # 只计算有数据的部分
            valid_segment = segment[segment['time_gap'] <= MAX_GAP_SECONDS]

            if valid_segment.empty:
                continue

            # 计算行驶和怠速时间
            drive_s = int(valid_segment.loc[valid_segment['idle'] == 0, 'dt'].sum())
            idle_s = int(valid_segment.loc[valid_segment['idle'] == 1, 'dt'].sum())
            gap_s = drive_s + idle_s

            # 计算距离和能耗
            distance_km = valid_segment['dist'].sum() / 1000
            energy_used_kWh = distance_km * CONS_KWH_PER_KM

            results.append({
                'taxiid': taxiid,
                'charge_start': curr_start,
                'prev_end': prev_end,
                'gap_s': gap_s,
                'drive_s': drive_s,
                'idle_s': idle_s,
                'distance_km': distance_km,
                'energy_used_kWh': energy_used_kWh,
                'missing_s': missing_s,
                'charge_type': curr_charge_type  # 标记当前充电事件的类型
            })

        return pd.DataFrame(results)

    except Exception as e:
        logging.error(f"计算车辆 {taxiid} 能耗时出错: {e}")
        import traceback
        logging.error(traceback.format_exc())
        return pd.DataFrame()



In [None]:
# ========== 批量计算所有车辆的能耗（使用合并后的完整充电事件） ==========
ENERGY_GAP_FILE2 = f'{OUTPUT_DIR}/energy_gap2.parquet'
# 如果文件已存在，可以选择跳过或重新计算
if os.path.exists(ENERGY_GAP_FILE2):
    logging.info(f"能耗数据文件已存在: {ENERGY_GAP_FILE2}")
    logging.info("如需重新计算，请删除该文件后重新运行")
    energy_gap_df = pd.read_parquet(ENERGY_GAP_FILE2)
else:
    all_energy_results = []

    # 按车辆分组处理
    for taxiid, ce_group in tqdm(all_charging_events.groupby('taxiid'), desc="计算能耗（含私人充电）"):
        if len(ce_group) < 2:  # 至少需要2次充电才能计算间隔
            continue

        result = calculate_energy_between_charges_v2(taxiid, TRACKS_DIR, ce_group)
        if not result.empty:
            all_energy_results.append(result)

        # 定期释放GPU内存
        cp._default_memory_pool.free_all_blocks()

    if all_energy_results:
        energy_gap_df = pd.concat(all_energy_results, ignore_index=True)
        energy_gap_df = energy_gap_df.sort_values(['taxiid', 'charge_start']).reset_index(drop=True)

        # 保存结果
        energy_gap_df.to_parquet(ENERGY_GAP_FILE2, index=False)
        logging.info(f"能耗计算完成，结果已保存到 {ENERGY_GAP_FILE2}")
        logging.info(f"共计算了 {len(energy_gap_df)} 个充电间隔")
        logging.info(f"  公共充电间隔: {len(energy_gap_df[energy_gap_df['charge_type'] == 'public'])}")
        logging.info(f"  私人充电间隔: {len(energy_gap_df[energy_gap_df['charge_type'] == 'private_gap'])}")
    else:
        logging.warning("未计算出任何能耗数据")
        energy_gap_df = pd.DataFrame()

# 查看结果
if not energy_gap_df.empty:
    print("\n能耗数据示例：")
    print(energy_gap_df.head(10))

    print("\n能耗数据统计：")
    print(f"总充电间隔数: {len(energy_gap_df)}")
    print(f"  公共充电间隔: {len(energy_gap_df[energy_gap_df['charge_type'] == 'public'])}")
    print(f"  私人充电间隔: {len(energy_gap_df[energy_gap_df['charge_type'] == 'private_gap'])}")

    if len(energy_gap_df[energy_gap_df['charge_type'] == 'public']) > 0:
        print(f"\n公共充电间隔的平均能耗: {energy_gap_df[energy_gap_df['charge_type'] == 'public']['energy_used_kWh'].mean():.2f} kWh")
        print(f"公共充电间隔的平均距离: {energy_gap_df[energy_gap_df['charge_type'] == 'public']['distance_km'].mean():.2f} km")

    if len(energy_gap_df[energy_gap_df['charge_type'] == 'private_gap']) > 0:
        print(f"\n私人充电间隔的平均能耗: {energy_gap_df[energy_gap_df['charge_type'] == 'private_gap']['energy_used_kWh'].mean():.2f} kWh")
        print(f"私人充电间隔的平均距离: {energy_gap_df[energy_gap_df['charge_type'] == 'private_gap']['distance_km'].mean():.2f} km")

In [None]:
idx = energy_gap_df['energy_used_kWh'] >75
energy_gap_df2 = energy_gap_df.loc[idx]

In [None]:
energy_gap_df2

# 6.2 SOC 轨迹

In [None]:
# ========== 基于all_charging_events计算SOC轨迹 ==========

# 检查必要的数据文件是否存在
if all([os.path.exists(CHAR_QUEUE_FILE),
        os.path.exists(ENERGY_GAP_FILE),
        'all_charging_events' in locals() or 'all_charging_events' in globals()]):

    # 1. 加载数据
    cq = pd.read_parquet(CHAR_QUEUE_FILE)  # 排队结果
    eg = pd.read_parquet(ENERGY_GAP_FILE)  # 能耗数据（包含charge_type）

    # 使用已有的all_charging_events
    battery_trace = all_charging_events.copy()

    # 2. 分离公共和私人充电事件
    public_mask = battery_trace['charge_type'] == 'public'
    private_mask = battery_trace['charge_type'] == 'private_gap'

    battery_trace_public = battery_trace[public_mask].copy()
    battery_trace_private = battery_trace[private_mask].copy()

    # 3. 处理公共充电事件：合并排队结果和站点功率
    if not battery_trace_public.empty:
        # 合并排队结果
        battery_trace_public = battery_trace_public.merge(
            cq[['taxiid', 'arrive_time', 'charge_dur', 'wait_dur', 'giveup']],
            left_on=['taxiid', 'start_time'],
            right_on=['taxiid', 'arrive_time'],
            how='left'
        )

        # 合并站点平均功率
        battery_trace_public = battery_trace_public.merge(
            stations_df[['station_id', 'avg_power']],
            left_on='nearest_station_id',
            right_on='station_id',
            how='left'
        )
        battery_trace_public = battery_trace_public.drop(columns=['station_id'], errors='ignore')

    # 4. 处理私人充电事件：设置默认值
    if not battery_trace_private.empty:
        battery_trace_private['arrive_time'] = battery_trace_private['start_time']
        battery_trace_private['charge_dur'] = battery_trace_private['duration_s']  # gap_s作为充电时长
        battery_trace_private['wait_dur'] = 0  # 私人充电没有等待
        battery_trace_private['giveup'] = False  # 私人充电不放弃
        battery_trace_private['avg_power'] = np.nan  # 私人充电没有功率信息

    # 5. 合并公共和私人充电事件
    if not battery_trace_public.empty and not battery_trace_private.empty:
        # 统一列名
        common_cols = ['taxiid', 'nearest_station_id', 'start_time', 'end_time',
                      'duration_s', 'stay_lon', 'stay_lat', 'charge_type',
                      'arrive_time', 'charge_dur', 'wait_dur', 'giveup', 'avg_power']

        battery_trace = pd.concat([
            battery_trace_public[common_cols],
            battery_trace_private[common_cols]
        ], ignore_index=True)
    elif not battery_trace_public.empty:
        battery_trace = battery_trace_public
    elif not battery_trace_private.empty:
        battery_trace = battery_trace_private
    else:
        battery_trace = pd.DataFrame()

    if battery_trace.empty:
        logging.warning("没有充电事件数据，无法计算SOC轨迹")
    else:
        # 6. 按车辆和时间排序
        battery_trace = battery_trace.sort_values(['taxiid', 'start_time']).reset_index(drop=True)
        battery_trace['prev_end'] = battery_trace.groupby('taxiid')['end_time'].shift()

        # 7. 合并能耗数据（两次充电之间的能耗，包含charge_type）
        battery_trace = battery_trace.merge(
            eg[['taxiid', 'charge_start', 'energy_used_kWh', 'missing_s',
                'gap_s', 'drive_s', 'idle_s', 'distance_km', 'charge_type']],
            left_on=['taxiid', 'start_time'],
            right_on=['taxiid', 'charge_start'],
            how='left',
            suffixes=('', '_from_energy')
        )

        # 使用能耗表中的charge_type（更准确，因为它标记的是当前充电事件的类型）
        battery_trace['charge_type'] = battery_trace['charge_type_from_energy'].fillna(battery_trace['charge_type'])
        battery_trace = battery_trace.drop(columns=['charge_type_from_energy', 'charge_start'], errors='ignore')

        # 8. 计算充入电量
        battery_trace['charge_dur_hours'] = battery_trace['charge_dur'] / 3600

        # 公共充电：根据功率和时长计算
        # 私人充电：直接设为满电容量（CAP_KWH）
        battery_trace['energy_in_kWh'] = np.where(
            battery_trace['charge_type'] == 'private_gap',
            CAP_KWH,  # 私人充电假设充满
            np.where(
                battery_trace['giveup'] == False,
                battery_trace['charge_dur_hours'] * battery_trace['avg_power'] * CHG_EFFICIENCY,
                0  # 放弃充电
            )
        )

        # 9. 初始化SOC（假设开始时满电）
        battery_trace['energy_used_kWh'] = battery_trace['energy_used_kWh'].fillna(0)
        battery_trace['energy_in_kWh'] = battery_trace['energy_in_kWh'].fillna(0)

        # 10. 逐车递推SOC
        def calculate_soc_trace_v2(group):
            soc_before_list = []
            soc_after_list = []
            soc_prev = CAP_KWH  # 初始满电

            for _, row in group.iterrows():
                soc_before = max(0, soc_prev - row['energy_used_kWh'])
                soc_after = min(CAP_KWH, soc_before + row['energy_in_kWh'])
                soc_before_list.append(soc_before)
                soc_after_list.append(soc_after)
                soc_prev = soc_after

            group['SOC_before_kWh'] = soc_before_list
            group['SOC_after_kWh'] = soc_after_list
            return group

        battery_trace = battery_trace.groupby('taxiid', group_keys=False).apply(calculate_soc_trace_v2)

        # 11. 计算百分比
        battery_trace['SOC_before_pct'] = battery_trace['SOC_before_kWh'] / CAP_KWH * 100
        battery_trace['SOC_after_pct'] = battery_trace['SOC_after_kWh'] / CAP_KWH * 100

        # 12. 选择输出列（包含charge_type）
        output_cols = [
            'taxiid', 'charge_type', 'nearest_station_id', 'start_time', 'end_time',
            'SOC_before_kWh', 'SOC_after_kWh', 'SOC_before_pct', 'SOC_after_pct',
            'energy_used_kWh', 'energy_in_kWh',
            'distance_km', 'drive_s', 'idle_s', 'gap_s', 'missing_s',
            'charge_dur', 'wait_dur', 'giveup'
        ]
        output_cols = [col for col in output_cols if col in battery_trace.columns]

        battery_trace_output = battery_trace[output_cols].copy()
        battery_trace_output.to_parquet(BATTERY_TRACE_FILE, index=False)

        logging.info(f"SOC轨迹计算完成，结果已保存到 {BATTERY_TRACE_FILE}")
        logging.info(f"总充电事件数: {len(battery_trace_output)}")
        logging.info(f"  公共充电: {len(battery_trace_output[battery_trace_output['charge_type'] == 'public'])}")
        logging.info(f"  私人充电: {len(battery_trace_output[battery_trace_output['charge_type'] == 'private_gap'])}")

        print("\nSOC轨迹数据示例：")
        print(battery_trace_output.head(10))

        # 13. 统计信息
        print("\n" + "=" * 60)
        print("SOC轨迹统计")
        print("=" * 60)

        public_charges = battery_trace_output[battery_trace_output['charge_type'] == 'public']
        private_charges = battery_trace_output[battery_trace_output['charge_type'] == 'private_gap']

        print(f"\n公共充电统计:")
        print(f"  事件数: {len(public_charges)}")
        if len(public_charges) > 0:
            print(f"  平均充入电量: {public_charges['energy_in_kWh'].mean():.2f} kWh")
            print(f"  平均充电前SOC: {public_charges['SOC_before_pct'].mean():.2f}%")
            print(f"  平均充电后SOC: {public_charges['SOC_after_pct'].mean():.2f}%")

        print(f"\n私人充电统计:")
        print(f"  事件数: {len(private_charges)}")
        if len(private_charges) > 0:
            print(f"  平均充入电量: {private_charges['energy_in_kWh'].mean():.2f} kWh (应该都是{CAP_KWH})")
            print(f"  平均充电前SOC: {private_charges['SOC_before_pct'].mean():.2f}%")
            print(f"  平均充电后SOC: {private_charges['SOC_after_pct'].mean():.2f}%")

        print(f"\n整体SOC统计:")
        print(f"  充电前SOC平均: {battery_trace_output['SOC_before_pct'].mean():.2f}%")
        print(f"  充电前SOC中位数: {battery_trace_output['SOC_before_pct'].median():.2f}%")
        print(f"  充电前SOC最小值: {battery_trace_output['SOC_before_pct'].min():.2f}%")
        print(f"  充电后SOC平均: {battery_trace_output['SOC_after_pct'].mean():.2f}%")
        print(f"  充电后SOC中位数: {battery_trace_output['SOC_after_pct'].median():.2f}%")
        print(f"  充电后SOC最大值: {battery_trace_output['SOC_after_pct'].max():.2f}%")

        low_soc = (battery_trace_output['SOC_before_pct'] < 20).sum()
        print(f"\n低电量充电次数 (SOC < 20%): {low_soc:,} ({low_soc/len(battery_trace_output)*100:.2f}%)")

        print("=" * 60)

else:
    logging.warning("缺少必要的数据文件或变量，无法计算SOC轨迹")
    logging.warning(f"需要以下文件:")
    logging.warning(f"  - {CHAR_QUEUE_FILE}")
    logging.warning(f"  - {ENERGY_GAP_FILE}")
    logging.warning(f"需要以下变量:")
    logging.warning(f"  - all_charging_events (合并后的充电事件DataFrame)")

In [None]:
battery_trace_output[100:120]

In [None]:
battery_trace_output[battery_trace_output['SOC_before_kWh']<10]

In [None]:
battery_trace_output[battery_trace_output['taxiid']=='UUUB0H3K7']

## 7. 统计分析和可视化

### 7.1 基本统计指标


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# ========== Charging Distribution by Hour (Public vs Private) ==========

# Determine which DataFrame to use
if 'all_charging_events' in locals() or 'all_charging_events' in globals():
    df = all_charging_events.copy()
elif 'battery_trace_output' in locals() or 'battery_trace_output' in globals():
    df = battery_trace_output.copy()
elif os.path.exists(BATTERY_TRACE_FILE):
    df = pd.read_parquet(BATTERY_TRACE_FILE)
else:
    logging.warning("Charging event data not found")
    df = pd.DataFrame()

if not df.empty and 'start_time' in df.columns and 'charge_type' in df.columns:
    # 1. Extract hour and separate by type
    df['hour'] = df['start_time'].dt.hour
    public_charges = df[df['charge_type'] == 'public']
    private_charges = df[df['charge_type'] == 'private_gap']

    # 2. Count events per hour, ensuring all 24 hours exist (fill missing with 0)
    public_counts = public_charges['hour'].value_counts().reindex(range(24), fill_value=0).sort_index()
    private_counts = private_charges['hour'].value_counts().reindex(range(24), fill_value=0).sort_index()

    # 3. Print data
    print("=" * 60)
    print("Charging Event Start Time Distribution (by Type)")
    print("=" * 60)
    distribution_df = pd.DataFrame({
        'Public_Charging': public_counts/100,
        'Private_Charging': private_counts/100
    })
    distribution_df['Total'] = distribution_df['Public_Charging'] + distribution_df['Private_Charging']
    print(distribution_df.to_string())

    # 4. Visualization: grouped bar chart
    plt.figure(figsize=(15, 7))

    x = np.arange(24)  # Hour positions
    width = 0.4        # Bar width

    # Draw grouped bars
    plt.bar(x - width/2, public_counts/100, width, label='Public Charging', color='deepskyblue', alpha=0.8)
    plt.bar(x + width/2, private_counts/100, width, label='Private Charging', color='orangered', alpha=0.8)

    # Add title, labels, and legend
    plt.ylabel('Number of Charging Events', fontsize=12)
    plt.xlabel('Hour of Day', fontsize=12)
    plt.title('Public vs Private Charging Distribution by Hour', fontsize=16)
    plt.xticks(x)
    plt.legend(fontsize=11)

    # Add Y-axis grid
    plt.grid(axis='y', linestyle='--', alpha=0.6)

    # Format Y-axis with thousand separators
    ax = plt.gca()
    #ax.get_yaxis().set_major_formatter(mticker.FuncFormatter(lambda val, p: format(int(val), ',')))

    plt.tight_layout()
    plt.show()

else:
    print("Data is empty or missing 'start_time' or 'charge_type' columns")

In [None]:

# ========== 读取数据 ==========
char_queue_file = 'char_queue.parquet'  # 根据实际路径调整
if os.path.exists(char_queue_file):
    cq_df = pd.read_parquet(char_queue_file)
else:
    # 尝试其他可能的路径
    CHAR_QUEUE_FILE = f'{OUTPUT_DIR}/char_queue.parquet'
    if os.path.exists(CHAR_QUEUE_FILE):
        cq_df = pd.read_parquet(CHAR_QUEUE_FILE)
    else:
        print(f"Error: Cannot find char_queue.parquet file")
        cq_df = pd.DataFrame()

if not cq_df.empty:
    # 确保时间列是datetime类型
    if 'arrive_time' in cq_df.columns:
        cq_df['arrive_time'] = pd.to_datetime(cq_df['arrive_time'])
        cq_df['date'] = cq_df['arrive_time'].dt.date

    print("=" * 60)
    print("Public Charging Statistics")
    print("=" * 60)

    # 1. 每车每日平均充电次数
    if 'taxiid' in cq_df.columns and 'date' in cq_df.columns:
        daily_charges = cq_df.groupby(['taxiid', 'date']).size().reset_index(name='charges_per_day')
        avg_charges_per_day = daily_charges.groupby('taxiid')['charges_per_day'].mean()

        print(f"\n1. Average Charging Frequency per Vehicle per Day:")
        print(f"   Total vehicles: {len(avg_charges_per_day)}")
        print(f"   Mean: {avg_charges_per_day.mean():.2f} times/day")
        print(f"   Median: {avg_charges_per_day.median():.2f} times/day")
        print(f"   Min: {avg_charges_per_day.min():.2f} times/day")
        print(f"   Max: {avg_charges_per_day.max():.2f} times/day")
        print(f"   Std: {avg_charges_per_day.std():.2f} times/day")

    # 2. 在站停留时间分布（charge_dur）
    if 'charge_dur' in cq_df.columns:
        # 转换为分钟
        charge_dur_minutes = (cq_df['charge_dur'] + cq_df['wait_dur'])/ 60

        print(f"\n2. Charging Duration Distribution (at station):")
        print(f"   Total events: {len(charge_dur_minutes)}")
        print(f"   Mean: {charge_dur_minutes.mean():.2f} minutes")
        print(f"   Median: {charge_dur_minutes.median():.2f} minutes")
        print(f"   Min: {charge_dur_minutes.min():.2f} minutes")
        print(f"   Max: {charge_dur_minutes.max():.2f} minutes")
        print(f"   Std: {charge_dur_minutes.std():.2f} minutes")
        print(f"   10th percentile: {charge_dur_minutes.quantile(0.10):.2f} minutes")
        print(f"   25th percentile: {charge_dur_minutes.quantile(0.25):.2f} minutes")
        print(f"   75th percentile: {charge_dur_minutes.quantile(0.75):.2f} minutes")
        print(f"   95th percentile: {charge_dur_minutes.quantile(0.95):.2f} minutes")

    # 3. 等待时间分布（wait_dur）
    if 'wait_dur' in cq_df.columns:
        # 转换为分钟
        wait_dur_minutes = cq_df['wait_dur'] / 60

        print(f"\n3. Waiting Time Distribution:")
        print(f"   Total events: {len(wait_dur_minutes)}")
        print(f"   Events with wait > 0: {(wait_dur_minutes > 0).sum()} ({(wait_dur_minutes > 0).sum()/len(wait_dur_minutes)*100:.2f}%)")
        print(f"   Mean (all): {wait_dur_minutes.mean():.2f} minutes")
        print(f"   Mean (wait > 0): {wait_dur_minutes[wait_dur_minutes > 0].mean():.2f} minutes" if (wait_dur_minutes > 0).sum() > 0 else "   Mean (wait > 0): N/A")
        print(f"   Median: {wait_dur_minutes.median():.2f} minutes")
        print(f"   Max: {wait_dur_minutes.max():.2f} minutes")
        print(f"   5th percentile: {wait_dur_minutes.quantile(0.05):.2f} minutes")
        print(f"   75th percentile: {wait_dur_minutes.quantile(0.75):.2f} minutes")
        print(f"   90th percentile: {wait_dur_minutes.quantile(0.90):.2f} minutes")
        print(f"   95th percentile: {wait_dur_minutes.quantile(0.95):.2f} minutes")

    # 4. 可视化
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 4.1 每车每日平均充电次数分布
    if 'taxiid' in cq_df.columns and 'date' in cq_df.columns:
        axes[0, 0].hist(avg_charges_per_day.values, bins=50, color='steelblue', alpha=0.7, edgecolor='black')
        axes[0, 0].set_xlabel('Average Charges per Day')
        axes[0, 0].set_ylabel('Number of Vehicles')
        axes[0, 0].set_title('Distribution of Average Daily Charging Frequency')
        axes[0, 0].grid(axis='y', alpha=0.3)
        axes[0, 0].axvline(avg_charges_per_day.mean(), color='red', linestyle='--', label=f'Mean: {avg_charges_per_day.mean():.2f}')
        axes[0, 0].legend()

    # 4.2 充电时长分布
    if 'charge_dur' in cq_df.columns:
        charge_dur_minutes = cq_df['charge_dur'] / 60
        # 限制显示范围到合理值（例如0-180分钟）
        charge_dur_filtered = charge_dur_minutes[charge_dur_minutes <= 180]
        axes[0, 1].hist(charge_dur_filtered.values, bins=50, color='green', alpha=0.7, edgecolor='black')
        axes[0, 1].set_xlabel('Charging Duration (minutes)')
        axes[0, 1].set_ylabel('Number of Events')
        axes[0, 1].set_title('Distribution of Charging Duration at Station')
        axes[0, 1].grid(axis='y', alpha=0.3)
        axes[0, 1].axvline(charge_dur_minutes.mean(), color='red', linestyle='--', label=f'Mean: {charge_dur_minutes.mean():.2f} min')
        axes[0, 1].legend()

    # 4.3 等待时间分布（所有事件）
    if 'wait_dur' in cq_df.columns:
        wait_dur_minutes = cq_df['wait_dur'] / 60
        # 限制显示范围（例如0-120分钟）
        wait_dur_filtered = wait_dur_minutes[wait_dur_minutes <= 120]
        axes[1, 0].hist(wait_dur_filtered.values, bins=50, color='orange', alpha=0.7, edgecolor='black')
        axes[1, 0].set_xlabel('Waiting Time (minutes)')
        axes[1, 0].set_ylabel('Number of Events')
        axes[1, 0].set_title('Distribution of Waiting Time (All Events)')
        axes[1, 0].grid(axis='y', alpha=0.3)
        axes[1, 0].axvline(wait_dur_minutes.mean(), color='red', linestyle='--', label=f'Mean: {wait_dur_minutes.mean():.2f} min')
        axes[1, 0].legend()

    # 4.4 等待时间分布（仅等待时间>0的事件）
    if 'wait_dur' in cq_df.columns:
        wait_dur_minutes = cq_df['wait_dur'] / 60
        wait_positive = wait_dur_minutes[wait_dur_minutes > 0]
        if len(wait_positive) > 0:
            wait_positive_filtered = wait_positive[wait_positive <= 120]
            axes[1, 1].hist(wait_positive_filtered.values, bins=50, color='coral', alpha=0.7, edgecolor='black')
            axes[1, 1].set_xlabel('Waiting Time (minutes)')
            axes[1, 1].set_ylabel('Number of Events')
            axes[1, 1].set_title(f'Distribution of Waiting Time (Wait > 0, n={len(wait_positive)})')
            axes[1, 1].grid(axis='y', alpha=0.3)
            axes[1, 1].axvline(wait_positive.mean(), color='red', linestyle='--', label=f'Mean: {wait_positive.mean():.2f} min')
            axes[1, 1].legend()
        else:
            axes[1, 1].text(0.5, 0.5, 'No events with wait > 0', ha='center', va='center', transform=axes[1, 1].transAxes)
            axes[1, 1].set_title('Distribution of Waiting Time (Wait > 0)')

    plt.tight_layout()
    plt.show()

    print("\n" + "=" * 60)

else:
    print("Error: char_queue DataFrame is empty or file not found")

In [None]:
# 基本统计
if 'char_queue_df' in locals() and not char_queue_df.empty:
    print("=" * 60)
    print("排队和充电统计")
    print("=" * 60)

    print(f"\n总充电事件数: {len(char_queue_df):,}")
    print(f"涉及车辆数: {char_queue_df['taxiid'].nunique():,}")
    print(f"涉及充电站数: {char_queue_df['nearest_station_id'].nunique():,}")

    print(f"\n平均等待时间: {char_queue_df['wait_dur'].mean():.2f} 秒 ({char_queue_df['wait_dur'].mean()/60:.2f} 分钟)")
    print(f"中位数等待时间: {char_queue_df['wait_dur'].median():.2f} 秒 ({char_queue_df['wait_dur'].median()/60:.2f} 分钟)")
    print(f"95%分位数等待时间: {char_queue_df['wait_dur'].quantile(0.95):.2f} 秒 ({char_queue_df['wait_dur'].quantile(0.95)/60:.2f} 分钟)")

    print(f"\n放弃率: {char_queue_df['giveup'].mean():.2%}")
    print(f"放弃事件数: {char_queue_df['giveup'].sum():,}")

    successful_charges = char_queue_df[char_queue_df['giveup'] == False]
    if not successful_charges.empty:
        print(f"\n成功充电事件数: {len(successful_charges):,}")
        print(f"平均充电时长: {successful_charges['charge_dur'].mean()/60:.2f} 分钟")
        print(f"中位数充电时长: {successful_charges['charge_dur'].median()/60:.2f} 分钟")

    print("=" * 60)


In [None]:
# 每日充电次数统计
if 'char_queue_df' in locals() and not char_queue_df.empty:
    char_queue_df['charge_date'] = char_queue_df['arrive_time'].dt.date

    # 每辆车每天的成功充电次数
    daily_charges = char_queue_df[char_queue_df['giveup'] == False].groupby(['taxiid', 'charge_date']).size().reset_index(name='daily_charge_count')

    # 每辆车的平均每日充电次数
    avg_daily_per_taxi = daily_charges.groupby('taxiid')['daily_charge_count'].mean().reset_index(name='avg_daily_charges')

    overall_avg = avg_daily_per_taxi['avg_daily_charges'].mean()

    print(f"\n平均每日充电次数（每车）: {overall_avg:.2f} 次/天")
    print(f"中位数每日充电次数: {avg_daily_per_taxi['avg_daily_charges'].median():.2f} 次/天")


In [None]:
# SOC统计
if os.path.exists(BATTERY_TRACE_FILE):
    battery_trace = pd.read_parquet(BATTERY_TRACE_FILE)

    print("\n" + "=" * 60)
    print("SOC统计")
    print("=" * 60)

    print(f"\n充电前SOC统计:")
    print(f"  平均: {battery_trace['SOC_before_pct'].mean():.2f}%")
    print(f"  中位数: {battery_trace['SOC_before_pct'].median():.2f}%")
    print(f"  最小值: {battery_trace['SOC_before_pct'].min():.2f}%")

    low_soc = (battery_trace['SOC_before_pct'] < 20).sum()
    print(f"\n低电量充电次数 (SOC < 20%): {low_soc:,} ({low_soc/len(battery_trace)*100:.2f}%)")

    print(f"\n充电后SOC统计:")
    print(f"  平均: {battery_trace['SOC_after_pct'].mean():.2f}%")
    print(f"  中位数: {battery_trace['SOC_after_pct'].median():.2f}%")
    print(f"  最大值: {battery_trace['SOC_after_pct'].max():.2f}%")

    print("=" * 60)


### 7.2 可视化


In [None]:
# 等待时间分布
if 'char_queue_df' in locals() and not char_queue_df.empty:
    waiting_events = char_queue_df[char_queue_df['wait_dur'] > 0.1]  # 只显示有等待的

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # 等待时间直方图
    axes[0].hist(waiting_events['wait_dur'] / 60, bins=50, edgecolor='black', alpha=0.7)
    axes[0].set_xlabel('等待时间 (分钟)')
    axes[0].set_ylabel('事件数')
    axes[0].set_title('等待时间分布')
    axes[0].set_xlim(0, 120)  # 最多显示2小时

    # 总停留时长分布
    char_queue_df['stay_duration_min'] = (
        (char_queue_df['leave_time'] - char_queue_df['arrive_time']).dt.total_seconds() / 60
    )
    axes[1].hist(char_queue_df['stay_duration_min'], bins=50, color='skyblue', edgecolor='black', alpha=0.7)
    axes[1].set_xlabel('总停留时长 (分钟)')
    axes[1].set_ylabel('事件数')
    axes[1].set_title('总停留时长分布')
    axes[1].set_xlim(0, 180)  # 最多显示3小时

    plt.tight_layout()
    plt.show()


In [None]:
# SOC分布
if os.path.exists(BATTERY_TRACE_FILE):
    battery_trace = pd.read_parquet(BATTERY_TRACE_FILE)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # 充电前SOC分布
    axes[0].hist(battery_trace['SOC_before_pct'], bins=30, edgecolor='black', alpha=0.7, color='orange')
    axes[0].set_xlabel('SOC (%)')
    axes[0].set_ylabel('事件数')
    axes[0].set_title('充电前SOC分布')
    axes[0].axvline(20, color='red', linestyle='--', label='20% 低电量线')
    axes[0].legend()

    # 充电后SOC分布
    axes[1].hist(battery_trace['SOC_after_pct'], bins=30, edgecolor='black', alpha=0.7, color='green')
    axes[1].set_xlabel('SOC (%)')
    axes[1].set_ylabel('事件数')
    axes[1].set_title('充电后SOC分布')

    plt.tight_layout()
    plt.show()


In [None]:
# 两次充电间距离分布
if os.path.exists(ENERGY_GAP_FILE):
    energy_gap_df = pd.read_parquet(ENERGY_GAP_FILE)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # 距离分布
    axes[0].hist(energy_gap_df['distance_km'], bins=50, edgecolor='black', alpha=0.7, color='purple')
    axes[0].set_xlabel('距离 (km)')
    axes[0].set_ylabel('事件数')
    axes[0].set_title('两次充电间行驶距离分布')

    # 能耗分布
    axes[1].hist(energy_gap_df['energy_used_kWh'], bins=50, edgecolor='black', alpha=0.7, color='brown')
    axes[1].set_xlabel('能耗 (kWh)')
    axes[1].set_ylabel('事件数')
    axes[1].set_title('两次充电间能耗分布')

    plt.tight_layout()
    plt.show()


In [None]:
# 充电事件空间分布地图
if 'charging_events' in locals() and not charging_events.empty:
    # 计算地图中心
    center_lat = charging_events['stay_lat'].mean()
    center_lon = charging_events['stay_lon'].mean()

    # 创建地图
    m = folium.Map(location=[center_lat, center_lon], zoom_start=11, tiles='OpenStreetMap')

    # 添加充电站
    station_cluster = MarkerCluster(name='Charging Stations').add_to(m)
    for _, row in stations_df.iterrows():
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=5,
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6,
            popup=f"Station {row['station_id']}<br>Piles: {row['num_piles']}"
        ).add_to(station_cluster)

    # 添加充电事件（采样显示，避免过多）
    sample_size = min(1000, len(charging_events))
    charge_sample = charging_events.sample(n=sample_size) if len(charging_events) > sample_size else charging_events

    charge_cluster = MarkerCluster(name='Charging Events').add_to(m)
    for _, row in charge_sample.iterrows():
        folium.CircleMarker(
            location=[row['stay_lat'], row['stay_lon']],
            radius=3,
            color='red',
            fill=True,
            fill_color='red',
            fill_opacity=0.4,
            popup=f"Taxi: {row['taxiid']}<br>Duration: {row['duration_s']/60:.1f} min"
        ).add_to(charge_cluster)

    folium.LayerControl().add_to(m)
    m


## 8. 单车辆案例分析

可以在这里选择特定车辆进行详细分析


In [None]:
# 选择一个车辆进行详细分析
TID = 'UUUB0C0M7'  # 可以改成任意车辆ID

if 'charging_events' in locals():
    ce_tid = charging_events[charging_events['taxiid'] == TID].sort_values('start_time')

    if not ce_tid.empty:
        print(f"车辆 {TID} 的充电事件:")
        print(f"总充电次数: {len(ce_tid)}")
        print(ce_tid[['start_time', 'end_time', 'duration_s', 'nearest_station_id']])

        if os.path.exists(BATTERY_TRACE_FILE):
            battery_trace_tid = pd.read_parquet(BATTERY_TRACE_FILE)
            battery_trace_tid = battery_trace_tid[battery_trace_tid['taxiid'] == TID].sort_values('start_time')

            if not battery_trace_tid.empty:
                print(f"\nSOC轨迹:")
                print(battery_trace_tid[['start_time', 'SOC_before_pct', 'SOC_after_pct',
                                         'energy_used_kWh', 'energy_in_kWh']])
    else:
        print(f"车辆 {TID} 没有充电事件")
else:
    print("请先加载充电事件数据")
