In [9]:
import unittest
import os
import time
from xml.dom.minidom import parse
import xml.dom.minidom
import pandas as pd
import csv

In [3]:
# 1. 转换 sumo 的 string 格式的 edge id，node id 为 integer 的格式


In [19]:
# Capture network information from ".xml" and save into ".csv"

'''
SUMO 的道路网络中包含了很多连接信息，这行代码从中提取以下需要的信息：
1. edge ID，
2. edge ID 对应的特征（路长，限速，lanes number），
3. 连接 edge ID 的两个 node IDs，
'''

def network_data_exploration(filename):
    # Define path
    path = os.path.abspath('./../../../Traffic_Simulation_Data_Generation_for_Baselines/') 
    # Define data path
    data_path = os.path.join(path,filename) 
    print('Data path is:', data_path) 
    
    # Open ".xml" and find data
    DOMTree = xml.dom.minidom.parse(data_path) 
    data = DOMTree.documentElement 
    
    # Get element list
    nodeList = data.getElementsByTagName("edge")
    print("Number of edges are: ", len(nodeList))
    
    # Define and initilzie explored data
    all_rows = []
    
    for node in nodeList: 
        # Get features of each edge
        edge_ID = node.getAttribute("id")
        node_start = node.getAttribute("from")
        node_end = node.getAttribute("to")
        priority = node.getAttribute("priority")
        # Get element list of edge
        subNodeList = node.getElementsByTagName("lane")
        
        # Initialize features
        speed = 0
        length = 0
        for subNode in subNodeList:
            # Get features of each lane
            speed += float(subNode.getAttribute("speed"))
        
            length += float(subNode.getAttribute("length"))
        speed_avg = speed / len(subNodeList)
        length_avg = length / len(subNodeList)
        lane_num = len(subNodeList)
            
        newRow = {"edge_id": edge_ID, "node_start": node_start, "node_end": node_end, "lane_num":lane_num, "speed": speed_avg, "length": length_avg, "priority": priority}
        all_rows.append(newRow)
              
    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(all_rows)
    return df
 
    
# Call function to export network information data
df = network_data_exploration("test.net.xml")
df.head()
df.to_csv('Manhattan_network_raw.csv', index=False)

# Potential mistakes:
# Edges allow different types of vehicles but do not considered here.

Data path is: /Users/xuzizhuo/Desktop/Main Folder/My_works/Traffic_Simulation_Work/Traffic_Simulation_Data_Generation_for_Baselines/test.net.xml
Number of edges are:  29493


In [10]:
'''
构建 SUMO 路网信息映射成 int 格式的，Simulation Algorithm 可读的词典：
1. 把 string 结构的 node ID 转成 int 结构的：node_to_int，
2. 把 string 结构的 edge ID 转成 int 结构的：edge_to_int，
3. 把 int 结构的 edge ID 转成 string 结构的：int_to_edge，
4. 储存 int 结构的 node ID 和 edge ID 到 Manhattan_network_mapped.csv 中。
'''


# Map edge and node ID from 0 to their length

# Read data
df = pd.read_csv('Manhattan_network_raw.csv')

# Convert unique string ids of "node_start" and "node_end" to unique integers.
# Get a list of unique nodes
unique_nodes = pd.concat([df['node_start'], df['node_end']]).unique()
# Create a mapping of node string id to integer
node_to_int = {node: idx for idx, node in enumerate(unique_nodes)}
# Replace the string ids in the dataframe
df['node_start'] = df['node_start'].map(node_to_int)
df['node_end'] = df['node_end'].map(node_to_int)

# Convert "edge_id" to unique integers.
# Create a mapping of edge string id to integer
edge_to_int = {edge: idx for idx, edge in enumerate(df['edge_id'].unique())}
# Replace the string ids in the dataframe
df['edge_id'] = df['edge_id'].map(edge_to_int)


# 反转 edge_to_int 映射
int_to_edge = {v: k for k, v in edge_to_int.items()}


# Save mapped dataframe
df.to_csv('Manhattan_network_mapped.csv', index=False)

In [13]:
edge_to_int['727135244#3']

25897

In [14]:
int_to_edge[25897]

'727135244#3'

In [11]:
import xml.etree.ElementTree as ET
import networkx as nx
import os

def parse_sumo_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    edges = {}
    connections = []

    for edge in root.findall('.//edge'):
        edge_id = edge.get('id')
        edges[edge_id] = {
            'from': edge.get('from'),
            'to': edge.get('to')
        }

    for connection in root.findall('.//connection'):
        from_edge = connection.get('from')
        to_edge = connection.get('to')
        direction = connection.get('dir')  # 提取连接的方向
        connections.append((from_edge, to_edge, direction))

    return edges, connections

path = os.path.abspath('./../../../Traffic_Simulation_Data_Generation_for_Baselines/') 
xml_file = os.path.join(path,'test.net.xml') 
print('Data path is:', xml_file) 

edges, connections = parse_sumo_xml(xml_file)   

Data path is: /Users/xuzizhuo/Desktop/Main Folder/My_works/Traffic_Simulation_Work/Traffic_Simulation_Data_Generation_for_Baselines/test.net.xml


In [12]:
import csv

def save_connections_to_csv(connections, edge_to_int, filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['from_edge', 'to_edge', 'direction'])  # 写入标题行
        
        for from_edge, to_edge, direction in connections:
            from_edge_int = edge_to_int[from_edge]
            to_edge_int = edge_to_int[to_edge]
            writer.writerow([from_edge_int, to_edge_int, direction])

# 假设 xml_file 是你的XML文件路径，node_mapping 是已经创建的映射
edges, connections = parse_sumo_xml(xml_file)

# 现在我们假设 node_mapping 已经存在
# 例如：node_mapping = {'993510828#1': 1, '32973204#0': 2}

# 调用 save_connections_to_csv 来保存转换后的信息
save_connections_to_csv(connections, edge_to_int, 'connections_to_directions.csv')


In [8]:
# 2. 构建映射后的 int edge id 和其 static features 的文件


In [21]:
'''
生成 int 结构 edge ID -> lane_num，speed_limit，average_length 的词典 edge_id_to_features，
储存词典 edge_id_to_features 到 csv 文件，方便 C++ 读取。
'''

# 读取预处理后的数据
df = pd.read_csv('Manhattan_network_mapped.csv')

# 初始化一个空字典来存储整数 edge_id 映射到其特征的信息
edge_id_to_features = {}

# 遍历 DataFrame 的每一行
for index, row in df.iterrows():
    # 为当前 edge_id 存储 lane_num, speed, 和 length 的信息
    edge_id_to_features[row['edge_id']] = {
        "lane_num": row['lane_num'],
        "speed": row['speed'],
        "length": row['length'],
        "edge_str": int_to_edge[row['edge_id']]
    }

# 将字典保存为 CSV 文件
with open('edge_id_to_features.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["edge_id", "lane_num", "speed", "length", "edge_str"])  # 写入标题
    for edge_id, features in edge_id_to_features.items():
        writer.writerow([edge_id, features['lane_num'], features['speed'], features['length'], features['edge_str']])


In [22]:
'''
构建词典 nodes_to_edge_raw 和 edge_to_nodes_raw：
1. edge_to_nodes_raw：从 string 结构的 edge ID 映射成 string 结构的 node IDs
2. nodes_to_edge_raw：从 string 结构的 node IDs 映射成 string 结构的 edge ID
'''

# Map edge to node pairs & node pairs to edges

# 读取 CSV 文件
df = pd.read_csv('Manhattan_network_raw.csv')

# 初始化两个字典
edge_to_nodes_raw = {}  # edge_id 映射到 node_start 和 node_end
nodes_to_edge_raw = {}  # node_start 和 node_end 映射到 edge_id

# 遍历 DataFrame
for index, row in df.iterrows():
    edge_id = row['edge_id']
    node_start = row['node_start']
    node_end = row['node_end']

    # 建立 edge_id 到 node_start 和 node_end 的映射
    edge_to_nodes_raw[edge_id] = (node_start, node_end)

    # 建立 node_start 和 node_end 到 edge_id 的映射
    nodes_to_edge_raw[(node_start, node_end)] = edge_id

# 测试输出
print("Edge to Nodes Mapping:", edge_to_nodes_raw["-1004369132#0"])
print("Nodes to Edge Mapping:", nodes_to_edge_raw[(7480410399, 42437990)])

Edge to Nodes Mapping: (7480410399, 42437990)
Nodes to Edge Mapping: -1004369132#0


In [23]:
'''
构建词典 nodes_to_edge_mapped 和 edge_to_nodes_mapped：
1. edge_to_nodes_mapped：从 int 结构的 edge ID 映射成 int 结构的 node IDs
2. nodes_to_edge_mapped：从 int 结构的 node IDs 映射成 int 结构的 edge ID
'''


# Map edge to node pairs & node pairs to edges

# 读取 CSV 文件
df = pd.read_csv('Manhattan_network_mapped.csv')

# 初始化两个字典
edge_to_nodes_mapped = {}  # edge_id 映射到 node_start 和 node_end
nodes_to_edge_mapped = {}  # node_start 和 node_end 映射到 edge_id

# 遍历 DataFrame
for index, row in df.iterrows():
    edge_id = row['edge_id']
    node_start = row['node_start']
    node_end = row['node_end']

    # 建立 edge_id 到 node_start 和 node_end 的映射
    edge_to_nodes_mapped[edge_id] = (node_start, node_end)

    # 建立 node_start 和 node_end 到 edge_id 的映射
    nodes_to_edge_mapped[(node_start, node_end)] = edge_id

# 测试输出
# print("Edge to Nodes Mapping:", edge_to_nodes_mapped[0])
# print("Nodes to Edge Mapping:", nodes_to_edge_mapped[(0, 9115)])

In [24]:
nodes_to_edge_mapped[(12573, 9504)]
int_to_edge[16133]

# edge id="46201646" from="370913784" to="370914027"

'46201646'

In [25]:
nodes_to_edge_mapped[(9504, 9520)]
int_to_edge[28341]

# edge id="960097006" from="370914027" to="589099584"

'961976870'

In [26]:
'''
把 SUMO 的路网结构存成 Simulation Algorithm 可读的
'''

import pandas as pd
import csv

# Paths for the input and output files
csv_file_path = 'Manhattan_network_mapped.csv'
txt_file_path1 = 'Manhattan_network_BJ.txt'
txt_file_path2 = 'Manhattan_network_min_Travel_Time.txt'

# Loading the CSV file
df = pd.read_csv(csv_file_path)

# Calculating unique nodes and edges
unique_nodes = pd.concat([df['node_start'], df['node_end']]).unique()
unique_edges = df['edge_id'].unique()

# The two integer values to be added at the beginning of each .txt file
int_val_1 = len(unique_nodes)
print("Length of unqiue nodes: ", len(unique_nodes))
int_val_2 = len(unique_edges)
print("Length of unique edges: ", len(unique_edges))

# Generating the first .txt file
with open(txt_file_path1, 'w') as txt_file1:
    # Writing the two integers to the TXT file
    txt_file1.write(f"{int(int_val_1)} {int(int_val_2)}\n")
    
    # Iterating over rows in the DataFrame to format and write each row according to specifications
    for index, row in df.iterrows():
        txt_file1.write(f"{int(row['node_start'])} {int(row['node_end'])} {int(row['edge_id'])} {round(row['length'],2)}\n")

# Generating the second .txt file
with open(txt_file_path2, 'w') as txt_file2:
    # Writing the two integers to the TXT file
    txt_file2.write(f"{int(int_val_1)} {int(int_val_2)}\n")
    
    # Iterating over rows in the DataFrame to format and write each row according to specifications
    for index, row in df.iterrows():
        travel_time = round(row['length'] / row['speed'],2)  # Calculating travel time as length_avg / speed_avg
        txt_file2.write(f"{int(row['node_start'])} {int(row['node_end'])} {travel_time}\n")


Length of unqiue nodes:  20853
Length of unique edges:  29493


In [27]:
# 3. 转换 route 数据为 sumo 可读的格式


In [28]:
'''
SUMO 在模拟结束后可以生成轨迹数据，
以下代码从轨迹数据中提取轨迹数据的相应信息
'''

# Capture trajectory data from ".xml" file to ".csv" file

def trajectory_information_capture(filename):
    #获取xml文件地址
    path = os.path.abspath('./../../../Traffic_Simulation_Data_Generation_for_Baselines/0/') 
    #获取xml文件地址
    data_path = os.path.join(path,filename) 
    
    # 打开xml文档
    DOMTree = xml.dom.minidom.parse(data_path) 
    # 根据xml文档，得到文档元素的对象
    data = DOMTree.documentElement 
    
    # 获取节点列表
    nodeList = data.getElementsByTagName("vehicle")
    # 定义route数量
    nodeLen = len(nodeList)
    print("Length of routes are: ", nodeLen)
    
    all_rows = []
    routeID_set = set()
    for node in nodeList: 
        # 获取当前节点属性值
        route_ID = node.getAttribute("id")
        depart_time = node.getAttribute("depart")
        # arrival_time = node.getAttribute("arrival")
        
        subNodeList = node.getElementsByTagName("route")
        
        for subNode in subNodeList:
            if len(subNodeList) != 1:
                print("Error.")
            
            # 获取当前节点属性值
            route = subNode.getAttribute("edges")
            # exit_times = subNode.getAttribute("exitTimes")
            # 拆分字符串
            ids = route.split()
            # 提取第一个和最后一个ID
            start_node = ids[0]
            end_node = ids[-1]
        
        # newRow = {"route_id": route_ID, "depart_time": depart_time, "arrival_time": arrival_time, 
        #           "start_edge": start_node, "end_edge": end_node, "route_by_edge": route, "exit_times": exit_times}
        
        newRow = {"route_id": route_ID, "depart_time": depart_time,
                  "start_edge": start_node, "end_edge": end_node, "route_by_edge": route}
        
        routeID_set.add(route_ID)
        all_rows.append(newRow)

    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(all_rows)
    print(f"Number of route is: {len(routeID_set)}")
    return df

# df_trajectory = trajectory_information_capture("outputfile.xml")
df_trajectory = trajectory_information_capture("merged_trips.trips.xml")

df_trajectory.to_csv('Manhattan_trajectory_raw.csv', index=False)

Length of routes are:  192484
Number of route is: 192484


In [29]:
# Convert raw edge IDs constructed route into mapped node IDs
# e.g. raw_edge_1 raw_edge_2 ... -> mapped_node_1 mapped_node_2 mapped_node_3

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_raw.csv')

# 初始化新列
df['route_by_mappped_node'] = None

# 遍历 DataFrame
for index, row in df.iterrows():
    # 分割 route_by_edge 字符串并应用映射
    node_pairs = [edge_to_nodes_mapped[edge_to_int[edge_id]] for edge_id in row['route_by_edge'].split(' ') if edge_id in edge_to_int]

    # 处理节点序列，确保节点不重复
    node_sequence = []
    for pair in node_pairs:
        # 确保节点转换为字符串
        node_start = str(pair[0])
        node_end = str(pair[1])

        if not node_sequence or node_sequence[-1] != node_start:
            node_sequence.append(node_start)
        node_sequence.append(node_end)

    # 更新新列
    df.at[index, 'route_by_mappped_node'] = ' '.join(node_sequence)

# 保存到新的 CSV 文件
df.to_csv('Manhattan_trajectory_mapped_node.csv', index=False)
df.head()

Unnamed: 0,route_id,depart_time,start_edge,end_edge,route_by_edge,route_by_mappped_node
0,1,0.0,25167444,-195743156#0,25167444 -194923762#1 -194923762#0 -194923763#...,9122.0 716.0 715.0 720.0 719.0 718.0 717.0 127...
1,2,0.0,204085726#0,5671345#12,204085726#0 204085726#1 204085726#2 1016808887...,8517.0 8518.0 8519.0 4960.0 4961.0 10583.0 105...
2,3,0.0,346951422#0,945375290#3,346951422#0 346951422#1 154716102#0 154716102#...,2763.0 1581.0 1582.0 7320.0 1627.0 2627.0 2626...
3,4,0.0,953819149#0,5671661#2,953819149#0 953819149#1 953819152 953819150 11...,19852.0 19853.0 19851.0 17508.0 4782.0 365.0 3...
4,5,0.0,-46334665#3,46577943,-46334665#3 -46334665#2 -46334665#1 -46334665#...,2551.0 2550.0 2549.0 2548.0 271.0 270.0 269.0 ...


In [31]:
import os
import pandas as pd

# 使用示例
path = os.path.abspath('./../../../Traffic_Simulation_Data_Generation_for_Baselines/0/') 
# Define data path
input_csv_path = os.path.join(path, 'TraCI_output_adjusted.csv')

# Read the CSV data
df_ETA = pd.read_csv(input_csv_path)

# 选择相关列
df = df_ETA[['Vehicle_ID', 'E_Length', 'Edge_ID', 'Speed_Net', 'Time', 'Travel_Time', 'Delay_Time', 'LowSpee_Time']]

# Add Wait_Sum to Travel_Time
df['Travel_Time'] = pd.to_numeric(df['Travel_Time'], errors='coerce') + pd.to_numeric(df['Delay_Time'], errors='coerce') + pd.to_numeric(df['LowSpee_Time'], errors='coerce')

df.loc[df['E_Length'] / df['Speed_Net'] < 1, 'Travel_Time'] = 0

# 对于不同 Edge_ID 的 Travel_Time 求平均，构建词典，key 是 Edge_ID，value 是平均 Travel_Time
average_travel_time_dict = df.groupby('Edge_ID')['Travel_Time'].mean().to_dict()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Travel_Time'] = pd.to_numeric(df['Travel_Time'], errors='coerce') + pd.to_numeric(df['Delay_Time'], errors='coerce') + pd.to_numeric(df['LowSpee_Time'], errors='coerce')


In [16]:
# 验证：
# trips 文件中包含的 route 数据和 TraCI 转换出来的 route 数据的正确性是否对应


In [32]:
import pandas as pd

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_raw.csv')

# 初始化一个空字典
route_dict = {}

# 遍历每一行，构建词典
for index, row in df.iterrows():
    route_id = row['route_id']
    edge_ids = row['route_by_edge'].split(' ')  # 将 edge ids 拆分成列表
    route_dict[route_id] = edge_ids

# 输出词典
print(len(route_dict))


192484


In [34]:
import os
import pandas as pd

# 获取CSV文件地址
path = os.path.abspath('./../../../Traffic_Simulation_Data_Generation_for_Baselines/0/') 
data_path = os.path.join(path, 'TraCI_output_adjusted.csv')

# 载入CSV文件
df = pd.read_csv(data_path)

# 将 'Travel_Time' 计算为三列之和
df['Travel_Time'] = pd.to_numeric(df['Travel_Time'], errors='coerce') + \
                    pd.to_numeric(df['Delay_Time'], errors='coerce') + \
                    pd.to_numeric(df['LowSpee_Time'], errors='coerce')

# 将 E_Length / Speed_Net 小于 1 的行的 Travel_Time 设为 0
df.loc[df['E_Length'] / df['Speed_Net'] < 1, 'Travel_Time'] = 0

# 将 'Delay_Time', 'LowSpee_Time', 'Wait_Time' 转换为 0 和 1 的形式
df[['Delay_Time', 'LowSpee_Time', 'Wait_Time']] = df[['Delay_Time', 'LowSpee_Time', 'Wait_Time']].applymap(lambda x: 0 if x == 0 else 1)

# 初始化计数器
row_count = 0
    
# 构建 vehicle_edge_time_dict 词典
vehicle_edge_time_dict = {}
for index, row in df.iterrows():
    vehicle_id = row['Vehicle_ID']
    edge_id = row['Edge_ID']
    
    # 构建内层词典
    time_data = {
        'Delay_Time': row['Delay_Time'],
        'LowSpee_Time': row['LowSpee_Time'],
        'Wait_Time': row['Wait_Time'],
        'Travel_Time': row['Travel_Time']
    }
    
    # 如果该 vehicle_id 已存在于字典中
    if vehicle_id in vehicle_edge_time_dict:
        # 如果 edge_id 还没有被记录过，添加它
        if edge_id not in vehicle_edge_time_dict[vehicle_id]:
            vehicle_edge_time_dict[vehicle_id][edge_id] = time_data
        else:
            # 如果 edge_id 已存在，可以选择更新数据或者跳过
            print(f"Edge {edge_id} 已存在于 vehicle {vehicle_id} 中")
    else:
        # 如果该 vehicle_id 不存在，初始化一个新的词典
        vehicle_edge_time_dict[vehicle_id] = {edge_id: time_data}


    # 更新计数器
    row_count += 1

    # 每处理100万行输出进度
    if row_count % 5000000 == 0:
        print(f"已处理 {row_count} 行数据")

  df[['Delay_Time', 'LowSpee_Time', 'Wait_Time']] = df[['Delay_Time', 'LowSpee_Time', 'Wait_Time']].applymap(lambda x: 0 if x == 0 else 1)


已处理 5000000 行数据
已处理 10000000 行数据
已处理 15000000 行数据
已处理 20000000 行数据
已处理 25000000 行数据
已处理 30000000 行数据
已处理 35000000 行数据
已处理 40000000 行数据


In [35]:
# 1. 验证两个词典的长度是否相同
if len(route_dict) != len(vehicle_edge_time_dict):
    print(f"Error: 词典长度不一致 - route_dict 长度: {len(route_dict)}, vehicle_edge_time_dict 长度: {len(vehicle_edge_time_dict)}")
else:
    print("验证1:词典长度一致")

# 2. 验证每个 route_id 的 edge ids 是否包含重复
def check_duplicates(route_dict):
    for route_id, edge_ids in route_dict.items():
        if len(edge_ids) != len(set(edge_ids)):
            print(f"Error: route_id {route_id} 中存在重复的 edge ids")
        # else:
        #     print(f"route_id {route_id} 中不存在重复的 edge ids")

# 检查 route_dict 是否有重复的 edge ids
print("验证2.1 : 检查 route_dict 中的 edge ids 是否包含重复:")
check_duplicates(route_dict)

'''
# 检查 vehicle_edge_time_dict 中的 edge ids 是否包含重复
print("检查 vehicle_edge_time_dict 中的 edge ids 是否包含重复:")
for route_id, edge_dict in vehicle_edge_time_dict.items():
    edge_ids = list(edge_dict.keys())
    if len(edge_ids) != len(set(edge_ids)):
        print(f"Error: route_id {route_id} 中存在重复的 edge ids")
    else:
        print(f"route_id {route_id} 中不存在重复的 edge ids")
'''

# 3. 验证 route_dict 的 edge ids 是否包含 vehicle_edge_time_dict 中的 edge ids
print("验证3: 检查 route_dict 是否包含 vehicle_edge_time_dict 的 edge ids:")
for route_id, edge_dict in vehicle_edge_time_dict.items():
    if route_id in route_dict:
        route_edges = set(route_dict[route_id])
        vehicle_edges = set(edge_dict.keys())
        
        # 如果 route_dict 的 edge ids 不完全包含 vehicle_edge_time_dict 的 edge ids，打印错误
        if not vehicle_edges.issubset(route_edges):
            print(f"Error: route_id {route_id} 的 edge ids 不完全包含在 route_dict 中")
        # else:
        #     print(f"route_id {route_id} 的 edge ids 完全包含在 route_dict 中")
    else:
        print(f"Error: route_id {route_id} 不存在于 route_dict 中")


验证1:词典长度一致
验证2.1 : 检查 route_dict 中的 edge ids 是否包含重复:
验证3: 检查 route_dict 是否包含 vehicle_edge_time_dict 的 edge ids:


In [36]:
# 补全数据


In [37]:
# 遍历 route_id 并处理每个 route_id 下的 edge ids
for route_id in route_dict:
    
    # 1. 获取 route_dict 中的 edge ids
    route_edges = route_dict[route_id]  # 这个是完整的 edge ids 列表
    
    # 2. 检查 vehicle_edge_time_dict 中是否有这个 route_id
    if route_id in vehicle_edge_time_dict:
        # 获取 vehicle_edge_time_dict 中当前 route_id 下的 edge ids
        vehicle_edges = list(vehicle_edge_time_dict[route_id].keys())
    else:
        # 如果 vehicle_edge_time_dict 中没有该 route_id，初始化一个空的嵌套字典
        print(f"Error: route_id {route_id} 不存在于 vehicle_edge_time_dict 中")
        vehicle_edge_time_dict[route_id] = {}
        vehicle_edges = []

    # 3. 找出缺失的 edge ids：在 route_dict 中但不在 vehicle_edge_time_dict 中的 edge ids
    missing_edges = [edge_id for edge_id in route_edges if edge_id not in vehicle_edges]
    
    # 4. 对于缺失的 edge ids，补全 vehicle_edge_time_dict 中的值
    for edge_id in missing_edges:
        # 初始化为0的值，包含所有字段
        vehicle_edge_time_dict[route_id][edge_id] = {
            'Delay_Time': 0,
            'LowSpee_Time': 0,
            'Wait_Time': 0,
            'Travel_Time': 0
        }

    # 5. 保持顺序（可选）：将 vehicle_edge_time_dict 的 edge ids 按照 route_dict 的顺序排列
    # 我们先获取完整的 edge ids 顺序列表
    ordered_edges = {edge_id: vehicle_edge_time_dict[route_id][edge_id] for edge_id in route_edges}
    
    # 将有序的 edge ids 重新赋值回 vehicle_edge_time_dict 中
    vehicle_edge_time_dict[route_id] = ordered_edges

# 最终检查结果，确认每个 route_id 的 edge ids 是否已经被补全
for route_id in vehicle_edge_time_dict:
    if not set(vehicle_edge_time_dict[route_id].keys()) == set(route_dict[route_id]):
        print(f"Error: route_id {route_id} 未能成功补全")

In [22]:
# baseline：
# routeETA 结果

In [38]:
import numpy as np

# 设定参数 N
N = 50

# 初始化两个列表，用于存储 truth 和 预测的 travel time 总和
truth_values = []
predicted_values = []

# 遍历 vehicle_edge_time_dict 中的每个 route_id
for route_id, edge_time_dict in vehicle_edge_time_dict.items():
    
    # 获取该 route_id 下的所有 edge ids 列表
    route_edges = list(edge_time_dict.keys())
    
    # 取前 N 个 edge ids，如果总长度小于 N，则取所有 edge
    selected_edges = route_edges[:N]
    
    # 计算 truth: 取前 N 个 edge 的 Travel_Time 之和
    truth_travel_time = sum(edge_time_dict[edge_id]['Travel_Time'] for edge_id in selected_edges)
    
    # 计算预测值：根据前 N 个 edge 从 average_travel_time_dict 中取 travel time 的和
    predicted_travel_time = sum(average_travel_time_dict.get(edge_id, 0) for edge_id in selected_edges)
    
    # 将 truth 和预测值分别添加到列表中
    truth_values.append(truth_travel_time)
    predicted_values.append(predicted_travel_time)

# 计算 MAE (Mean Absolute Error)
mae = np.mean(np.abs(np.array(truth_values) - np.array(predicted_values)))

# 输出结果
print(f"MAE (Mean Absolute Error): {mae}")


MAE (Mean Absolute Error): 28.172839039487258


In [39]:
import numpy as np

# 设定参数 N
N = 50

# 初始化一个列表，用于存储每条 route 的总误差
route_errors = []

# 遍历 vehicle_edge_time_dict 中的每个 route_id
for route_id, edge_time_dict in vehicle_edge_time_dict.items():
    
    # 获取该 route_id 下的所有 edge ids 列表
    route_edges = list(edge_time_dict.keys())
    
    # 取前 N 个 edge ids，如果总长度小于 N，则取所有 edge
    selected_edges = route_edges[:N]
    
    # 初始化该 route 的总误差
    route_total_error = 0
    
    # 遍历每一个 edge，分别计算每个 edge 的 travel time 的绝对误差
    for edge_id in selected_edges:
        # 真实值：Travel_Time
        truth_travel_time = edge_time_dict[edge_id]['Travel_Time']
        
        # 预测值：从 average_travel_time_dict 获取，如果没有，则默认值为 0
        predicted_travel_time = average_travel_time_dict.get(edge_id, 0)
        
        # 计算该 edge 的绝对误差
        edge_error = abs(truth_travel_time - predicted_travel_time)
        
        # 将该 edge 的误差累加到该 route 的总误差中
        route_total_error += edge_error
    
    # 将该 route 的总误差添加到列表中
    route_errors.append(route_total_error)

# 计算 MAE (Mean Absolute Error)，对所有 route 的误差取平均值
mae = np.mean(route_errors)

# 输出结果
print(f"MAE (Mean Absolute Error): {mae}")


MAE (Mean Absolute Error): 55.35518095859286


In [None]:
# 4. 生成输入的 route，query，以及 travel time 数据

In [None]:
# 同时生成 route 数据和 time 数据的代码
# route.txt 中，每行第一个值是 node 长度，其次是 start_node 以及对应的时间 bool 信息
# time.txt 中，每行第一个值是 vehicle id，travel time 值的长度，以及后面的 travel time 值作为 truth


In [40]:
# 先根据 route_id 对 vehicle_edge_time_dict 进行排序
vehicle_edge_time_dict = dict(sorted(vehicle_edge_time_dict.items(), key=lambda item: item[0]))


In [41]:
# 打开 route.txt 文件进行写入
with open('route.txt', 'w') as route_file, open('time.txt', 'w') as time_file:
    
    # 遍历 vehicle_edge_time_dict 中的每个 route_id
    for idx, (route_id, edge_time_dict) in enumerate(vehicle_edge_time_dict.items()):
        
        # 判断 idx 是否与 route_id 相等
        if idx+1 != route_id:
            print(f"Mismatch: idx {idx} does not match route_id {route_id}")
        
        # 获取该 route_id 下的所有 edge ids 列表
        edge_ids = list(edge_time_dict.keys())
        
        # 写入 route.txt 文件中的 edge ids 长度
        route_file.write(f"{len(edge_ids) + 1} ")  # +1 因为我们需要最后写入 second_node
        
        # 写入 time.txt 文件的 route_id 和 edge ids 的长度
        # time_file.write(f"{route_id} {len(edge_ids)} ")
        time_file.write(f"{len(edge_ids)} ")
        
        # 遍历每个 edge_id，获取对应的时间数据
        for edge_id in edge_ids:
            # 使用 edge_to_int 将 edge_id 转换为整数
            if edge_id in edge_to_int:
                edge_int = edge_to_int[edge_id]
            else:
                # 如果 edge_id 不在 edge_to_int 中，打印错误信息并跳过该 edge_id
                print(f"Error: edge_id {edge_id} 不存在于 edge_to_int 词典中")
                continue  # 跳过该 edge_id
            
            # 使用 edge_to_nodes_mapped 词典将 edge_int 转换为 node pair
            if edge_int in edge_to_nodes_mapped:
                first_node, second_node = edge_to_nodes_mapped[edge_int]
            else:
                # 如果 edge_int 不在 edge_to_nodes_mapped 中，打印错误信息并跳过该 edge_id
                print(f"Error: edge_int {edge_int} 不存在于 edge_to_nodes_mapped 词典中")
                continue  # 跳过该 edge_id
            
            # 获取该 edge 的时间数据
            delay_time = edge_time_dict[edge_id]['Delay_Time']
            low_speed_time = edge_time_dict[edge_id]['LowSpee_Time']
            wait_time = edge_time_dict[edge_id]['Wait_Time']
            travel_time = edge_time_dict[edge_id]['Travel_Time']
            
            # 将 first_node 及其 Delay_Time, LowSpee_Time, Wait_Time 写入 route.txt 文件
            route_file.write(f"{int(first_node)} {delay_time} {low_speed_time} {wait_time} ")
            
            # 将 Travel_Time 写入 time.txt 文件
            time_file.write(f"{travel_time} ")
        
        # 最后一个 edge 的 second_node 写入 route.txt 文件，三个时间值都赋 0
        if edge_ids:
            route_file.write(f"{int(second_node)} 0 0 0 ")
        
        # 每条 route 处理完后换行
        route_file.write('\n')
        time_file.write('\n')

print("Data has been written to route.txt and time.txt")


Data has been written to route.txt and time.txt


In [2]:
# 原始写入 query 数据的代码，但是没有经过验证
import pandas as pd

In [3]:
'''
写入 routes 的 query 数据
'''

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_mapped_node.csv')

df = df.sort_values(by='route_id')

# 打开一个新的文本文件用于写入数据
with open('query.txt', 'w') as file:
    # 遍历 DataFrame 中的每一行
    for index, row in df.iterrows():
        # 提取出发节点和目的地节点，并将它们转换为整数
        node_sequence = row['route_by_mappped_node'].split()
        departure_node = int(float(node_sequence[0]))  # 先转换为浮点数，再转换为整数
        destination_node = int(float(node_sequence[-1]))

        # 提取出发时间并转换为整数
        departure_time = int(row['depart_time'])

        # 将提取的信息按格式写入文件
        file.write(f"{departure_node} {destination_node} {departure_time}\n")

print("Data has been written to query.txt")

Data has been written to query.txt


In [None]:
# 原始写入 query 数据的代码 + 验证步骤


In [None]:
'''
import pandas as pd

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_mapped_node.csv')

df = df.sort_values(by='route_id')

# 打开一个新的文本文件用于写入数据
with open('query.txt', 'w') as file:
    # 遍历 DataFrame 中的每一行
    for index, row in df.iterrows():
        # 提取出发节点和目的地节点，并将它们转换为整数
        node_sequence = row['route_by_mappped_node'].split()
        departure_node = int(float(node_sequence[0]))  # 先转换为浮点数，再转换为整数
        destination_node = int(float(node_sequence[-1]))

        # 提取出发时间并转换为整数
        departure_time = int(row['depart_time'])

        # 获取当前 route_id
        route_id = row['route_id']
        
        # 通过 vehicle_edge_time_dict 获取该 route 的首尾 edge
        if route_id in vehicle_edge_time_dict:
            edge_ids = list(vehicle_edge_time_dict[route_id].keys())
            
            # 首个和最后一个 edge_id
            first_edge = edge_ids[0]
            last_edge = edge_ids[-1]
            
            # 获取首个 edge 的 first_node 和最后一个 edge 的 second_node
            if first_edge in edge_to_int and last_edge in edge_to_int:
                first_edge_int = edge_to_int[first_edge]
                last_edge_int = edge_to_int[last_edge]
                
                if first_edge_int in edge_to_nodes_mapped and last_edge_int in edge_to_nodes_mapped:
                    first_node = edge_to_nodes_mapped[first_edge_int][0]  # 获取 first_node
                    second_node = edge_to_nodes_mapped[last_edge_int][1]  # 获取 second_node
                else:
                    print(f"Error: Edge {first_edge_int} or {first_edge_int} 不存在于 edge_to_int 词典中")
                    continue
            else:
                print(f"Error: Edge {first_edge} or {last_edge} 不存在于 edge_to_int 词典中")
                continue  # 跳过该行
            
            # 验证 departure_node 和 destination_node 是否与首尾 node 相同
            if first_node != departure_node:
                print(f"Error: route_id {route_id} 的 departure_node 不匹配: 文件中的 {departure_node}, 词典中的 {first_node}")
            if second_node != destination_node:
                print(f"Error: route_id {route_id} 的 destination_node 不匹配: 文件中的 {destination_node}, 词典中的 {second_node}")
        
        else:
            print(f"Error: route_id {route_id} 不存在于 vehicle_edge_time_dict 中")
            continue  # 跳过该行

        # 将提取的信息按格式写入文件
        file.write(f"{departure_node} {destination_node} {departure_time}\n")

print("Data has been written to query.txt")
'''

In [26]:
'''
写入 routes 的 route 数据
'''

'''
# Capture route and stored into .txt file

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_mapped_node.csv')

df = df.sort_values(by='route_id')

# 打开一个新的文本文件用于写入数据
with open('route.txt', 'w') as file:
    # 遍历 DataFrame 中的每一行
    for index, row in df.iterrows():
        # 分割 node ID 序列并将每个节点 ID 转换为整数
        node_sequence = [int(float(node_id)) for node_id in row['route_by_mappped_node'].split()]
        node_count = len(node_sequence)

        # 将节点数量和整数形式的节点序列按格式写入文件
        file.write(f"{node_count} {' '.join(map(str, node_sequence))}\n")

print("Data has been written to route.txt")
'''

Data has been written to route.txt


In [24]:
'''
生成每个 edge 上对应出行时间的统计，
但是，包含车辆在红绿灯的等待时间，对我们希望忽略等待时间的影响的问题暂时没有帮助
'''

'''
import pandas as pd

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_mapped_node.csv')

# 打开一个新的文本文件用于写入数据
with open('time_counts_and_values.txt', 'w') as file:
    # 遍历 DataFrame 中的每一行
    for index, row in df.iterrows():
        # 分割时间数据并将每个时间数据转换为整数
        time_sequence = [int(float(time)) for time in row['exit_times'].split()]
        time_count = len(time_sequence)

        # 将时间数量和整数形式的时间数据按格式写入文件
        file.write(f"{time_count} {' '.join(map(str, time_sequence))}\n")

print("Data has been written to time_counts_and_values.txt")
'''

Data has been written to time_counts_and_values.txt


In [41]:
'''
生成处理短路径后的每条 route 的 travel time 作为 truth：可以设定 edges 长度
'''

'''
import pandas as pd
import os

#获取xml文件地址
path = os.path.abspath('./../../SUMO_data_generation/') 
#获取xml文件地址
data_path = os.path.join(path,'TraCI_output_adjusted.csv') 


# 载入CSV文件
df = pd.read_csv(data_path)

df['Travel_Time'] = pd.to_numeric(df['Travel_Time'], errors='coerce') + \
                    pd.to_numeric(df['Delay_Time'], errors='coerce') + \
                    pd.to_numeric(df['LowSpee_Time'], errors='coerce') 

df.loc[df['E_Length'] / df['Speed_Net'] < 1, 'Travel_Time'] = 0

# 将这 'Delay_Time', 'LowSpee_Time', 'Wait_Time' 转换为 0 和 1 的形式
df[['Delay_Time', 'LowSpee_Time', 'Wait_Time']] = df[['Delay_Time', 'LowSpee_Time', 'Wait_Time']].applymap(lambda x: 0 if x == 0 else 1)


# 构建一个词典嵌套词典 vehicle_edge_time_dict，外层 key 是 df 中的 'Vehicle_ID'，value 是 'Edge_ID', 
# 内层词典 key 是 'Edge_ID'，value 是 'Delay_Time', 'LowSpee_Time', 'Wait_Time', 'Travel_Time'

# 任务 1:

# 设定参数N（计算每个Vehicle_ID的前N个Travel_Time的和）
N = 50  

# 对于相同 vehicle id 对应的 vehicle_route_dict 和 vehicle_edge_time_dict，比较其 edge ids，
# 由于 vehicle_route_dict 的 edge ids 包含 vehicle_edge_time_dict 对应的 edge ids，
# 在 vehicle_route_dict 中找到第 N 个作为 target_id，在 vehicle_edge_time_dict 对应的 edge ids 中找到 target_id，
# 如果 target_id 在 edge ids 中的位置大于 N，打印报错，
# 合并 target_id 在 edge ids 中的所有 travel time 的和作为该 vehicle id 对应的 travel time，
# 并存入一个新的词典中，key 是 vehcile id，value 是 travel time




# 分组并计算每个组的Travel_Time和
grouped = df.groupby('Vehicle_ID')['Travel_Time'].apply(lambda x: x.head(N).sum()).reset_index()

# 按照 Vehicle_ID 进行排序
grouped = grouped.sort_values(by='Vehicle_ID')

# 计算Vehicle_ID的unique值的数量
unique_vehicle_ids = len(grouped)

# 准备要写入文件的数据
lines = [f"{unique_vehicle_ids} {N}\n"]
lines += [f"{row['Vehicle_ID']} {row['Travel_Time']}\n" for index, row in grouped.iterrows()]

# 写入到文本文件
with open('time_no_wait.txt', 'w') as f:
    f.writelines(lines)

print(f"数据处理完成，保留 {N} 长度的 route 数据写入 time_no_wait.txt文件。")
'''

'\nimport pandas as pd\nimport os\n\n#获取xml文件地址\npath = os.path.abspath(\'./../../SUMO_data_generation/\') \n#获取xml文件地址\ndata_path = os.path.join(path,\'TraCI_output_adjusted.csv\') \n\n\n# 载入CSV文件\ndf = pd.read_csv(data_path)\n\ndf[\'Travel_Time\'] = pd.to_numeric(df[\'Travel_Time\'], errors=\'coerce\') +                     pd.to_numeric(df[\'Delay_Time\'], errors=\'coerce\') +                     pd.to_numeric(df[\'LowSpee_Time\'], errors=\'coerce\') \n\ndf.loc[df[\'E_Length\'] / df[\'Speed_Net\'] < 1, \'Travel_Time\'] = 0\n\n# 将这 \'Delay_Time\', \'LowSpee_Time\', \'Wait_Time\' 转换为 0 和 1 的形式\ndf[[\'Delay_Time\', \'LowSpee_Time\', \'Wait_Time\']] = df[[\'Delay_Time\', \'LowSpee_Time\', \'Wait_Time\']].applymap(lambda x: 0 if x == 0 else 1)\n\n\n# 构建一个词典嵌套词典 vehicle_edge_time_dict，外层 key 是 df 中的 \'Vehicle_ID\'，value 是 \'Edge_ID\', \n# 内层词典 key 是 \'Edge_ID\'，value 是 \'Delay_Time\', \'LowSpee_Time\', \'Wait_Time\', \'Travel_Time\'\n\n# 任务 1:\n\n# 设定参数N（计算每个Vehicle_ID的前N个Travel_Time的和）\n

In [31]:
'''
生成处理短路径后的每条 route 的 travel time 作为 truth：默认 edges 长度
'''

'''
import pandas as pd
import pandas as pd
import os

#获取xml文件地址
path = os.path.abspath('./../../SUMO_data_generation/') 
#获取xml文件地址
data_path = os.path.join(path,'TraCI_output_adjusted.csv') 


# 载入CSV文件
df = pd.read_csv(data_path)

print(f'Records number is: {len(df)}')

df.loc[df['E_Length'] / df['Speed_Net'] < 1, 'Travel_Time'] = 0

print(f"Small edge number is: {(df['E_Length'] / df['Speed_Net'] < 1).sum()}")

# 设定参数N（None表示计算所有Travel_Time的和）
N = None  # 您可以根据需要更改这个值或设置为具体的数字

# 分组并计算每个组的Travel_Time和，根据N的值决定计算方式
if N is None:
    grouped = df.groupby('Vehicle_ID')['Travel_Time'].sum().reset_index()
else:
    grouped = df.groupby('Vehicle_ID')['Travel_Time'].apply(lambda x: x.head(N).sum()).reset_index()
    
# 按照 Vehicle_ID 进行排序
grouped = grouped.sort_values(by='Vehicle_ID')

# 计算Vehicle_ID的unique值的数量
unique_vehicle_ids = len(grouped)

# 准备要写入文件的数据
lines = [f"{unique_vehicle_ids} {N if N is not None else 0}\n"]
lines += [f"{row['Vehicle_ID']} {row['Travel_Time']}\n" for index, row in grouped.iterrows()]


# 写入到文本文件
with open('time_no_wait.txt', 'w') as f:
    f.writelines(lines)

print("数据处理完成，完整 route 数据写入 time_no_wait.txt文件。")
'''

Records number is: 3597154
Small edge number is: 1709371
数据处理完成，完整 route 数据写入 time_no_wait.txt文件。


In [30]:
'''
route 出行时间的统计
'''

'''
#获取xml文件地址
path = os.path.abspath('./../../SUMO_data_generation/') 
#获取xml文件地址
data_path = os.path.join(path,'TraCI_output_adjusted.csv') 


# 载入CSV文件
df = pd.read_csv(data_path)

print(f'Records number is: {len(df)}')

df.loc[df['E_Length'] / df['Speed_Net'] < 1, 'Travel_Time'] = 0

print(f"Small edge number is: {(df['E_Length'] / df['Speed_Net'] < 1).sum()}")


# 设定参数N（None表示计算所有Travel_Time的和）
N = None  # 您可以根据需要更改这个值或设置为具体的数字

# 分组并计算每个组的Travel_Time和，根据N的值决定计算方式
if N is None:
    grouped = df.groupby('Vehicle_ID')['Travel_Time'].sum().reset_index()
else:
    grouped = df.groupby('Vehicle_ID')['Travel_Time'].apply(lambda x: x.head(N).sum()).reset_index()
    
print(f'Max travel time is: {grouped["Travel_Time"].max()}')
print(f'Min travel time is: {grouped["Travel_Time"].min()}')
print(f'Travel time range difference is: {grouped["Travel_Time"].max() - grouped["Travel_Time"].min()}')

print(f'\n Trave time for each route is:')
print(grouped)
'''

Records number is: 39845224
Small edge number is: 18851703
Max travel time is: 2362
Min travel time is: 1
Travel time range difference is: 2361

 Trave time for each route is:
        Vehicle_ID  Travel_Time
0                0          985
1                1          371
2                2         1054
3                3          286
4                4          455
...            ...          ...
177682      177682          829
177683      177683          173
177684      177684          591
177685      177685         1456
177686      177686          233

[177687 rows x 2 columns]
