In [9]:
import unittest
import os
import time
from xml.dom.minidom import parse
import xml.dom.minidom
import pandas as pd
import csv

In [10]:
# Capture network information from ".xml" and save into ".csv"

def network_data_exploration(filename):
    # Define path
    path = os.path.abspath('./../../SUMO_data_generation/') 
    # Define data path
    data_path = os.path.join(path,filename) 
    print('Data path is:', data_path) 
    
    # Open ".xml" and find data
    DOMTree = xml.dom.minidom.parse(data_path) 
    data = DOMTree.documentElement 
    
    # Get element list
    nodeList = data.getElementsByTagName("edge")
    print("Number of edges are: ", len(nodeList))
    
    # Define and initilzie explored data
    all_rows = []
    
    for node in nodeList: 
        # Get features of each edge
        edge_ID = node.getAttribute("id")
        node_start = node.getAttribute("from")
        node_end = node.getAttribute("to")
        priority = node.getAttribute("priority")
        # Get element list of edge
        subNodeList = node.getElementsByTagName("lane")
        
        # Initialize features
        speed = 0
        length = 0
        for subNode in subNodeList:
            # Get features of each lane
            speed += float(subNode.getAttribute("speed"))
        
            length += float(subNode.getAttribute("length"))
        speed_avg = speed / len(subNodeList)
        length_avg = length / len(subNodeList)
        lane_num = len(subNodeList)
            
        newRow = {"edge_id": edge_ID, "node_start": node_start, "node_end": node_end, "lane_num":lane_num, "speed": speed_avg, "length": length_avg, "priority": priority}
        all_rows.append(newRow)
              
    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(all_rows)
    return df


# Call function to export network information data
df = network_data_exploration("test.net.xml")
df.head()
df.to_csv('Manhattan_network_raw.csv', index=False)

# Potential mistakes:
# Edges allow different types of vehicles but do not considered here.

Data path is: /Users/xuzizhuo/Desktop/Traffic Simulation Prediction/SUMO_data_generation/test.net.xml
Number of edges are:  29497


In [11]:
'''
把 string 结构的 node ID 转成 int 结构的：node_to_int，
把 string 结构的 edge ID 转成 int 结构的：edge_to_int，
把 int 结构的 edge ID 转成 string 结构的：int_to_edge，
储存 int 结构的 node ID 和 edge ID 到 Manhattan_network_mapped.csv 中。
'''


# Map edge and node ID from 0 to their length

# Read data
df = pd.read_csv('Manhattan_network_raw.csv')

# Convert unique string ids of "node_start" and "node_end" to unique integers.
# Get a list of unique nodes
unique_nodes = pd.concat([df['node_start'], df['node_end']]).unique()
# Create a mapping of node string id to integer
node_to_int = {node: idx for idx, node in enumerate(unique_nodes)}
# Replace the string ids in the dataframe
df['node_start'] = df['node_start'].map(node_to_int)
df['node_end'] = df['node_end'].map(node_to_int)

# Convert "edge_id" to unique integers.
# Create a mapping of edge string id to integer
edge_to_int = {edge: idx for idx, edge in enumerate(df['edge_id'].unique())}
# Replace the string ids in the dataframe
df['edge_id'] = df['edge_id'].map(edge_to_int)


# 反转 edge_to_int 映射
int_to_edge = {v: k for k, v in edge_to_int.items()}


# Save mapped dataframe
df.to_csv('Manhattan_network_mapped.csv', index=False)

In [12]:
'''
储存词典 edge_to_int 到本地文件。
'''

import pickle


def save_dict_to_file(dictionary, filename):
    with open(filename, 'wb') as f:
        pickle.dump(dictionary, f)

save_dict_to_file(edge_to_int, 'edge_to_int.pkl')

In [13]:
'''
生成 int 结构 edge ID -> lane_num，speed_limit，average_length 的词典 edge_id_to_features，
储存词典 edge_id_to_features 到 csv 文件，方便 C++ 读取。
'''

# 读取预处理后的数据
df = pd.read_csv('Manhattan_network_mapped.csv')

# 初始化一个空字典来存储整数 edge_id 映射到其特征的信息
edge_id_to_features = {}

# 遍历 DataFrame 的每一行
for index, row in df.iterrows():
    # 为当前 edge_id 存储 lane_num, speed, 和 length 的信息
    edge_id_to_features[row['edge_id']] = {
        "lane_num": row['lane_num'],
        "speed": row['speed'],
        "length": row['length']
    }

# 将字典保存为 CSV 文件
with open('edge_id_to_features.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["edge_id", "lane_num", "speed", "length"])  # 写入标题
    for edge_id, features in edge_id_to_features.items():
        writer.writerow([edge_id, features['lane_num'], features['speed'], features['length']])


In [34]:
# Map edge to node pairs & node pairs to edges

# 读取 CSV 文件
df = pd.read_csv('Manhattan_network_raw.csv')

# 初始化两个字典
edge_to_nodes_raw = {}  # edge_id 映射到 node_start 和 node_end
nodes_to_edge_raw = {}  # node_start 和 node_end 映射到 edge_id

# 遍历 DataFrame
for index, row in df.iterrows():
    edge_id = row['edge_id']
    node_start = row['node_start']
    node_end = row['node_end']

    # 建立 edge_id 到 node_start 和 node_end 的映射
    edge_to_nodes_raw[edge_id] = (node_start, node_end)

    # 建立 node_start 和 node_end 到 edge_id 的映射
    nodes_to_edge_raw[(node_start, node_end)] = edge_id

# 测试输出
print("Edge to Nodes Mapping:", edge_to_nodes_raw["-1004369132#0"])
print("Nodes to Edge Mapping:", nodes_to_edge_raw[(7480410399, 42437990)])

Edge to Nodes Mapping: (7480410399, 42437990)
Nodes to Edge Mapping: -1004369132#0


In [35]:
# Map edge to node pairs & node pairs to edges

# 读取 CSV 文件
df = pd.read_csv('Manhattan_network_mapped.csv')

# 初始化两个字典
edge_to_nodes_mapped = {}  # edge_id 映射到 node_start 和 node_end
nodes_to_edge_mapped = {}  # node_start 和 node_end 映射到 edge_id

# 遍历 DataFrame
for index, row in df.iterrows():
    edge_id = row['edge_id']
    node_start = row['node_start']
    node_end = row['node_end']

    # 建立 edge_id 到 node_start 和 node_end 的映射
    edge_to_nodes_mapped[edge_id] = (node_start, node_end)

    # 建立 node_start 和 node_end 到 edge_id 的映射
    nodes_to_edge_mapped[(node_start, node_end)] = edge_id

# 测试输出
# print("Edge to Nodes Mapping:", edge_to_nodes_mapped[0])
# print("Nodes to Edge Mapping:", nodes_to_edge_mapped[(0, 9115)])

In [41]:
nodes_to_edge_mapped[(12573, 9504)]
int_to_edge[16133]

# edge id="46201646" from="370913784" to="370914027"

'46201646'

In [44]:
nodes_to_edge_mapped[(9504, 9520)]
int_to_edge[28341]

# edge id="960097006" from="370914027" to="589099584"

'960097006'

In [37]:
import pandas as pd
import csv

# Paths for the input and output files
csv_file_path = 'Manhattan_network_mapped.csv'
txt_file_path1 = 'Manhattan_network_BJ.txt'
txt_file_path2 = 'Manhattan_network_min_Travel_Time.txt'

# Loading the CSV file
df = pd.read_csv(csv_file_path)

# Calculating unique nodes and edges
unique_nodes = pd.concat([df['node_start'], df['node_end']]).unique()
unique_edges = df['edge_id'].unique()

# The two integer values to be added at the beginning of each .txt file
int_val_1 = len(unique_nodes)
print("Length of unqiue nodes: ", len(unique_nodes))
int_val_2 = len(unique_edges)
print("Length of unique edges: ", len(unique_edges))

# Generating the first .txt file
with open(txt_file_path1, 'w') as txt_file1:
    # Writing the two integers to the TXT file
    txt_file1.write(f"{int(int_val_1)} {int(int_val_2)}\n")
    
    # Iterating over rows in the DataFrame to format and write each row according to specifications
    for index, row in df.iterrows():
        txt_file1.write(f"{int(row['node_start'])} {int(row['node_end'])} {int(row['edge_id'])} {round(row['length'],2)}\n")

# Generating the second .txt file
with open(txt_file_path2, 'w') as txt_file2:
    # Writing the two integers to the TXT file
    txt_file2.write(f"{int(int_val_1)} {int(int_val_2)}\n")
    
    # Iterating over rows in the DataFrame to format and write each row according to specifications
    for index, row in df.iterrows():
        travel_time = round(row['length'] / row['speed'],2)  # Calculating travel time as length_avg / speed_avg
        txt_file2.write(f"{int(row['node_start'])} {int(row['node_end'])} {travel_time}\n")


Length of unqiue nodes:  20859
Length of unique edges:  29497


In [10]:
# Capture trajectory data from ".xml" file to ".csv" file

def trajectory_information_capture(filename):
    #获取xml文件地址
    path = os.path.abspath('./../../SUMO_data_generation') 
    #获取xml文件地址
    data_path = os.path.join(path,filename) 
    
    # 打开xml文档
    DOMTree = xml.dom.minidom.parse(data_path) 
    # 根据xml文档，得到文档元素的对象
    data = DOMTree.documentElement 
    
    # 获取节点列表
    nodeList = data.getElementsByTagName("vehicle")
    # 定义route数量
    nodeLen = len(nodeList)
    print("Length of routes are: ", nodeLen)
    
    all_rows = []
    
    for node in nodeList: 
        # 获取当前节点属性值
        route_ID = node.getAttribute("id")
        depart_time = node.getAttribute("depart")
        arrival_time = node.getAttribute("arrival")
        
        subNodeList = node.getElementsByTagName("route")
        
        for subNode in subNodeList:
            if len(subNodeList) != 1:
                print("Error.")
            
            # 获取当前节点属性值
            route = subNode.getAttribute("edges")
            exit_times = subNode.getAttribute("exitTimes")
            # 拆分字符串
            ids = route.split()
            # 提取第一个和最后一个ID
            start_node = ids[0]
            end_node = ids[-1]
        
        newRow = {"route_id": route_ID, "depart_time": depart_time, "arrival_time": arrival_time, 
                  "start_edge": start_node, "end_edge": end_node, "route_by_edge": route, "exit_times": exit_times}
        all_rows.append(newRow)

    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(all_rows)
    return df

df_trajectory = trajectory_information_capture("outputfile.xml")
df_trajectory.to_csv('Manhattan_trajectory_raw.csv', index=False)

Length of routes are:  89


In [11]:
# Convert raw edge IDs constructed route into mapped node IDs
# e.g. raw_edge_1 raw_edge_2 ... -> mapped_node_1 mapped_node_2 mapped_node_3

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_raw.csv')

# 初始化新列
df['route_by_mappped_node'] = None

# 遍历 DataFrame
for index, row in df.iterrows():
    # 分割 route_by_edge 字符串并应用映射
    node_pairs = [edge_to_nodes_mapped[edge_to_int[edge_id]] for edge_id in row['route_by_edge'].split(' ') if edge_id in edge_to_int]

    # 处理节点序列，确保节点不重复
    node_sequence = []
    for pair in node_pairs:
        # 确保节点转换为字符串
        node_start = str(pair[0])
        node_end = str(pair[1])

        if not node_sequence or node_sequence[-1] != node_start:
            node_sequence.append(node_start)
        node_sequence.append(node_end)

    # 更新新列
    df.at[index, 'route_by_mappped_node'] = ' '.join(node_sequence)

# 保存到新的 CSV 文件
df.to_csv('Manhattan_trajectory_mapped_node.csv', index=False)
df.head()

Unnamed: 0,route_id,depart_time,arrival_time,start_edge,end_edge,route_by_edge,exit_times,route_by_mappped_node
0,7,53.0,309.0,254594899,815957441#0,254594899 243046576#0 243046576#1 -1013655882#...,71.00 76.00 90.00 92.00 99.00 103.00 106.00 11...,4302.0 1808.0 9070.0 28.0 27.0 26.0 25.0 24.0 ...
1,8,60.0,376.0,-196116967#1,33117604,-196116967#1 -196116967#0 -455687907#22 -45568...,90.00 99.00 102.00 102.00 103.00 106.00 106.00...,977.0 665.0 2287.0 2286.0 2285.0 2283.0 2282.0...
2,11,82.0,389.0,5671090,1025731530#1,5671090 760727197 760727187#0 760727187#1 1080...,91.00 98.00 101.00 102.00 103.00 106.00 109.00...,15920.0 18700.0 18698.0 18699.0 6187.0 209.0 2...
3,19,142.0,489.0,243046575#5,978645353#1,243046575#5 243046575#6 243046575#7 243046575#...,144.00 154.00 165.00 166.00 167.00 168.00 187....,4098.0 9067.0 9068.0 9069.0 20385.0 20386.0 20...
4,1,8.0,499.0,-481047237#1,421830414,-481047237#1 -481047237#0 -5672138#5 -5672138#...,17.00 17.00 18.00 33.00 35.00 36.00 44.00 52.0...,2828.0 2827.0 3591.0 3590.0 3589.0 3588.0 7339...


In [12]:
# Capture query and stored into .txt file

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_mapped_node.csv')

# 打开一个新的文本文件用于写入数据
with open('query.txt', 'w') as file:
    # 遍历 DataFrame 中的每一行
    for index, row in df.iterrows():
        # 提取出发节点和目的地节点，并将它们转换为整数
        node_sequence = row['route_by_mappped_node'].split()
        departure_node = int(float(node_sequence[0]))  # 先转换为浮点数，再转换为整数
        destination_node = int(float(node_sequence[-1]))

        # 提取出发时间并转换为整数
        departure_time = int(row['depart_time'])

        # 将提取的信息按格式写入文件
        file.write(f"{departure_node} {destination_node} {departure_time}\n")

print("Data has been written to query.txt")

Data has been written to query.txt


In [13]:
# Capture route and stored into .txt file

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_mapped_node.csv')

# 打开一个新的文本文件用于写入数据
with open('route.txt', 'w') as file:
    # 遍历 DataFrame 中的每一行
    for index, row in df.iterrows():
        # 分割 node ID 序列并将每个节点 ID 转换为整数
        node_sequence = [int(float(node_id)) for node_id in row['route_by_mappped_node'].split()]
        node_count = len(node_sequence)

        # 将节点数量和整数形式的节点序列按格式写入文件
        file.write(f"{node_count} {' '.join(map(str, node_sequence))}\n")

print("Data has been written to route.txt")

Data has been written to route.txt


In [14]:
# Capture time and stored into .txt file

import pandas as pd

# 读取 CSV 文件
df = pd.read_csv('Manhattan_trajectory_mapped_node.csv')

# 打开一个新的文本文件用于写入数据
with open('time_counts_and_values.txt', 'w') as file:
    # 遍历 DataFrame 中的每一行
    for index, row in df.iterrows():
        # 分割时间数据并将每个时间数据转换为整数
        time_sequence = [int(float(time)) for time in row['exit_times'].split()]
        time_count = len(time_sequence)

        # 将时间数量和整数形式的时间数据按格式写入文件
        file.write(f"{time_count} {' '.join(map(str, time_sequence))}\n")

print("Data has been written to time_counts_and_values.txt")


Data has been written to time_counts_and_values.txt
