In [4]:
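'''
Build integer-keyed lookup dictionaries from the SUMO road-network table so that
the simulation algorithm can consume it:
1. node_to_int: string node ID -> int node ID,
2. edge_to_int: string edge ID -> int edge ID,
3. int_to_edge: int edge ID -> string edge ID,
4. the int node IDs and edge IDs are meant to be stored in Manhattan_network_mapped.csv
   (note: this cell only remaps the columns of df in memory; it does not write that file).
'''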

import os
import pandas as pd
import numpy as np

# Re-index node and edge IDs as consecutive integers starting at 0.

# Load the raw network table from the current working directory.
directory_path = os.path.abspath('./')
path = os.path.join(directory_path, 'Manhattan_network_raw.csv')
df = pd.read_csv(path)
print(len(df))

# Nodes: collect every ID that appears as either endpoint (in order of first
# appearance, start column before end column) and number them 0..N-1.
unique_nodes = pd.concat([df['node_start'], df['node_end']]).unique()
node_to_int = dict(zip(unique_nodes, range(len(unique_nodes))))
# Overwrite both endpoint columns with their integer IDs.
for endpoint_column in ('node_start', 'node_end'):
    df[endpoint_column] = df[endpoint_column].map(node_to_int)

# Edges: same numbering scheme, in order of first appearance in the table.
unique_edges = df['edge_id'].unique()
edge_to_int = dict(zip(unique_edges, range(len(unique_edges))))
df['edge_id'] = df['edge_id'].map(edge_to_int)

# Inverse lookup: integer edge ID -> original string edge ID.
int_to_edge = dict(zip(edge_to_int.values(), edge_to_int.keys()))

29493


In [5]:
# Spot check: look up the original edge ID that was mapped to integer 25897.
int_to_edge[25897]

'727135244#3'

In [6]:
# Edge embedding

# 目前我们不需要这个

In [8]:
import pandas as pd
import numpy as np
import os

# Directory and file of the adjusted TraCI output for run "0".
path = os.path.abspath('./../../../Traffic_Simulation_Data_Generation_for_Baselines/0/')
data_path = os.path.join(path, 'TraCI_output_adjusted.csv')
print('Data path is:', data_path)

# Columns used downstream. ('Edge_ID' was part of an earlier variant of this
# list and is intentionally left out.)
selected_columns = ['Lanes_Net', 'Speed_Net', 'E_Length', 'Driving_Num', 'Travel_Time', 'Delay_Time', 'LowSpee_Time', 'Wait_Time']

# Parse only the needed columns instead of loading the whole file and
# discarding the rest afterwards; on a file with tens of millions of rows this
# noticeably reduces peak memory and parse time.
# `usecols` returns columns in file order, so re-select with the list to keep
# the exact column order of `selected_columns`.
dataFrame = pd.read_csv(data_path, usecols=selected_columns)[selected_columns]

# Report (but do not abort on) negative values in the time columns.
for col in ['Delay_Time', 'LowSpee_Time', 'Wait_Time']:
    if (dataFrame[col] < 0).any():
        print(f"Error: {col} contains values less than 0")

print("数据形状:", dataFrame.shape)

Data path is: /data/zxucj/Traffic_Simulation_Work/Traffic_Simulation_Data_Generation_for_Baselines/0/TraCI_output_adjusted.csv
数据形状: (42911018, 8)


In [None]:
# 特征处理


In [9]:
# Work on a copy so the loaded `dataFrame` stays untouched and this cell can be re-run.
df = dataFrame.copy()

# Travel_Time should be the sum of the normal-driving, delay (congestion) and
# low-speed times. Non-numeric entries are coerced to NaN.
df['Travel_Time'] = (
    pd.to_numeric(df['Travel_Time'], errors='coerce')
    + pd.to_numeric(df['Delay_Time'], errors='coerce')
    + pd.to_numeric(df['LowSpee_Time'], errors='coerce')
)

# Binarise the three time columns: 0 stays 0, anything else (NaN included)
# becomes 1. The vectorised `.ne(0)` gives the same result as the former
# element-wise `applymap(lambda x: 0 if x == 0 else 1)` without a Python-level
# call per cell, and avoids `applymap`, which is deprecated in pandas >= 2.1.
flag_columns = ['Delay_Time', 'LowSpee_Time', 'Wait_Time']
df[flag_columns] = df[flag_columns].ne(0).astype('int64')
# Alternative considered earlier: drop the three columns instead of binarising.
# df = df.drop(columns=['Delay_Time', 'LowSpee_Time', 'Wait_Time'])

# Very short edges (E_Length / Speed_Net below 1) get Travel_Time and the
# delay / low-speed flags reset to 0. Wait_Time is deliberately not reset,
# matching the original behaviour.
# Experimental: decide whether to keep this after evaluating the effect.
# The mask depends only on E_Length and Speed_Net, which are not modified
# here, so it is computed once and reused.
short_edge_mask = df['E_Length'] / df['Speed_Net'] < 1
for col in ('Travel_Time', 'Delay_Time', 'LowSpee_Time'):
    df.loc[short_edge_mask, col] = 0

# Store the binary flags as categorical features.
for col in flag_columns:
    df[col] = df[col].astype('category')

# Show the processed DataFrame.
print(df)

          Lanes_Net  Speed_Net  E_Length  Driving_Num  Travel_Time Delay_Time  \
0                 1      11.18      5.56            1            0          0   
1                 1      11.18      5.96            1            0          0   
2                 1      11.18      7.25            1            0          0   
3                 1      11.18      7.10            1            0          0   
4                 1      11.18      5.71            1            0          0   
...             ...        ...       ...          ...          ...        ...   
42911013          2      27.78     67.38            1            3          0   
42911014          2      27.78      9.54            1            0          0   
42911015          2      27.78     68.26            1            3          0   
42911016          2      27.78      9.04            1            0          0   
42911017          2      27.78      9.04            1            0          0   

         LowSpee_Time Wait_

In [10]:

# Derived length features. All three are computed from the raw (not yet
# rounded) edge length, before the integer conversion below.
raw_length = df['E_Length']
# Interaction between length and speed limit (length-to-speed ratio).
df['Length_Speed_Ratio'] = raw_length / df['Speed_Net']
# log(1 + length), to damp the influence of extreme lengths on the model.
df['Log_E_Length'] = np.log1p(raw_length)
# Quadratic length term.
df['E_Length_Squared'] = raw_length ** 2

# Round to the nearest value first, then cast to integer.
cols_to_convert = ['E_Length', 'Length_Speed_Ratio', 'Log_E_Length', 'E_Length_Squared', 'Speed_Net', 'Driving_Num']
rounded_block = df[cols_to_convert].round()
df[cols_to_convert] = rounded_block.astype(int)

# Disabled experiments, kept for reference:
# df['Edge_ID'] = df['Edge_ID'].astype(str)
# Total non-driving time:
# df['Total_NonDriving_Time'] = df['Delay_Time'] + df['LowSpee_Time'] + df['Wait_Time']
# Interaction of the 0/1 delay and wait flags:
# df['Delay_Wait_Interaction'] = df['Delay_Time'] * df['Wait_Time']

# Outlier removal: keep only rows whose Travel_Time is at or below the 99th percentile.
q_high = df['Travel_Time'].quantile(0.99)
within_cap = df['Travel_Time'] <= q_high
df = df.loc[within_cap]

# Disabled: standardisation of the continuous features.
# from sklearn.preprocessing import StandardScaler
# continuous_features = ['E_Length', 'Speed_Net', 'Driving_Num', 'Length_Speed_Ratio']
# scaler = StandardScaler()
# df[continuous_features] = scaler.fit_transform(df[continuous_features])

In [11]:
# Inspect the dtype of every column of df after the feature engineering above.
print(df.dtypes)

Lanes_Net                int64
Speed_Net                int64
E_Length                 int64
Driving_Num              int64
Travel_Time              int64
Delay_Time            category
LowSpee_Time          category
Wait_Time             category
Length_Speed_Ratio       int64
Log_E_Length             int64
E_Length_Squared         int64
dtype: object


In [12]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [21]:
# Collapse df to its distinct feature combinations: rows are duplicates when
# they agree on every column listed below (the first occurrence is kept).
dedup_columns = [
    'Lanes_Net', 'Speed_Net', 'E_Length', 'Driving_Num', 'Travel_Time',
    'Delay_Time', 'LowSpee_Time', 'Wait_Time',
    'Length_Speed_Ratio', 'Log_E_Length', 'E_Length_Squared',
]
unique_features = df.drop_duplicates(subset=dedup_columns)
# Number of distinct combinations.
print(len(unique_features))

447974


In [22]:
# Display the de-duplicated feature combinations (rows keep their original df index labels).
unique_features

Unnamed: 0,Lanes_Net,Speed_Net,E_Length,Driving_Num,Travel_Time,Delay_Time,LowSpee_Time,Wait_Time,Length_Speed_Ratio,Log_E_Length,E_Length_Squared
0,1,11,6,1,0,0,0,1,0,2,31
1,1,11,6,1,0,0,0,0,1,2,36
2,1,11,7,1,0,0,0,0,1,2,53
3,1,11,7,1,0,0,0,0,1,2,50
4,1,11,6,1,0,0,0,0,1,2,33
...,...,...,...,...,...,...,...,...,...,...,...
42906235,2,28,78,3,11,1,1,1,3,4,6126
42907401,5,11,62,1,9,0,1,0,6,4,3863
42909885,1,11,116,1,12,0,0,0,10,5,13561
42910527,2,11,65,2,10,1,1,1,6,4,4233


In [14]:
# Sanity check 1: unique_features must have the same column names, in the
# same order, as df.
columns_match = list(unique_features.columns) == list(df.columns)
if columns_match:
    print("列名完全匹配")
else:
    print("列名不匹配")
    print("unique_features 列名:", list(unique_features.columns))
    print("df 列名:", list(df.columns))

# Sanity check 2: the per-column dtypes must be identical.
# The element-wise comparison of the two dtype Series is only attempted when
# the column labels match: comparing differently-labelled Series raises
# ValueError in pandas, which would hide the mismatch report below.
dtypes_match = columns_match and (unique_features.dtypes == df.dtypes).all()
if dtypes_match:
    print("数据类型完全匹配")
else:
    print("数据类型不匹配")
    print("unique_features 数据类型:\n", unique_features.dtypes)
    print("df 数据类型:\n", df.dtypes)


列名完全匹配
数据类型完全匹配


In [23]:
# List the column names of unique_features.
print(unique_features.columns)

Index(['Lanes_Net', 'Speed_Net', 'E_Length', 'Driving_Num', 'Travel_Time',
       'Delay_Time', 'LowSpee_Time', 'Wait_Time', 'Length_Speed_Ratio',
       'Log_E_Length', 'E_Length_Squared'],
      dtype='object')


In [25]:
# Load the pre-trained AutoGluon predictor from disk.
model_directory_path = os.path.abspath('./../../../Traffic_Simulation_Data_Generation_for_Model_Training/Model_Training/')
model_path = os.path.join(model_directory_path, 'AutogluonModels/ag-20241004_160814')
predictor = TabularPredictor.load(model_path, require_py_version_match=False)

# Predict once for every distinct feature combination.
predictions = predictor.predict(unique_features)

# Columns written to the output file, in this order.
feature_columns = [
    'Lanes_Net', 'Speed_Net', 'E_Length', 'Driving_Num', 'Travel_Time',
    'Delay_Time', 'LowSpee_Time', 'Wait_Time',
    'Length_Speed_Ratio', 'Log_E_Length', 'E_Length_Squared',
]

# Align predictions to the feature rows by index label in a single lookup
# (raises KeyError if a label is missing), instead of one `.loc` call per row.
aligned_predictions = predictions.loc[unique_features.index].to_numpy()

# Write one line per combination: the space-separated feature values followed
# by the prediction. `itertuples` replaces `iterrows`, which builds a Series
# for every row and is much slower on a few hundred thousand rows.
output_path = 'model_catching_with_travel_time_1.txt'
with open(output_path, 'w', encoding='utf-8') as out_file:
    feature_rows = unique_features[feature_columns].itertuples(index=False, name=None)
    for feature_data, prediction in zip(feature_rows, aligned_predictions):
        out_file.write(f"{' '.join(map(str, feature_data))} {prediction}\n")

print("特征组合和预测结果已保存到", output_path)

特征组合和预测结果已保存到 model_catching_with_travel_time_1.txt
