In [None]:
import os
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor

In [4]:
# 1. Translate the string identifiers of the SUMO network into integers so the
#    Simulation Algorithm can consume them. This cell produces:
#
#    - node_to_int: string node ID -> int node ID
#    - edge_to_int: string edge ID -> int edge ID
#    - int_to_edge: int edge ID    -> string edge ID (inverse of edge_to_int)
#
#    and rewrites `df` in place of the string IDs with their int codes.
#    NOTE(review): the original plan also lists storing the mapped IDs in
#    Manhattan_network_mapped.csv; nothing in this cell writes that file.

# Locate and load the raw network table
directory_path = os.path.abspath('./')
path = os.path.join(directory_path, 'Manhattan_network_raw.csv')
df = pd.read_csv(path)
print(len(df))

# Nodes: number every distinct ID seen in either endpoint column, in order of
# first appearance (all start nodes first, then any new end nodes).
unique_nodes = pd.concat([df['node_start'], df['node_end']]).unique()
node_to_int = dict(zip(unique_nodes, range(len(unique_nodes))))
df = df.assign(
    node_start=df['node_start'].map(node_to_int),
    node_end=df['node_end'].map(node_to_int),
)

# Edges: same first-appearance numbering for the edge IDs
edge_ids = df['edge_id'].unique()
edge_to_int = dict(zip(edge_ids, range(len(edge_ids))))
df['edge_id'] = df['edge_id'].map(edge_to_int)

# Inverse lookup so integer edge codes can be turned back into SUMO edge IDs
int_to_edge = dict(zip(edge_to_int.values(), edge_to_int.keys()))

29493


In [8]:
# 2. Feature Engineering
#
# Loads the adjusted TraCI output, keeps the modelling columns, and derives the
# feature frame `df` (raw columns + Length_Speed_Ratio, Log_E_Length,
# E_Length_Squared) that the following cells de-duplicate and feed to the model.

# Define path
path = os.path.abspath('./../../../Realistic_Event_Data_Generation_Procedure/')
data_path = os.path.join(path, 'TraCI_output_adjusted.csv')
print('Data path is:', data_path)

# Read .csv file. Only the columns that are actually used are parsed (the file
# is large), then re-ordered to the canonical feature order.
selected_columns = ['Lanes_Net', 'Speed_Net', 'E_Length', 'Driving_Num', 'Travel_Time', 'Delay_Time', 'LowSpee_Time', 'Wait_Time']
dataFrame = pd.read_csv(data_path, usecols=selected_columns)[selected_columns]

# Columns that are reduced to 0/1 indicators further down
binary_flag_columns = ['Delay_Time', 'LowSpee_Time', 'Wait_Time']

# Check if 'Delay_Time', 'LowSpee_Time', 'Wait_Time' have values less than 0.
# This only reports the problem; processing continues either way.
for col in binary_flag_columns:
    if (dataFrame[col] < 0).any():
        print(f"Error: {col} contains values less than 0")

print("Datashape:", dataFrame.shape)

df = dataFrame.copy()

# Travel_Time should be the sum of traffic jam, low speed, and normal driving.
# Values that cannot be parsed as numbers become NaN (errors='coerce').
df['Travel_Time'] = (
    pd.to_numeric(df['Travel_Time'], errors='coerce')
    + pd.to_numeric(df['Delay_Time'], errors='coerce')
    + pd.to_numeric(df['LowSpee_Time'], errors='coerce')
)

# Convert 'Delay_Time', 'LowSpee_Time', 'Wait_Time' to 0 and 1: exactly 0 stays
# 0, anything else (including NaN) becomes 1. Vectorised replacement for the
# deprecated element-wise DataFrame.applymap.
df[binary_flag_columns] = df[binary_flag_columns].ne(0).astype('int64')

# Length to speed-limit ratio, computed once and reused below
length_speed_ratio = df['E_Length'] / df['Speed_Net']

# The Travel_Time of the small road segment (ratio < 1) is updated to 0,
# together with its Delay_Time and LowSpee_Time flags. Wait_Time is left as is.
short_segment = length_speed_ratio < 1
df.loc[short_segment, ['Travel_Time', 'Delay_Time', 'LowSpee_Time']] = 0

# Convert the indicator features to category type
for col in binary_flag_columns:
    df[col] = df[col].astype('category')

# Interaction term between length and speed limit (Length to Speed ratio)
df['Length_Speed_Ratio'] = length_speed_ratio

# Log-transformed and squared variants of the edge length
df['Log_E_Length'] = np.log1p(df['E_Length'])
df['E_Length_Squared'] = df['E_Length'] ** 2

# Round first, then convert to integer.
# NOTE(review): astype(int) cannot represent NaN/inf, so this presumably relies
# on Speed_Net being non-zero and these columns having no missing values.
cols_to_convert = ['E_Length', 'Length_Speed_Ratio', 'Log_E_Length', 'E_Length_Squared', 'Speed_Net', 'Driving_Num']
df[cols_to_convert] = df[cols_to_convert].round().astype(int)

print(df.dtypes)

Data path is: /data/zxucj/Traffic_Simulation_Work/Traffic_Simulation_Data_Generation_for_Baselines/0/TraCI_output_adjusted.csv
Datashape: (42911018, 8)


In [21]:
# Reduce df to its distinct feature combinations. Rows are compared on every
# model input column listed below; the first occurrence of each combination is
# kept and retains its original index label.
feature_key_columns = [
    'Lanes_Net', 'Speed_Net', 'E_Length', 'Driving_Num',
    'Travel_Time', 'Delay_Time', 'LowSpee_Time', 'Wait_Time',
    'Length_Speed_Ratio', 'Log_E_Length', 'E_Length_Squared',
]
unique_features = df.drop_duplicates(subset=feature_key_columns, keep='first')
print(len(unique_features))

447974


In [14]:
# Sanity check: de-duplication must not have changed the schema, i.e.
# unique_features and df should have the same columns (names and order) and
# the same dtype per column.

# Check that column names are consistent
columns_match = list(unique_features.columns) == list(df.columns)
if columns_match:
    print("Exact column name match")
else:
    print("Column name mismatch")
    print("unique_features column:", list(unique_features.columns))
    print("df columns:", list(df.columns))

# Check that data types are consistent.
# Comparing two dtype Series with `==` raises ValueError unless both carry
# identical labels, so the element-wise comparison is only attempted when the
# column names already match; otherwise the dtypes are reported as mismatched.
dtypes_match = columns_match and bool((unique_features.dtypes == df.dtypes).all())
if dtypes_match:
    print("Data type exactly matches")
else:
    print("Data type mismatch")
    print("unique_features data types:\n", unique_features.dtypes)
    print("df data types:\n", df.dtypes)

    print(unique_features.columns)

Exact column name match
Data type exactly matches


In [25]:
# Load the trained AutoGluon model, predict once per unique feature
# combination, and dump "<features...> <prediction>" lines to a text file.

# Load Model
model_directory_path = os.path.abspath('./../../../Model_Training_on_Manual_Allocation_Data/Model_Training/')
model_path = os.path.join(model_directory_path, 'AutogluonModels/ag-20241207_012554')
predictor = TabularPredictor.load(model_path, require_py_version_match=False)

# Make predictions for unique feature combinations
predictions = predictor.predict(unique_features)

# Save the feature combination and prediction results to a txt file.
# Each line holds the 11 feature values in the order below, separated by single
# spaces, followed by the prediction for that row.
output_path = 'model_catching_with_travel_time_1.txt'
feature_columns = [
    'Lanes_Net', 'Speed_Net', 'E_Length', 'Driving_Num',
    'Travel_Time', 'Delay_Time', 'LowSpee_Time', 'Wait_Time',
    'Length_Speed_Ratio', 'Log_E_Length', 'E_Length_Squared',
]

# Materialise the rows and their predictions as arrays once instead of building
# a Series per row with iterrows() and doing a label lookup per row. The
# predictions are selected by index label so each row is paired with the
# prediction carrying the same label, as in the per-row lookup. This also avoids
# binding `_`, which IPython uses for the last cell output.
feature_rows = unique_features[feature_columns].to_numpy()
aligned_predictions = predictions.loc[unique_features.index].to_numpy()

with open(output_path, 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        f"{' '.join(map(str, feature_row))} {prediction}\n"
        for feature_row, prediction in zip(feature_rows, aligned_predictions)
    )

print("Saved in file: ", output_path)

Saved in file:  model_catching_with_travel_time_1.txt
