In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# 1. Read Manual Allocation Data
# Loads the simulation output and keeps only the columns used as model inputs/label.

# Directory that holds the simulation output (resolved relative to the working directory)
path = os.path.abspath('./../SUMO_Simulation')
# Full path of the manual allocation data file
data_path = os.path.join(path, 'TraCI_output_adjusted.csv')
print('Data path is:', data_path)

# Choose input features
selected_columns = ['Lanes_Net',    # Number of lanes
                    'Speed_Net',    # Speed limit
                    'E_Length',     # Edge (Road) length
                    'Driving_Num',  # Traffic flow
                    'Travel_Time',  # Normal travel time (speed > 5 m/s)
                    'Delay_Time',   # Wait time caused by congestion (green traffic signal)
                    'LowSpee_Time', # Low speed travel time  (speed < 5 m/s)
                    'Wait_Time'     # Wait time caused by waiting red light
                   ]

# Read csv file.
# `usecols` makes the parser skip every other column, so unused columns are never
# loaded into memory (the recorded run of this notebook loaded ~39.8M rows).
# `usecols` returns columns in file order, so re-index to keep the order defined above.
df = pd.read_csv(data_path, usecols=selected_columns)[selected_columns]

# Sanity check: these time components must not be negative.
# Only a message is printed; offending rows are NOT removed.
for col in ['Delay_Time', 'LowSpee_Time', 'Wait_Time']:
    if (df[col] < 0).any():
        print(f"Error: {col} contains values less than 0")

print("Data Shape:", df.shape)

Data path is: /data5/zxucj/Traffic_Simulation_Work/Traffic_Simulation_Data_Generation_for_Model_Training/SUMO_Simulation/TraCI_output_adjusted.csv
Data Shape: (39845224, 8)


In [3]:
# 2. Feature Engineering
# NOTE: this cell modifies `df` in place (Travel_Time is redefined from itself and the
# time columns are turned into flags), so it is not idempotent. Re-run the data loading
# cell before running this cell a second time.

# Time columns that are reduced to 0/1 indicators
time_flag_cols = ['Delay_Time', 'LowSpee_Time', 'Wait_Time']

# Redefine travel time as normal travel time + congestion delay + low-speed travel time.
# Values that cannot be parsed as numbers become NaN (errors='coerce').
df['Travel_Time'] = (
    pd.to_numeric(df['Travel_Time'], errors='coerce')
    + pd.to_numeric(df['Delay_Time'], errors='coerce')
    + pd.to_numeric(df['LowSpee_Time'], errors='coerce')
)

# To boolean flag: 0 stays 0, everything else becomes 1.
# A vectorised comparison replaces the element-wise applymap(lambda ...), which is slow
# on large frames and deprecated in recent pandas releases.
for col in time_flag_cols:
    df[col] = (df[col] != 0).astype(int)

# Define travel time on tiny roads to 0.
# A road is "tiny" when length / speed limit is below 1; the mask is computed once,
# on the unrounded values, and reused for every column.
tiny_road = df['E_Length'] / df['Speed_Net'] < 1
df.loc[tiny_road, ['Travel_Time', 'Delay_Time', 'LowSpee_Time']] = 0

# Store the flags as categoricals. The categories are fixed to [0, 1] so the dtype does
# not depend on which values happen to be present, and the zeroing above is done before
# the cast so it can never hit a missing category.
flag_dtype = pd.CategoricalDtype(categories=[0, 1])
for col in time_flag_cols:
    df[col] = df[col].astype(flag_dtype)

# Round to two decimal places
df['Speed_Net'] = df['Speed_Net'].round(2)
df['E_Length'] = df['E_Length'].round(2)

# Add new features to enhance the importance of the length/speed ratio and of the length.
# Both are derived from the rounded columns and stored as integers.
df['Length_Speed_Ratio'] = (df['E_Length'] / df['Speed_Net']).round().astype(int)
df['E_Length_Squared'] = (df['E_Length'] ** 2).round().astype(int)

# Remove outliers: drop rows above the 99th percentile of travel time.
# Rows whose Travel_Time is NaN fail the comparison and are dropped as well.
q_high = df['Travel_Time'].quantile(0.99)
df = df[df['Travel_Time'] <= q_high]

df.head()

Unnamed: 0,Lanes_Net,Speed_Net,E_Length,Driving_Num,Travel_Time,Delay_Time,LowSpee_Time,Wait_Time,Length_Speed_Ratio,E_Length_Squared
0,4,27.78,7.36,1,0,0,0,0,0,54
1,4,11.18,16.9,1,3,1,1,0,2,286
2,3,11.18,11.76,1,2,1,1,0,1,138
3,1,11.18,14.52,1,3,1,1,0,1,211
4,4,11.18,10.62,1,0,0,0,0,1,113


In [4]:
# 3. Balance samples
# Since the number of samples with low traffic flow exceeds those with high traffic flow,
# this module restricts the maximum number of samples for each traffic flow category.

# Maximum number of samples kept per traffic flow value
desired_samples = 20000

# For each traffic flow category, retain all samples if their count is below the maximum
# threshold; otherwise, randomly select samples up to the maximum threshold.
# groupby walks the frame once instead of building one boolean mask per category, and
# sort=False keeps the categories in order of first appearance.
balanced_parts = [
    group.sample(n=desired_samples, random_state=42) if len(group) > desired_samples else group
    for _, group in df.groupby('Driving_Num', sort=False)
]

# Concatenate once (growing a DataFrame inside the loop re-copies all rows each time)
# and renumber the index. With no groups at all, keep an empty frame with the same columns.
if balanced_parts:
    df_filter = pd.concat(balanced_parts, ignore_index=True)
else:
    df_filter = df.iloc[0:0].copy()

# Printed label reads "data shape"
print("数据形状:", df_filter.shape)

数据形状: (253754, 10)


In [5]:
# 4. Split data into training and testing
# 80% of the balanced samples go to training, 20% are held out; the seed keeps the
# split reproducible.

split_frames = train_test_split(df_filter, random_state=42, test_size=0.2)
train_data, test_data = split_frames

for caption, frame in (("Shape of training:", train_data), ("Shape of testing:", test_data)):
    print(caption, frame.shape)

Shape of training: (203003, 10)
Shape of testing: (50751, 10)


In [6]:
# 5. Model training
# Fit an AutoGluon tabular regressor that predicts Travel_Time and is scored by
# mean absolute error. No output path or preset is given, so AutoGluon's defaults apply.

predictor = TabularPredictor(
    label='Travel_Time',
    problem_type='regression',
    eval_metric='mean_absolute_error',
)
predictor = predictor.fit(train_data)

# Report the problem type and the feature types AutoGluon works with
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)

No path specified. Models will be saved in: "AutogluonModels/ag-20241207_012554"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #134-Ubuntu SMP Fri Sep 27 20:20:17 UTC 2024
CPU Count:          128
Memory Avail:       1487.54 GB / 1511.51 GB (98.4%)
Disk Space Avail:   12015.81 GB / 13484.30 GB (89.1%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.

[1000]	valid_set's l1: 2.4064
[2000]	valid_set's l1: 2.10853
[3000]	valid_set's l1: 1.92785
[4000]	valid_set's l1: 1.81294
[5000]	valid_set's l1: 1.73316
[6000]	valid_set's l1: 1.66123
[7000]	valid_set's l1: 1.61056
[8000]	valid_set's l1: 1.56515
[9000]	valid_set's l1: 1.5266
[10000]	valid_set's l1: 1.49196


	-1.492	 = Validation score   (-mean_absolute_error)
	41.91s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 1.24934
[2000]	valid_set's l1: 1.16049
[3000]	valid_set's l1: 1.13693
[4000]	valid_set's l1: 1.1239
[5000]	valid_set's l1: 1.11194
[6000]	valid_set's l1: 1.10411
[7000]	valid_set's l1: 1.09938
[8000]	valid_set's l1: 1.09467
[9000]	valid_set's l1: 1.09193
[10000]	valid_set's l1: 1.09004


	-1.09	 = Validation score   (-mean_absolute_error)
	32.53s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-0.9318	 = Validation score   (-mean_absolute_error)
	2.18s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: CatBoost ...
	-1.1601	 = Validation score   (-mean_absolute_error)
	168.94s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.9469	 = Validation score   (-mean_absolute_error)
	2.45s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-3.0394	 = Validation score   (-mean_absolute_error)
	180.67s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	-1.143	 = Validation score   (-mean_absolute_error)
	22.32s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-1.4115	 = Validation score   (-mean_absolute_error)
	928.88s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: Ligh

[1000]	valid_set's l1: 1.11791
[2000]	valid_set's l1: 1.08005
[3000]	valid_set's l1: 1.07079
[4000]	valid_set's l1: 1.06858
[5000]	valid_set's l1: 1.06716
[6000]	valid_set's l1: 1.06674
[7000]	valid_set's l1: 1.06664
[8000]	valid_set's l1: 1.06654


	-1.066	 = Validation score   (-mean_absolute_error)
	89.13s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'RandomForestMSE': 0.96, 'ExtraTreesMSE': 0.04}
	-0.9316	 = Validation score   (-mean_absolute_error)
	0.06s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1473.15s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 7773.4 rows/s (2500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20241207_012554")


AutoGluon infers problem type is:  regression
AutoGluon identified the following types of features:
('float', [])     : 2 | ['Speed_Net', 'E_Length']
('int', [])       : 4 | ['Lanes_Net', 'Driving_Num', 'Length_Speed_Ratio', 'E_Length_Squared']
('int', ['bool']) : 3 | ['Delay_Time', 'LowSpee_Time', 'Wait_Time']


In [9]:
# 6. Prediction results on testing data
# Score the fitted predictor on the held-out split; the returned metric dictionary is
# shown as the cell output.

predictor.evaluate(data=test_data)

{'mean_absolute_error': -0.9955793761091155,
 'root_mean_squared_error': -2.0892561740404547,
 'mean_squared_error': -4.364991360766159,
 'r2': 0.9748989075984149,
 'pearsonr': 0.9873757619613595,
 'median_absolute_error': -0.49695301055908203}

In [10]:
# 7. Check overfitting
# Compare the errors on the data the model was fitted on with the errors on the
# held-out split; a training error far below the test error points to overfitting.

label_col = 'Travel_Time'

# Predicted results on training and testing data
train_predictions = predictor.predict(train_data)
test_predictions = predictor.predict(test_data)

# Mean absolute error per split
train_mae = mean_absolute_error(train_data[label_col], train_predictions)
test_mae = mean_absolute_error(test_data[label_col], test_predictions)

# Mean squared error per split
train_mse = mean_squared_error(train_data[label_col], train_predictions)
test_mse = mean_squared_error(test_data[label_col], test_predictions)

print(f'Training MAE: {train_mae}, Test MAE: {test_mae}')
print(f'Training MSE: {train_mse}, Test MSE: {test_mse}')

Training MAE: 0.8515853539943992, Test MAE: 0.9955793761091155
Training MSE: 3.105289659827365, Test MSE: 4.364991360766159
