In [41]:
# Import necessary libraries
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from scipy.stats import shapiro
from sklearn.preprocessing import QuantileTransformer, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 

 
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler, OneHotEncoder  
from sklearn.compose import ColumnTransformer  
from sklearn.pipeline import Pipeline  
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  
from sklearn.svm import SVC  
from sklearn.metrics import accuracy_score, classification_report  
import xgboost as xgb
from joblib import dump


# Identify and remove outliers
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df




        
#preprocess_traffic_data(combined_df)

In [42]:
def preprocess_traffic_data(combined_df):  
    """  
    对交通数据进行预处理。  
    
    :param combined_df: 包含交通数据的 DataFrame  
    :param vehicle_counts: 需要标准化的车辆计数列名列表  
    :return: 经过预处理的 DataFrame  
    """  
    vehicle_counts = ['CarCount', 'BikeCount', 'BusCount', 'TruckCount']
    # 1. 将 'Traffic Situation' 列转换为分类变量并提取编码  
    combined_df['Traffic Situation'] = combined_df['Traffic Situation'].astype('category').cat.codes  
    
    # 2. 从 'Time' 列提取小时信息  
    combined_df['Hour'] = pd.to_datetime(combined_df['Time'], format='%I:%M:%S %p').dt.hour  
    
    # 3. 创建一个布尔列 'Weekend'，判断是否为周末  
    combined_df['Weekend'] = combined_df['Day of the week'].isin(['Saturday', 'Sunday'])  
    
    # 4. 移除异常值  
    combined_df = remove_outliers(combined_df, vehicle_counts)  
    
    # 5. 使用 QuantileTransformer 进行标准化  
    scaler = QuantileTransformer(output_distribution='normal')  
    combined_df[vehicle_counts] = scaler.fit_transform(combined_df[vehicle_counts])  
    
    return combined_df  


# model_type = sys.argv[3] # 新增model_type变量
# model_save_path = sys.argv[2]
# input_train_filepath = sys.argv[1]

# combined_df = pd.read_csv("..\Traffic_analysis\Traffic.csv")
# # 使用示例：  
# combined_df = preprocess_traffic_data(combined_df) 





In [43]:
traffic_df = pd.read_csv("..\Traffic_analysis\TrafficTwoMonth.csv")
# 使用示例：  


traffic_two_month_df = pd.read_csv("..\Traffic_analysis\TrafficTwoMonth.csv")

In [44]:
combined_df = pd.concat([traffic_df, traffic_two_month_df], ignore_index=True)

Traffic Situation   
3    7200   -normal   
0    1970   -heavy   
2    1668   -low    
1     726   -high    
Name: count, dtype: int64

In [45]:
combined_df = preprocess_traffic_data(combined_df)

In [46]:
# 显示数据的前几行（可选）  
print("Original Data:")  


# 分割数据集为训练集和测试集，80% 作为训练集，20% 作为测试集  
train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42)  

# 显示分割后的数据集大小（可选）  
print(f"\nTraining Set Size: {train_df.shape[0]}")  
print(f"Test Set Size: {test_df.shape[0]}")  

# # 保存数据集为 CSV 文件（可选）  
# train_df.to_csv('traffic_data_train.csv', index=False)  
# test_df.to_csv('traffic_data_test.csv', index=False)  

# print("\nTraining and testing datasets have been saved.")  

Original Data:

Training Set Size: 9251
Test Set Size: 2313


In [47]:
def train_traffic_data(combined_df):  
  
    # Prepare the features and target
    X = combined_df.drop(columns=['Traffic Situation'])
    y = combined_df['Traffic Situation']
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Preprocessing pipeline for numeric and categorical features
    numeric_features = ['CarCount', 'BikeCount', 'BusCount', 'TruckCount', 'Total', 'Hour']
    categorical_features = ['Time', 'Date', 'Day of the week', 'Weekend']
    report = None
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)
        ])

    # 定义各个模型管道  
    rf_model = Pipeline(steps=[  
        ('preprocessor', preprocessor),  
        ('classifier', RandomForestClassifier(random_state=42))  
    ])  

    xgb_model = Pipeline(steps=[  
        ('preprocessor', preprocessor),  
        ('classifier', xgb.XGBClassifier(random_state=42))  
    ])  

    svm_model = Pipeline(steps=[  
        ('preprocessor', preprocessor),  
        ('classifier', SVC(random_state=42,probability=True))  
    ])  

    gb_model = Pipeline(steps=[  
        ('preprocessor', preprocessor),  
        ('classifier', GradientBoostingClassifier(random_state=42))  
    ])  

       # 添加决策树模型  
    dt_model = Pipeline(steps=[  
        ('preprocessor', preprocessor),  
        ('classifier', DecisionTreeClassifier(random_state=42))  
    ]) 
        # 训练随机森林模型  
    dt_model.fit(X_train, y_train)
        # 评估模型表现  
    df_y_pred = dt_model.predict(X_test)
    print("****************************************************************")
    print("Random Forest Model Accuracy:", accuracy_score(y_test, df_y_pred))  
    print("Random Forest Classification Report:")  
    print(classification_report(y_test, df_y_pred))  
    print("****************************************************************")
    report = classification_report(y_test, df_y_pred, output_dict=True)

In [48]:


train_traffic_data(train_df)

****************************************************************
Random Forest Model Accuracy: 0.9983792544570502
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       0.97      1.00      0.99       107
           2       1.00      1.00      1.00       270
           3       1.00      1.00      1.00      1141

    accuracy                           1.00      1851
   macro avg       0.99      1.00      1.00      1851
weighted avg       1.00      1.00      1.00      1851

****************************************************************
