In [162]:
import sys

In [163]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from scipy.stats import shapiro
from sklearn.preprocessing import QuantileTransformer, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

 
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler, OneHotEncoder  
from sklearn.compose import ColumnTransformer  
from sklearn.pipeline import Pipeline  
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  
from sklearn.svm import SVC  
from sklearn.metrics import accuracy_score, classification_report  
import xgboost as xgb  

In [164]:
# Display settings
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")
input_train_filepath = sys.argv[1]
traffic_df = pd.read_csv("..\Traffic_analysis\Traffic.csv")
traffic_two_month_df = pd.read_csv("..\Traffic_analysis\TrafficTwoMonth.csv")
# Combine datasets
traffic_df['Source'] = 'OneMonth'
traffic_two_month_df['Source'] = 'TwoMonth'
combined_df = pd.concat([traffic_df, traffic_two_month_df], ignore_index=True)

#查看异常数据

### 删除异常数据
这个函数使用四分位数法（IQR）来确定异常值，并从数据集中删除它们。该方法对数值列特别有效。

In [165]:
combined_df['Traffic Situation'] = combined_df['Traffic Situation'].astype('category').cat.codes
combined_df['Hour'] = pd.to_datetime(combined_df['Time'], format='%I:%M:%S %p').dt.hour
combined_df['Weekend'] = combined_df['Day of the week'].isin(['Saturday', 'Sunday'])
# Identify and remove outliers
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

vehicle_counts = ['CarCount', 'BikeCount', 'BusCount', 'TruckCount']
combined_df = remove_outliers(combined_df, vehicle_counts)


In [166]:
# Normalize data using QuantileTransformer
scaler = QuantileTransformer(output_distribution='normal')
combined_df[['CarCount', 'BikeCount', 'BusCount', 'TruckCount']] = scaler.fit_transform(combined_df[['CarCount', 'BikeCount', 'BusCount', 'TruckCount']])


# Prepare the features and target
X = combined_df.drop(columns=['Traffic Situation'])
y = combined_df['Traffic Situation']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [173]:




# Preprocessing pipeline for numeric and categorical features
numeric_features = ['CarCount', 'BikeCount', 'BusCount', 'TruckCount', 'Total', 'Hour']
categorical_features = ['Time', 'Date', 'Day of the week', 'Source', 'Weekend']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# # 创建预处理管道  
# preprocessor = ColumnTransformer(  
#     transformers=[  
#         ('num', StandardScaler(), numeric_features),  
#         ('cat', OneHotEncoder(), categorical_features)  
#     ])  

# 定义各个模型管道  
rf_model = Pipeline(steps=[  
    ('preprocessor', preprocessor),  
    ('classifier', RandomForestClassifier(random_state=42))  
])  

xgb_model = Pipeline(steps=[  
    ('preprocessor', preprocessor),  
    ('classifier', xgb.XGBClassifier(random_state=42))  
])  

svm_model = Pipeline(steps=[  
    ('preprocessor', preprocessor),  
    ('classifier', SVC(random_state=42,probability=True))  
])  

gb_model = Pipeline(steps=[  
    ('preprocessor', preprocessor),  
    ('classifier', GradientBoostingClassifier(random_state=42))  
])  

# 训练随机森林模型  
rf_model.fit(X_train, y_train)  

# 训练XGBoost模型  
xgb_model.fit(X_train, y_train)  

# 训练支持向量机模型  
svm_model.fit(X_train, y_train)  

# 训练梯度提升模型  
gb_model.fit(X_train, y_train)  

# 对测试集进行预测  
rf_y_pred = rf_model.predict(X_test)  
xgb_y_pred = xgb_model.predict(X_test)  
svm_y_pred = svm_model.predict(X_test)  
gb_y_pred = gb_model.predict(X_test)  

# 评估模型表现  
print("Random Forest Model Accuracy:", accuracy_score(y_test, rf_y_pred))  
print("Random Forest Classification Report:")  
print(classification_report(y_test, rf_y_pred))  

print("XGBoost Model Accuracy:", accuracy_score(y_test, xgb_y_pred))  
print("XGBoost Classification Report:")  
print(classification_report(y_test, xgb_y_pred))  

print("Support Vector Machine Model Accuracy:", accuracy_score(y_test, svm_y_pred))  
print("Support Vector Machine Classification Report:")  
print(classification_report(y_test, svm_y_pred))  

print("Gradient Boosting Model Accuracy:", accuracy_score(y_test, gb_y_pred))  
print("Gradient Boosting Classification Report:")  
print(classification_report(y_test, gb_y_pred))

Random Forest Model Accuracy: 0.9902522935779816
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       333
           1       0.98      1.00      0.99       122
           2       0.98      0.97      0.97       238
           3       0.99      0.99      0.99      1051

    accuracy                           0.99      1744
   macro avg       0.99      0.99      0.99      1744
weighted avg       0.99      0.99      0.99      1744

XGBoost Model Accuracy: 0.9988532110091743
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       122
           2       1.00      0.99      1.00       238
           3       1.00      1.00      1.00      1051

    accuracy                           1.00      1744
   macro avg       1.00      1.00      1.00      1744
weighted avg       1.00  

In [174]:
from sklearn.metrics import accuracy_score,classification_report,f1_score,precision_score,recall_score

results = {  
    'Model': ['Random Forest', 'XGBoost', 'Support Vector Machine', 'Gradient Boosting'],  
    'Accuracy': [  
        accuracy_score(y_test, rf_y_pred),  
        accuracy_score(y_test, xgb_y_pred),  
        accuracy_score(y_test, svm_y_pred),  
        accuracy_score(y_test, gb_y_pred)  
    ],  
    'Precision': [  
        precision_score(y_test, rf_y_pred, average='weighted'),  
        precision_score(y_test, xgb_y_pred, average='weighted'),  
        precision_score(y_test, svm_y_pred, average='weighted'),  
        precision_score(y_test, gb_y_pred, average='weighted')  
    ],  
    'Recall': [  
        recall_score(y_test, rf_y_pred, average='weighted'),  
        recall_score(y_test, xgb_y_pred, average='weighted'),  
        recall_score(y_test, svm_y_pred, average='weighted'),  
        recall_score(y_test, gb_y_pred, average='weighted')  
    ],  
    'F1 Score': [  
        f1_score(y_test, rf_y_pred, average='weighted'),  
        f1_score(y_test, xgb_y_pred, average='weighted'),  
        f1_score(y_test, svm_y_pred, average='weighted'),  
        f1_score(y_test, gb_y_pred, average='weighted')  
    ],  
}  

# 将结果转化为DataFrame  
results_df = pd.DataFrame(results)  

# 打印模型比较结果  
print(results_df)  

# 输出详细分类报告（可选）  
print("\nClassification Reports:")  
print("Random Forest Classification Report:\n", classification_report(y_test, rf_y_pred))  
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_y_pred))  
print("Support Vector Machine Classification Report:\n", classification_report(y_test, svm_y_pred))  
print("Gradient Boosting Classification Report:\n", classification_report(y_test, gb_y_pred))  

                    Model  Accuracy  Precision    Recall  F1 Score
0           Random Forest  0.990252   0.990257  0.990252  0.990233
1                 XGBoost  0.998853   0.998855  0.998853  0.998851
2  Support Vector Machine  0.917431   0.916876  0.917431  0.915059
3       Gradient Boosting  0.998853   0.998855  0.998853  0.998851

Classification Reports:
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       333
           1       0.98      1.00      0.99       122
           2       0.98      0.97      0.97       238
           3       0.99      0.99      0.99      1051

    accuracy                           0.99      1744
   macro avg       0.99      0.99      0.99      1744
weighted avg       0.99      0.99      0.99      1744

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00   

In [175]:

# for i in range(n_classes):  
#     fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], rf_y_prob[:, i])  
#     roc_auc[i] = auc(fpr[i], tpr[i])  

# # 绘制 ROC 曲线  
# plt.figure(figsize=(10, 8))  

# for i in range(n_classes):  
#     plt.plot(fpr[i], tpr[i], lw=2, label='Class {0} (AUC = {1:0.2f})'.format(classes[i], roc_auc[i]))  

# # 绘制对角线  
# plt.plot([0, 1], [0, 1], 'k--', lw=2)  

# # 设置图形属性  
# plt.xlim([0.0, 1.0])  
# plt.ylim([0.0, 1.05])  
# plt.xlabel('False Positive Rate')  
# plt.ylabel('True Positive Rate')  
# plt.title('Receiver Operating Characteristic (ROC) Curve')  
# plt.legend(loc="lower right")  
# plt.grid()  

# # 显示图形  
# plt.show()  
