In [None]:
# notebooks/03_model_building.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
import joblib
import os

In [None]:
# Load processed data
# Absolute locations of the pre-processed train and test CSV splits.
train_path, test_path = (
    "C:/Users/Anupam/Desktop/traffic-prediction-project/data_processed/train_data.csv",
    "C:/Users/Anupam/Desktop/traffic-prediction-project/data_processed/test_data.csv",
)

In [None]:
print("Loading processed data...")
# Read both CSVs in one expression; the order of the paths fixes which frame
# is bound to train_df and which to test_df.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Split each frame into its feature matrix (every column except 'Severity')
# and its target vector (the 'Severity' column).
X_train, y_train = train_df.drop(columns='Severity'), train_df['Severity']
X_test, y_test = test_df.drop(columns='Severity'), test_df['Severity']

In [None]:
# Report the (rows, columns) shape of each feature matrix.
for shape_label, feature_frame in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(shape_label, feature_frame.shape)

In [None]:
# List the feature names used for training, then how many there are
# (the second axis of the feature matrix).
print("Training columns:", X_train.columns.tolist())
print("Number of features:", X_train.shape[1])


In [None]:
# =============== RANDOM FOREST MODEL ===============
print("\nTraining Random Forest model...")
# fit() returns the fitted estimator, so construction and training are chained.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
# Predictions on the held-out split, in the same label space as y_train here.
y_pred_rf = rf.predict(X_test)

In [None]:
print("\nRandom Forest Performance:")
# Per-class precision / recall / F1 table for the held-out split.
rf_report_text = classification_report(y_test, y_pred_rf)
print(rf_report_text)
# Overall accuracy, rounded to four decimals for display only.
rf_holdout_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", round(rf_holdout_accuracy, 4))

In [None]:
# Fix label encoding (shift all labels to start from 0)
# XGBClassifier expects class labels encoded as 0..n_classes-1, so the
# Severity labels are shifted down. The offset is taken from the TRAINING
# labels and applied to both splits: subtracting each split's own minimum
# would put train and test in different label spaces whenever the lowest
# severity class is absent from the test split.
# Re-running this cell is a no-op because the offset is then 0.
label_offset = y_train.min()
y_train = y_train - label_offset
y_test = y_test - label_offset

# NOTE: from here on y_train / y_test are zero-based, whereas rf and y_pred_rf
# (fitted and predicted in the Random Forest cell) still use the original
# Severity values.

# Re-run the model
print("\nTraining XGBoost model...")
xgb = XGBClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)
xgb.fit(X_train, y_train)
# Predictions are in the zero-based label space the model was trained on.
y_pred_xgb = xgb.predict(X_test)

In [None]:
print("\nXGBoost Performance:")
# Per-class precision / recall / F1 table for the held-out split.
xgb_report_text = classification_report(y_test, y_pred_xgb)
print(xgb_report_text)
# Overall accuracy, rounded to four decimals for display only.
xgb_holdout_accuracy = accuracy_score(y_test, y_pred_xgb)
print("Accuracy:", round(xgb_holdout_accuracy, 4))

In [None]:
# =============== CONFUSION MATRIX ===============
def plot_confusion_matrix(y_true, y_pred, title):
    """Draw an annotated confusion-matrix heatmap for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels; must be in the same label space as ``y_true``.
    title : str
        Model name appended to the figure title.
    """
    # Sorted union of the labels that occur in either vector. Passing it
    # explicitly keeps the row/column order fixed and lets the heatmap ticks
    # show the real class values instead of positional indices 0..n-1.
    class_labels = np.union1d(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title(f"Confusion Matrix - {title}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()


# rf was fitted on the original Severity values, but y_test has been shifted
# to start at 0 for XGBoost by the time this cell runs. Score the Random
# Forest against the untouched labels in test_df so truth and predictions
# share one label space.
plot_confusion_matrix(test_df['Severity'], y_pred_rf, "Random Forest")
plot_confusion_matrix(y_test, y_pred_xgb, "XGBoost")

In [None]:

# =============== CROSS-VALIDATION ===============
print("\nPerforming cross-validation (5-fold)...")
# Score each estimator with 5-fold CV on the training split; no `scoring`
# argument is passed, so each estimator's default scorer is used.
cv_rf, cv_xgb = (
    cross_val_score(estimator, X_train, y_train, cv=5) for estimator in (rf, xgb)
)

# Mean of the five fold scores per model.
print(f"RandomForest CV Accuracy: {cv_rf.mean():.4f}")
print(f"XGBoost CV Accuracy: {cv_xgb.mean():.4f}")

In [None]:
# =============== SAVE BEST MODEL ===============
# y_pred_rf is in the original Severity label space (rf was fitted before the
# labels were shifted), while y_test and y_pred_xgb are zero-based. Each model
# is therefore scored against ground truth in its own label space; comparing
# y_pred_rf with the shifted y_test would misreport Random Forest accuracy
# and bias the selection toward XGBoost.
rf_test_accuracy = accuracy_score(test_df['Severity'], y_pred_rf)
xgb_test_accuracy = accuracy_score(y_test, y_pred_xgb)
# Ties go to the Random Forest.
best_model = rf if rf_test_accuracy >= xgb_test_accuracy else xgb

# NOTE(review): when xgb is selected, the saved model predicts zero-based
# labels; consumers must add the label offset back to recover Severity values.
os.makedirs("C:/Users/Anupam/Desktop/traffic-prediction-project/models", exist_ok=True)
joblib.dump(best_model, "C:/Users/Anupam/Desktop/traffic-prediction-project/models/accident_severity_model.pkl")

print("\n✅ Model training complete. Best model saved to 'models/accident_severity_model.pkl'.")

In [None]:
# ======= Baseline & Proposed models training & save =======
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [None]:
# Reload the processed train/test splits from disk. This runs unconditionally,
# replacing any train_df / test_df already held in memory.
train_path, test_path = (
    "C:/Users/Anupam/Desktop/traffic-prediction-project/data_processed/train_data.csv",
    "C:/Users/Anupam/Desktop/traffic-prediction-project/data_processed/test_data.csv",
)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Rebuild features and targets straight from the freshly loaded frames, so
# y_train / y_test hold the 'Severity' values exactly as read from the CSVs.
X_train, y_train = train_df.drop(columns='Severity'), train_df['Severity']
X_test, y_test = test_df.drop(columns='Severity'), test_df['Severity']

In [None]:

# Make sure the output folders for pickled models and result artefacts exist;
# exist_ok=True makes this safe to re-run.
for output_dir in (
    "C:/Users/Anupam/Desktop/traffic-prediction-project/models",
    "C:/Users/Anupam/Desktop/traffic-prediction-project/results",
):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Fresh containers: `models` maps a short key to a fitted estimator, and
# `metrics` collects one score dict per evaluated model.
models, metrics = dict(), list()

In [None]:
# 1) Logistic Regression (baseline)
# fit() returns the fitted estimator, so construction and training are chained.
lr = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train, y_train)
models['logreg'] = lr
y_pred = lr.predict(X_test)

# Accuracy plus weighted-average precision / recall / F1 on the test split;
# zero_division=0 scores a class with no predictions as 0 instead of warning.
metrics.append({
    'model': 'LogisticRegression',
    'accuracy': accuracy_score(y_test, y_pred),
    **{
        metric_name: metric_fn(y_test, y_pred, average='weighted', zero_division=0)
        for metric_name, metric_fn in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
})

In [None]:
# save cm
# Render the confusion matrix of the most recent predictions (y_pred) as an
# image with a colour bar, write it to disk, and close the figure so it is
# not displayed inline or kept in memory.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("Confusion Matrix - LogisticRegression")
cm_image = ax.imshow(cm, interpolation='nearest')
fig.colorbar(cm_image, ax=ax)
fig.savefig("C:/Users/Anupam/Desktop/traffic-prediction-project/results/cm_logreg.png")
plt.close(fig)

In [None]:
# 2) Random Forest
# fit() returns the fitted estimator, so construction and training are chained.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1).fit(X_train, y_train)
models['rf'] = rf
y_pred = rf.predict(X_test)

# Accuracy plus weighted-average precision / recall / F1 on the test split.
metrics.append({
    'model': 'RandomForest',
    'accuracy': accuracy_score(y_test, y_pred),
    **{
        metric_name: metric_fn(y_test, y_pred, average='weighted', zero_division=0)
        for metric_name, metric_fn in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
})

# Save the confusion matrix as an image and close the figure afterwards.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("Confusion Matrix - RandomForest")
cm_image = ax.imshow(cm, interpolation='nearest')
fig.colorbar(cm_image, ax=ax)
fig.savefig("C:/Users/Anupam/Desktop/traffic-prediction-project/results/cm_rf.png")
plt.close(fig)

In [None]:
# Ensure target labels start from 0 instead of 1
# The offset is taken from the TRAINING labels and applied to both splits.
# Subtracting each split's own minimum would put train and test in different
# label spaces whenever the lowest severity class is absent from the test
# split. Re-running this cell is a no-op because the offset is then 0.
# NOTE: lr and rf above were fitted on the unshifted labels; only the models
# trained after this cell use the zero-based encoding.
label_offset = y_train.min()
y_train = y_train - label_offset
y_test = y_test - label_offset

In [None]:
# 3) XGBoost
xgb = XGBClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=6,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb.fit(X_train, y_train)
models['xgb'] = xgb
y_pred = xgb.predict(X_test)

# Accuracy plus weighted-average precision / recall / F1 on the test split.
metrics.append({
    'model': 'XGBoost',
    'accuracy': accuracy_score(y_test, y_pred),
    **{
        metric_name: metric_fn(y_test, y_pred, average='weighted', zero_division=0)
        for metric_name, metric_fn in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
})

# Save the confusion matrix as an image and close the figure afterwards.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("Confusion Matrix - XGBoost")
cm_image = ax.imshow(cm, interpolation='nearest')
fig.colorbar(cm_image, ax=ax)
fig.savefig("C:/Users/Anupam/Desktop/traffic-prediction-project/results/cm_xgb.png")
plt.close(fig)

In [None]:
# 4) Proposed model: example -> XGBoost + SelectKBest (feature selection)
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 12 features ranked highest by f_classif. The selector is fitted on
# the training split only and then re-used, unchanged, on the test split.
selector = SelectKBest(score_func=f_classif, k=12)  # choose k based on experiments
X_train_sel = selector.fit_transform(X_train, y_train)
X_test_sel = selector.transform(X_test)

xgb2 = XGBClassifier(
    n_estimators=200,
    learning_rate=0.08,
    max_depth=5,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb2.fit(X_train_sel, y_train)
# Stored as a (model, selector) tuple: the selector must be applied to any
# input before the model can predict on it.
models['proposed'] = (xgb2, selector)
y_pred = xgb2.predict(X_test_sel)

# Accuracy plus weighted-average precision / recall / F1 on the test split.
metrics.append({
    'model': 'Proposed_XGB_SelectK',
    'accuracy': accuracy_score(y_test, y_pred),
    **{
        metric_name: metric_fn(y_test, y_pred, average='weighted', zero_division=0)
        for metric_name, metric_fn in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
})

# Save the confusion matrix as an image and close the figure afterwards.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("Confusion Matrix - Proposed")
cm_image = ax.imshow(cm, interpolation='nearest')
fig.colorbar(cm_image, ax=ax)
fig.savefig("C:/Users/Anupam/Desktop/traffic-prediction-project/results/cm_proposed.png")
plt.close(fig)

In [None]:
# Save models
# Each fitted estimator is pickled to its own file, in the order listed. The
# proposed model is saved as a dict with keys 'selector' and 'model', so the
# fitted feature selector travels with the classifier that depends on it.
for fitted_artifact, pickle_path in (
    (lr, "C:/Users/Anupam/Desktop/traffic-prediction-project/models/logreg.pkl"),
    (rf, "C:/Users/Anupam/Desktop/traffic-prediction-project/models/rf.pkl"),
    (xgb, "C:/Users/Anupam/Desktop/traffic-prediction-project/models/xgb.pkl"),
    (
        {'selector': selector, 'model': xgb2},
        "C:/Users/Anupam/Desktop/traffic-prediction-project/models/proposed_model.pkl",
    ),
):
    joblib.dump(fitted_artifact, pickle_path)


In [None]:
# Save metrics to CSV
# One row per evaluated model, in the order the dicts were appended to
# `metrics`; the row index is left out of the file.
metrics_df = pd.DataFrame(data=metrics)
metrics_df.to_csv(
    path_or_buf="C:/Users/Anupam/Desktop/traffic-prediction-project/results/model_metrics.csv",
    index=False,
)
print("Saved models and metrics.")