In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Step 1: Load the dataset
data_path = 'data.csv'  # Replace with your dataset path
data = pd.read_csv(data_path)

# Step 2: Encode categorical features
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Step 3: Randomize the dataset
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 4: Split features and target
X = data.drop('Crash_Severity', axis=1)
y = data['Crash_Severity']

# Step 5: Scale numeric features
scaler = StandardScaler()
X[X.select_dtypes(include=['float64', 'int64']).columns] = scaler.fit_transform(
    X.select_dtypes(include=['float64', 'int64'])
)

# Step 6: Handle class imbalance using SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Step 7: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 8: Define Models
models = {
    'RandomForest': RandomForestClassifier(random_state=42, n_estimators=200),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# Train and evaluate each model
feature_importances = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Store feature importances if available
    if hasattr(model, 'feature_importances_'):
        feature_importances[model_name] = model.feature_importances_

# Step 9: Display Top Features for Each Model
for model_name, importances in feature_importances.items():
    importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    print(f"\nTop Features for {model_name}:")
    print(importance_df.head(10))


RandomForest Accuracy: 0.6667
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3

Confusion Matrix:
 [[0 1]
 [0 2]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


CatBoost Accuracy: 0.3333
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.50      0.50      0.50         2

    accuracy                           0.33         3
   macro avg       0.25      0.25      0.25         3
weighted avg       0.33      0.33      0.33         3

Confusion Matrix:
 [[0 1]
 [1 1]]
XGBoost Accuracy: 0.3333
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         0
           2       1.00      0.50      0.67         2

    accuracy                           0.33         3
   macro avg       0.33      0.17      0.22         3
weighted avg       0.67      0.33      0.44         3

Confusion Matrix:
 [[0 1 0]
 [0 0 0]
 [0 1 1]]

Top Features for RandomForest:
                   Feature  Importance
1               Crash_Time    0.183427
4 

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
