In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from category_encoders import BinaryEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the encoded dataset CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
encoded_csv_path = 'data/encoded_data.csv'
data = pd.read_csv(encoded_csv_path)

In [3]:
# Preview the first five rows (five is head()'s default row count).
data.head(n=5)

Unnamed: 0,ROAD_CLASS_Expressway,ROAD_CLASS_Expressway Ramp,ROAD_CLASS_Laneway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Major Arterial.1,ROAD_CLASS_Major Shoreline,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,ROAD_CLASS_Pending,...,DIVISION_D33,DIVISION_D41,DIVISION_D42,DIVISION_D43,DIVISION_D51,DIVISION_D52,DIVISION_D53,DIVISION_D55,DIVISION_NSA,INJURY_NUM
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3


In [7]:
# Split the frame into the target vector (y) and the feature matrix (X).
# The target column name is bound once so both statements stay in sync.
target_column = 'INJURY_NUM'
y = data[target_column]
X = data.drop(columns=[target_column])

In [4]:
# Hand-picked subset of 20 encoded columns, taken from the top of a
# Random Forest feature-importance bar chart (the chart itself is not part
# of this notebook section). List order is the column order fed to the models.
# NOTE(review): 'ACCLASS_Non-Fatal Injury' may be closely tied to the
# INJURY_NUM target -- confirm it does not leak label information.
selected_features = [
    "ACCLASS_Non-Fatal Injury",
    "LIGHT_Daylight",
    "INVTYPE_Passenger",
    "IMPACTYPE_Pedestrian Collisions",
    "TRAFFCTL_Traffic Signal",
    "INVAGE_20 to 24",
    "LIGHT_Dark, artificial",
    "INVAGE_25 to 29",
    "INITDIR_North",
    "INVTYPE_Pedestrian",
    "INITDIR_West",
    "RDSFCOND_Wet",
    "INVTYPE_Driver",
    "ROAD_CLASS_Major Arterial",
    "INITDIR_South",
    "INVAGE_30 to 34",
    "IMPACTYPE_Turning Movement",
    "INVAGE_35 to 39",
    "INVAGE_unknown",
    "INVAGE_40 to 44",
]

In [8]:
# Keep only the selected columns, in the order given by `selected_features`.
X_selected = X.loc[:, selected_features]

# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle
# repeatable across runs.
# NOTE(review): the split is not stratified -- if the INJURY_NUM classes are
# imbalanced, consider stratify=y (confirm every class has >= 2 rows first).
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    random_state=42,
    test_size=0.2,
)


In [9]:
# Candidate classifiers keyed by display name. Insertion order is the order
# in which they are trained and reported. Estimators that accept a seed are
# pinned to 42; the others use their default settings.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Support Vector Machine", SVC()),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])

In [10]:
# Train and evaluate
# Fit every candidate on the training split and score it on the held-out
# split. Scores are kept in `accuracy_by_model` (display name -> accuracy)
# so later cells can compare or plot them without re-parsing printed text.
# NOTE(review): accuracy alone can look good on an imbalanced target --
# confirm the INJURY_NUM class distribution or compare against a
# majority-class baseline.
accuracy_by_model = {}
print("✅ Accuracy using Random Forest Selected Features:\n")
for name, model in models.items():
    # fit() trains the instance stored in `models` in place, so the fitted
    # estimators remain available through that dict afterwards.
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    accuracy_by_model[name] = acc
    print(f"{name}: {acc:.4f}")

✅ Accuracy using Random Forest Selected Features:

Logistic Regression: 0.8402
Random Forest: 0.8289
Decision Tree: 0.8244
K-Nearest Neighbors: 0.8201
Support Vector Machine: 0.8407
Gradient Boosting: 0.8399
