In [27]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import joblib
from sklearn.preprocessing import LabelEncoder

In [28]:
# Read the pre-encoded dataset from its CSV file into a DataFrame,
# then leave the frame as the cell's last expression so it is displayed.
csv_path = 'data/encoded_data.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,ROAD_CLASS_Expressway,ROAD_CLASS_Expressway Ramp,ROAD_CLASS_Laneway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Major Arterial.1,ROAD_CLASS_Major Shoreline,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,ROAD_CLASS_Pending,...,DIVISION_D33,DIVISION_D41,DIVISION_D42,DIVISION_D43,DIVISION_D51,DIVISION_D52,DIVISION_D53,DIVISION_D55,DIVISION_NSA,INJURY_NUM
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18952,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
18953,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
18954,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
18955,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [30]:
# Select features and target, then create a stratified train/test split.

# Feature columns: a hand-picked subset of the columns in `data`.
# NOTE(review): 'ACCLASS_Non-Fatal Injury' looks like an outcome-style column;
# confirm it does not leak information about the INJURY_NUM target.
X_common = [
    "INVTYPE_Driver",
    "IMPACTYPE_Rear End",
    "INVTYPE_Pedestrian",
    'INVTYPE_Motorcycle Driver',
    'INVTYPE_Vehicle Owner',
    'IMPACTYPE_Approaching',
    'IMPACTYPE_Turning Movement',
    'INVTYPE_Passenger',
    'IMPACTYPE_Cyclist Collisions',
    'IMPACTYPE_Pedestrian Collisions',
    'ACCLASS_Non-Fatal Injury',
]
X = data[X_common]

# Encode the target BEFORE splitting: train_test_split needs `y` both as the
# label array and for `stratify`, so `y` must already exist when this cell runs
# on a fresh kernel (previously it was assigned only after the split).
le = LabelEncoder()
y = le.fit_transform(data["INJURY_NUM"])

# Hold out 20% of the rows, keeping class proportions equal in both parts;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [31]:
# Candidate classifiers to compare, keyed by the display name used when
# reporting results.
# The tree-based estimators are randomized, so they get a fixed seed (the same
# value the final model cell uses) to make the comparison repeatable across
# runs. The remaining estimators are left at their original settings.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

In [32]:
# Fit every candidate on the training split and report its accuracy on the
# held-out test split, one line per model.
for name in models:
    model = models[name]
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    preds = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name}: {acc:.4f}")

Logistic Regression: 0.8397
Random Forest: 0.8447
Decision Tree: 0.8444
K-Nearest Neighbors: 0.7975
Support Vector Machine: 0.8447
Gradient Boosting: 0.8410


In [33]:
# Final model: a random forest trained on the selected feature columns.
# class_weight='balanced' is passed to the classifier; the seed is fixed.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)

# Train on the training split; as the last expression, the fitted estimator
# is what the cell displays.
rf_model.fit(X_train, y_train)

In [34]:
# Persist the fitted random forest to disk with joblib.
joblib.dump(value=rf_model, filename="injury_model.pkl")

['injury_model.pkl']

In [35]:
# Persist the fitted LabelEncoder as well, so encoded predictions can be
# mapped back to the original INJURY_NUM values later.
joblib.dump(value=le, filename="label_encoder.pkl")

['label_encoder.pkl']