In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the pre-encoded dataset from its relative path into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/encoded_data.csv')
# Bare trailing expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,ROAD_CLASS_Expressway,ROAD_CLASS_Expressway Ramp,ROAD_CLASS_Laneway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Major Arterial.1,ROAD_CLASS_Major Shoreline,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,ROAD_CLASS_Pending,...,DIVISION_D33,DIVISION_D41,DIVISION_D42,DIVISION_D43,DIVISION_D51,DIVISION_D52,DIVISION_D53,DIVISION_D55,DIVISION_NSA,INJURY_NUM
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18952,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
18953,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
18954,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
18955,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [11]:
# Columns of `data` used as model inputs (one entry per line for easy diffing).
# NOTE(review): 'ACCLASS_Non-Fatal Injury' looks like an outcome label of the
# collision itself; presumably it is tied to INJURY_NUM by construction —
# confirm it is not target leakage before trusting the scores.
features = [
    'INVAGE_unknown',
    'IMPACTYPE_Rear End',
    'INVTYPE_Passenger',
    'IMPACTYPE_Turning Movement',
    'ACCLASS_Non-Fatal Injury',
    'IMPACTYPE_Pedestrian Collisions',
    'VEHTYPE_Motorcycle',
    'INVTYPE_Driver',
    'IMPACTYPE_Approaching',
    'DRIVCOND_Unknown',
    'DISTRICT_Toronto and East York',
    'INVTYPE_Motorcycle Driver',
    'IMPACTYPE_Cyclist Collisions',
    'INVTYPE_Vehicle Owner',
    'INVTYPE_Pedestrian',
]

# Feature matrix: every row, restricted to the selected columns.
X = data.loc[:, features]
# Target vector: the INJURY_NUM column.
y = data.loc[:, 'INJURY_NUM']

In [12]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

In [13]:
# Candidate classifiers to compare, keyed by the display name printed in the
# results. Estimators whose fitting involves randomness (random forest,
# decision tree, gradient boosting) are given a fixed random_state — the same
# seed as the train/test split — so that re-running the notebook reproduces
# the same scores. The remaining estimators are left at the settings they
# already had.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

In [14]:
# Fit every candidate on the training split, predict the held-out split, and
# print one "<name>: <accuracy>" line per model (accuracy to 4 decimals).
# NOTE(review): scores come from a single hold-out split; cross_val_score is
# imported at the top of the notebook but unused — consider it before ranking
# models on small accuracy gaps.
for name in models:
    model = models[name]
    # fit() returns the estimator itself, so fitting and predicting can chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: {accuracy:.4f}")

Logistic Regression: 0.8386
Random Forest: 0.8457
Decision Tree: 0.8449
K-Nearest Neighbors: 0.8378
Support Vector Machine: 0.8436
Gradient Boosting: 0.8426
