In [1]:
#Import libraries
import os
import pandas as pd
import numpy as np
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import warnings

# Silence only deprecation-style chatter from libraries. A blanket 'ignore'
# would also hide actionable warnings (for example a convergence warning from
# the iterative 'saga' solver used for the logistic regression in this notebook).
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

### Data preparation

In [2]:
# Load the CSV under src/data/final/ into a DataFrame, decoding it as ISO-8859-1.
df = pd.read_csv(
    filepath_or_buffer='src/data/final/data_final.csv',
    encoding='ISO-8859-1',
)

In [3]:
# Separate the target column ('Severity') from the remaining columns.
y = df['Severity']
X = df.drop('Severity', axis=1)

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Column names handed to the preprocessing step, split by feature type.
# Keep the order of each list stable: downstream code receives them as-is.
numerical_features = ['Driver_Age', 'Number_of_Lanes']
categorical_features = [
    'Safety_Equipment',
    'Department_Code',
    'Mobile_Obstacle',
    'Vehicle_Category',
    'Position_In_Vehicle',
    'Collision_Type',
    'Time_of_Day',
    'Journey_Type',
    'Obstacle_Hit',
    'Road_Category',
    'Gender',
    'User_Category',
    'Intersection_Type',
]

In [6]:
# Create the column transformer:
#   - 'num': numerical columns are standardised with StandardScaler;
#   - 'cat': categorical columns are one-hot encoded.
# handle_unknown='ignore' makes the encoder output all zeros for a category
# that was not present when it was fitted, instead of raising ValueError at
# transform time (e.g. a rare code that only appears in the test split).
# No `remainder` is given, so columns outside the two feature lists fall
# under ColumnTransformer's default handling.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

In [7]:
# Fit the preprocessor on the training split only, so that no statistics from
# the test split leak into the fitted scaler/encoder.
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Persist the fitted preprocessor next to the models: the estimators saved in
# this notebook are trained on already-transformed features, so they cannot be
# applied to raw rows later without this exact fitted transformer.
preprocessor_filename = 'src/models/preprocessor.joblib'
joblib.dump(preprocessor, preprocessor_filename)

# Apply the already-fitted transformation to the test split.
X_test_preprocessed = preprocessor.transform(X_test)

### Model training and evaluation

In [50]:
# Train Logistic Regression model and record how long fitting takes.
# - 'saga' is a stochastic solver, so random_state is pinned to make the
#   fitted coefficients reproducible across runs.
# - NOTE(review): max_iter=200 may be too few iterations for 'saga' to
#   converge; compare logistic_model.n_iter_ against max_iter after fitting.
# time.perf_counter() is used instead of time.time() because it is monotonic
# and intended for measuring elapsed intervals (wall-clock time can jump).
start_time = time.perf_counter()
logistic_model = LogisticRegression(
    class_weight='balanced',
    C=0.5,
    solver='saga',
    max_iter=200,
    random_state=42,
)
logistic_model.fit(X_train_preprocessed, y_train)
execution_time_logistic = time.perf_counter() - start_time

In [52]:
# Write the fitted logistic regression model to disk; the path is kept in a
# variable because the evaluation cell reads the file size from it.
logistic_model_filename = 'src/models/logistic_model.joblib'
joblib.dump(value=logistic_model, filename=logistic_model_filename)

['src/models/logistic_model.joblib']

In [53]:
# Evaluate the logistic regression model on the held-out test split.
# Size (in bytes) of the serialized model file written earlier.
model_size_logistic = os.path.getsize(logistic_model_filename)
# Mean accuracy on the test split.
logistic_score = logistic_model.score(X_test_preprocessed, y_test)
# Predicted labels, used for the per-class classification report.
logistic_predictions = logistic_model.predict(X_test_preprocessed)

In [54]:
# Summarise the logistic regression run: accuracy, per-class metrics,
# training time and on-disk model size.
logistic_report = classification_report(y_test, logistic_predictions)
print(f"Logistic Regression Accuracy: {logistic_score:.2f}")
print("Logistic Regression Classification Report:\n", logistic_report)
print(f"Execution Time: {execution_time_logistic:.2f} seconds")
print(f"Model Size: {model_size_logistic} bytes")

Logistic Regression Accuracy: 0.58
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           1       0.75      0.76      0.76    362257
           2       0.11      0.65      0.19     21167
           3       0.44      0.35      0.39    168698
           4       0.64      0.48      0.55    290978

    accuracy                           0.58    843100
   macro avg       0.48      0.56      0.47    843100
weighted avg       0.63      0.58      0.60    843100

Execution Time: 134.12 seconds
Model Size: 8439 bytes


In [42]:
# Train Decision Tree model and record how long fitting takes.
# random_state is pinned because scikit-learn permutes the candidate features
# at each split; without a seed, ties between equally good splits can be
# broken differently from run to run, giving a different tree.
# time.perf_counter() is used instead of time.time() because it is monotonic
# and intended for measuring elapsed intervals.
start_time = time.perf_counter()
decision_tree_model = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=10,
    min_samples_leaf=5,
    min_samples_split=5,
    random_state=42,
)
decision_tree_model.fit(X_train_preprocessed, y_train)
execution_time_tree = time.perf_counter() - start_time

In [45]:
# Write the fitted decision tree to disk; the path is kept in a variable
# because the evaluation cell reads the file size from it.
decision_tree_model_filename = 'src/models/decision_tree_model.joblib'
joblib.dump(value=decision_tree_model, filename=decision_tree_model_filename)

['src/models/decision_tree_model.joblib']

In [46]:
# Evaluate the decision tree on the held-out test split.
# Size (in bytes) of the serialized model file written earlier.
model_size_tree = os.path.getsize(decision_tree_model_filename)
# Mean accuracy on the test split.
tree_score = decision_tree_model.score(X_test_preprocessed, y_test)
# Predicted labels, used for the per-class classification report.
tree_predictions = decision_tree_model.predict(X_test_preprocessed)

In [47]:
# Summarise the decision tree run: accuracy, per-class metrics,
# training time and on-disk model size.
tree_report = classification_report(y_test, tree_predictions)
print(f"Decision Tree Accuracy: {tree_score:.2f}")
print("Decision Tree Classification Report:\n", tree_report)
print(f"Execution Time: {execution_time_tree:.2f} seconds")
print(f"Model Size: {model_size_tree} bytes")

Decision Tree Accuracy: 0.55
Decision Tree Classification Report:
               precision    recall  f1-score   support

           1       0.75      0.73      0.74    362257
           2       0.09      0.65      0.16     21167
           3       0.44      0.23      0.30    168698
           4       0.59      0.52      0.55    290978

    accuracy                           0.55    843100
   macro avg       0.47      0.53      0.44    843100
weighted avg       0.61      0.55      0.57    843100

Execution Time: 86.20 seconds
Model Size: 168633 bytes


In [48]:
# Train Random Forest model and record how long fitting takes.
# random_state is pinned because the forest draws bootstrap samples and random
# feature subsets for its trees; without a seed every run yields a different
# ensemble and therefore different metrics.
# time.perf_counter() is used instead of time.time() because it is monotonic
# and intended for measuring elapsed intervals.
start_time = time.perf_counter()
random_forest_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=30,
    max_depth=5,
    min_samples_split=5,
    random_state=42,
)
random_forest_model.fit(X_train_preprocessed, y_train)
execution_time_forest = time.perf_counter() - start_time

In [51]:
# Write the fitted random forest to disk; the path is kept in a variable
# because the evaluation cell reads the file size from it.
random_forest_model_filename = 'src/models/random_forest_model.joblib'
joblib.dump(value=random_forest_model, filename=random_forest_model_filename)

['src/models/random_forest_model.joblib']

In [55]:
# Evaluate the random forest on the held-out test split.
# Size (in bytes) of the serialized model file written earlier.
model_size_forest = os.path.getsize(random_forest_model_filename)
# Mean accuracy on the test split.
forest_score = random_forest_model.score(X_test_preprocessed, y_test)
# Predicted labels, used for the per-class classification report.
forest_predictions = random_forest_model.predict(X_test_preprocessed)

In [56]:
# Summarise the random forest run: accuracy, per-class metrics,
# training time and on-disk model size.
forest_report = classification_report(y_test, forest_predictions)
print(f"Random Forest Accuracy: {forest_score:.2f}")
print("Random Forest Classification Report:\n", forest_report)
print(f"Execution Time: {execution_time_forest:.2f} seconds")
print(f"Model Size: {model_size_forest} bytes")

Random Forest Accuracy: 0.54
Random Forest Classification Report:
               precision    recall  f1-score   support

           1       0.69      0.78      0.73    362257
           2       0.08      0.62      0.15     21167
           3       0.44      0.15      0.23    168698
           4       0.62      0.45      0.52    290978

    accuracy                           0.54    843100
   macro avg       0.46      0.50      0.41    843100
weighted avg       0.60      0.54      0.54    843100

Execution Time: 41.42 seconds
Model Size: 194009 bytes
