In [17]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,classification_report,confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
import joblib


In [18]:
#pip install --upgrade imbalanced-learn
import imblearn
print(imblearn.__version__)



0.14.1


In [19]:
# read the dataset as a dataframe
df = pd.read_excel('traffic_case_study.xlsx')
df.head()

Unnamed: 0,AUTOMOBILE_COUNT,SPEED_LIMIT,URBAN_RURAL,URBAN,YOUNG_DRIVER,COLLISION_TYPE,LANE_COUNT,BICYCLE_COUNT,BUS_COUNT,ILLUMINATION,...,DRUGGED_DRIVER,FATAL_OR_SUSP_SERIOUS_INJ,FATIGUE_ASLEEP,HIT_FIXED_OBJECT,LANE_DEPARTURE,MATURE_DRIVER,OVERTURNED,SCHOOL_ZONE,SPEEDING_RELATED,WET_ROAD
0,1,25.0,3,1.0,0.0,2,2.0,1,0.0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,35.0,3,1.0,0.0,2,4.0,1,0.0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,25.0,1,0.0,1.0,7,2.0,0,0.0,1,...,0,0,0,1,1,0,0,0,0,0
3,0,35.0,3,1.0,0.0,7,2.0,0,0.0,1,...,1,0,0,1,1,0,0,0,1,0
4,3,35.0,3,1.0,0.0,2,2.0,0,0.0,1,...,0,0,0,0,1,1,0,0,0,0


In [20]:
# read the info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   AUTOMOBILE_COUNT           600 non-null    int64  
 1   SPEED_LIMIT                540 non-null    float64
 2   URBAN_RURAL                600 non-null    int64  
 3   URBAN                      540 non-null    float64
 4   YOUNG_DRIVER               540 non-null    float64
 5   COLLISION_TYPE             600 non-null    int64  
 6   LANE_COUNT                 540 non-null    float64
 7   BICYCLE_COUNT              600 non-null    int64  
 8   BUS_COUNT                  540 non-null    float64
 9   ILLUMINATION               600 non-null    int64  
 10  MOTORCYCLE_COUNT           600 non-null    int64  
 11  PED_COUNT                  600 non-null    int64  
 12  UNBELTED_OCC_COUNT         600 non-null    int64  
 13  CURVED_ROAD                600 non-null    int64  

In [21]:
# check for duplicates
df.duplicated().sum()

101

In [22]:
# check for missing values
df.isna().sum()

AUTOMOBILE_COUNT              0
SPEED_LIMIT                  60
URBAN_RURAL                   0
URBAN                        60
YOUNG_DRIVER                 60
COLLISION_TYPE                0
LANE_COUNT                   60
BICYCLE_COUNT                 0
BUS_COUNT                    60
ILLUMINATION                  0
MOTORCYCLE_COUNT              0
PED_COUNT                     0
UNBELTED_OCC_COUNT            0
CURVED_ROAD                   0
DEER_RELATED                  0
DRINKING_DRIVER               0
DRUGGED_DRIVER                0
FATAL_OR_SUSP_SERIOUS_INJ     0
FATIGUE_ASLEEP                0
HIT_FIXED_OBJECT              0
LANE_DEPARTURE                0
MATURE_DRIVER                 0
OVERTURNED                    0
SCHOOL_ZONE                   0
SPEEDING_RELATED              0
WET_ROAD                      0
dtype: int64

#### Some columns have missing values and they yave to be dealt with

In [23]:
# classify the columns into features and target
X = df.drop(columns=['FATAL_OR_SUSP_SERIOUS_INJ'])
y = df['FATAL_OR_SUSP_SERIOUS_INJ']

# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)


In [24]:
# group columns with same data types

# Nominal categorical
categorical_features = [
    "COLLISION_TYPE",
    "ILLUMINATION",
    "URBAN_RURAL"
]

# Numeric counts & continuous
numeric_features = [
    "AUTOMOBILE_COUNT",
    "SPEED_LIMIT",
    "LANE_COUNT",
    "BICYCLE_COUNT",
    "BUS_COUNT",
    "MOTORCYCLE_COUNT",
    "PED_COUNT",
    "UNBELTED_OCC_COUNT"
]

# Binary variables
binary_features = list(
    set(X.columns) - set(categorical_features) - set(numeric_features)
)

In [25]:
# preprocessing steps using pipelines

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
]))

binary_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent"))
])


preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
    ("bin", binary_transformer, binary_features)
])


In [26]:
# training the model

model = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", LogisticRegression(max_iter=1000, solver="liblinear"))
])

# fit the model
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print(f' Training score: {y_train_pred}')
print(f'Test score{y_test_pred}')

print(f'f1_score:, {f1_score(y_test, y_test_pred)}')
print('...........Classification Report:........')
print(classification_report(y_test, y_test_pred))

print('..........Confusion Matrix:........')
print(f'{confusion_matrix(y_test, y_test_pred)}')

 Training score: [0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 1 1
 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0
 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1
 0 0 0 0 1 0]
Test score[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 

#### I chose the F1-score because the dataset is imbalanced, meaning serious or fatal crashes occur much less frequently than non-serious ones, which makes accuracy an unreliable measure. The F1-score combines precision and recall, allowing the model to correctly identify serious crashes while also controlling the number of false positive predictions. This balance is important to ensure the model is both sensitive to high-risk crashes and reliable in its predictions.

In [27]:
# finetune using gridsearch

# hyperparameter grid

param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__penalty": ["l1", "l2"],
    "classifier__solver": ["liblinear"],  # supports l1 and l2
    "classifier__max_iter": [1000],
    "smote__k_neighbors": [3, 5, 7]
}

# gridsearchcv

grid_search = GridSearchCV(
    estimator=model,          
    param_grid=param_grid,
    scoring="f1",             
    cv=5,
    n_jobs=-1,
    verbose=1
)

#fit gridsearchcv
grid_search.fit(X_train, y_train)

# best results
print("Best parameters:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_

y_train_pred_best = best_model.predict(X_train)
y_test_pred_best = best_model.predict(X_test)

print(f'Optimized Model Accuracy:, {accuracy_score(y_test, y_test_pred_best)}')
print('Optimized Classification Report:')
print(classification_report(y_test, y_test_pred_best ))
print('Optimized Confusion Matrix:')
print(confusion_matrix(y_test, y_test_pred_best))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters:
{'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'smote__k_neighbors': 7}
Optimized Model Accuracy:, 0.8866666666666667
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       143
           1       0.25      0.71      0.37         7

    accuracy                           0.89       150
   macro avg       0.62      0.80      0.65       150
weighted avg       0.95      0.89      0.91       150

Optimized Confusion Matrix:
[[128  15]
 [  2   5]]


In [30]:
# save the model
joblib.dump(model, "traffic_severity_model.pkl")

['traffic_severity_model.pkl']