In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from mord import LogisticIT
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pylab as plt
import seaborn as sns
import dmba
from dmba import classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score


%matplotlib inline

no display found. Using non-interactive Agg backend


In [None]:
# Load the raw crash records; the path is relative to the notebook's working directory.
crash_df = pd.read_csv("Crash_Excel.csv")

In [None]:
# Drop every row that has at least one missing value; `crash_df` itself is left untouched.
new_df = crash_df.dropna()

# Normalise the headers in a single pass: trim surrounding whitespace, then turn
# both spaces and hyphens into underscores.
new_df.columns = [s.strip().replace(' ', '_').replace('-', '_') for s in new_df.columns]

# Keep only rows whose model year lies in the inclusive range 1960-2025.
new_df = new_df[(new_df['Vehicle_Year'] >= 1960) & (new_df['Vehicle_Year'] <= 2025)].copy()

# Decade bins are left-closed ([1960, 1970) -> '1960s', ...) so that a decade's
# first year gets that decade's label and 1960 itself is not turned into NaN.
# The last edge is 2026 so that the half-open bin [2020, 2026) still covers 2025,
# the largest year the filter above lets through.
bins = [1960, 1970, 1980, 1990, 2000, 2010, 2020, 2026]
labels = ['1960s', '1970s', '1980s', '1990s', '2000s', '2010s', '2020s']

new_df['Year_Bin'] = pd.cut(new_df['Vehicle_Year'], bins=bins, labels=labels, right=False)

In [None]:
# Speed-limit bands. Intervals are right-closed: (0, 20], (20, 40], (40, 60], (60, 80].
bins = [0, 20, 40, 60, 80]
labels = ['Slow', 'Average', 'Fast', 'Fastest']

# include_lowest=True closes the first interval on the left as well, so a
# Speed_Limit of exactly 0 is labelled 'Slow' instead of silently becoming NaN
# (which would later show up as an all-zero dummy row in the design matrix).
# Values above 80 still fall outside every bin and remain NaN.
new_df['Speed_Category'] = pd.cut(
    new_df['Speed_Limit'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

In [None]:
# Upper-case the severity text so the comparison below is case-insensitive.
new_df['Injury_Severity'] = new_df['Injury_Severity'].str.upper()

# Binary target: 1 for serious or fatal injuries, 0 for everything else.
severe_mask = new_df['Injury_Severity'].isin(['SUSPECTED SERIOUS INJURY', 'FATAL INJURY'])
new_df['Severity'] = severe_mask.astype('int64')

In [None]:
# Print column dtypes and non-null counts for the cleaned frame with its derived columns.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115192 entries, 5 to 195089
Data columns (total 29 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   Agency_Name                    115192 non-null  object  
 1   ACRS_Report_Type               115192 non-null  object  
 2   Crash_Day                      115192 non-null  object  
 3   Crash_Time                     115192 non-null  object  
 4   Route_Type                     115192 non-null  object  
 5   Non_Motor_Related              115192 non-null  int64   
 6   Collision_Type                 115192 non-null  object  
 7   Weather                        115192 non-null  object  
 8   Surface_Condition              115192 non-null  object  
 9   Light                          115192 non-null  object  
 10  Traffic_Control                115192 non-null  object  
 11  Driver_Substance_Abuse         115192 non-null  object  
 12  Driver_At_Fault      

In [None]:
# Columns to store as pandas categoricals. 'Severity' is included so that the
# target exposes `.cat.categories` later on.
columns = [
    'Weather',
    'Surface_Condition',
    'Collision_Type',
    'Crash_Day',
    'Vehicle_Body_Type',
    'Severity',
    'Vehicle_Make',
]

# Cast all of them in one call via a column -> dtype mapping.
new_df = new_df.astype({name: 'category' for name in columns})

In [None]:
# Print the dtypes again to confirm which columns are now stored as `category`.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115192 entries, 5 to 195089
Data columns (total 29 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   Agency_Name                    115192 non-null  object  
 1   ACRS_Report_Type               115192 non-null  object  
 2   Crash_Day                      115192 non-null  category
 3   Crash_Time                     115192 non-null  object  
 4   Route_Type                     115192 non-null  object  
 5   Non_Motor_Related              115192 non-null  int64   
 6   Collision_Type                 115192 non-null  category
 7   Weather                        115192 non-null  category
 8   Surface_Condition              115192 non-null  category
 9   Light                          115192 non-null  object  
 10  Traffic_Control                115192 non-null  object  
 11  Driver_Substance_Abuse         115192 non-null  object  
 12  Driver_At_Fault      

In [None]:
# Target column and the features used to predict it.
outcome = 'Severity'
predictors = [
    'Weather',
    'Surface_Condition',
    'Speed_Category',
    'Collision_Type',
    'Crash_Day',
    'Vehicle_Body_Type',
    'Vehicle_Year',
    'Vehicle_Make',
]
# NOTE(review): 'Vehicle_Year' enters as a raw number while 'Year_Bin' is left
# unused; a value near 2000 sits next to 0/1 dummies in the multinomial model.
# Confirm this is intended before relying on the results.

# Response vector and its class labels, taken from the categorical's categories.
y = new_df[outcome]
classes = list(y.cat.categories)

# One-hot encode the categorical/object predictors; numeric columns pass through unchanged.
X = pd.get_dummies(new_df[predictors])

In [None]:
# Hold out 40% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=1,
)

In [None]:
# Fit a multinomial naive Bayes classifier with light additive smoothing (alpha=0.01).
# `fit` returns the estimator itself, so construction and training can be chained.
injury_nb = MultinomialNB(alpha=0.01).fit(X_train, y_train)
injury_nb

In [None]:
# Class-membership probabilities for the training and validation partitions.
predProb_train, predProb_valid = (
    injury_nb.predict_proba(features) for features in (X_train, X_valid)
)

In [None]:
# Hard class predictions for the training and validation partitions.
y_train_pred, y_valid_pred = (
    injury_nb.predict(features) for features in (X_train, X_valid)
)

In [None]:
# Confusion matrix and accuracy: training partition first, then validation.
for actual, predicted in ((y_train, y_train_pred), (y_valid, y_valid_pred)):
    classificationSummary(actual, predicted, class_names=classes)

Confusion Matrix (Accuracy 0.9880)

       Prediction
Actual     0     1
     0 68177   204
     1   622   112
Confusion Matrix (Accuracy 0.9852)

       Prediction
Actual     0     1
     0 45348   265
     1   418    46
