In [4]:
pip install dmba


Collecting dmba
  Downloading dmba-0.2.4-py3-none-any.whl.metadata (1.9 kB)
Downloading dmba-0.2.4-py3-none-any.whl (11.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.2.4


In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt
import dmba
from sklearn.naive_bayes import MultinomialNB
from dmba import classificationSummary
from sklearn.feature_selection import mutual_info_classif


In [6]:
acci_df = dmba.load_data('accidentsFull.csv')
acci_df.dtypes
acci_df

Unnamed: 0,HOUR_I_R,ALCHL_I,ALIGN_I,STRATUM_R,WRK_ZONE,WKDY_I_R,INT_HWY,LGTCON_I_R,MANCOL_I_R,PED_ACC_R,...,SUR_COND,TRAF_CON_R,TRAF_WAY,VEH_INVL,WEATHER_R,INJURY_CRASH,NO_INJ_I,PRPTYDMG_CRASH,FATALITIES,MAX_SEV_IR
0,0,2,2,1,0,1,0,3,0,0,...,4,0,3,1,1,1,1,0,0,1
1,1,2,1,0,0,1,1,3,2,0,...,4,0,3,2,2,0,0,1,0,0
2,1,2,1,0,0,1,0,3,2,0,...,4,1,2,2,2,0,0,1,0,0
3,1,2,1,1,0,0,0,3,2,0,...,4,1,2,2,1,0,0,1,0,0
4,1,1,1,0,0,1,0,3,2,0,...,4,0,2,3,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42178,0,2,1,0,1,1,0,1,0,0,...,1,2,1,1,1,0,0,1,0,0
42179,1,2,1,1,0,0,0,1,0,0,...,1,0,1,1,1,1,1,0,0,1
42180,0,2,2,0,0,1,0,1,0,0,...,1,0,1,1,1,0,0,1,0,0
42181,1,2,1,1,0,1,0,1,0,0,...,1,0,1,1,1,0,0,1,0,0


In [7]:
# Create the dummy variable 'INJURY', as this dependent varible we need to add into the data set and use remaining as independt predictors
acci_df['INJURY'] = acci_df['MAX_SEV_IR'].apply(lambda x: 'yes' if x in [1, 2] else 'no')
acci_df.columns

Index(['HOUR_I_R', 'ALCHL_I', 'ALIGN_I', 'STRATUM_R', 'WRK_ZONE', 'WKDY_I_R',
       'INT_HWY', 'LGTCON_I_R', 'MANCOL_I_R', 'PED_ACC_R', 'RELJCT_I_R',
       'REL_RWY_R', 'PROFIL_I_R', 'SPD_LIM', 'SUR_COND', 'TRAF_CON_R',
       'TRAF_WAY', 'VEH_INVL', 'WEATHER_R', 'INJURY_CRASH', 'NO_INJ_I',
       'PRPTYDMG_CRASH', 'FATALITIES', 'MAX_SEV_IR', 'INJURY'],
      dtype='object')

In [8]:
# a. Using the information in this dataset, if an accident has just been reported and
#no further information is available, what should the prediction be? (INJURY = Yes or No?) Why?

# Calculate the proportions of 'yes' and 'no' in the INJURY column
injury_Pred = acci_df['INJURY'].value_counts(normalize=True)
print(injury_Pred)

# Make a prediction based on the higher proportion
if injury_Pred.get('yes', 0) > injury_Pred.get('no', 0):
    prediction = 'yes'
else:
    prediction = 'no'

print(f"Predicted severity of the accident: {prediction}")

INJURY
yes    0.508783
no     0.491217
Name: proportion, dtype: float64
Predicted severity of the accident: yes


In [9]:
# b. Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R.

# i. Create a pivot table that examines INJURY as a function of the two predictors for
# these 12 records. Use all three variables in the pivot table as rows/columns and use 	counts for the cells.
# Select the first 12 records and relevant columns
filterData = acci_df[['INJURY', 'WEATHER_R', 'TRAF_CON_R']].head(12)
filterData

Unnamed: 0,INJURY,WEATHER_R,TRAF_CON_R
0,yes,1,0
1,no,2,0
2,no,2,1
3,no,1,1
4,no,1,0
5,yes,2,0
6,no,2,0
7,yes,1,0
8,no,2,0
9,no,2,0


In [23]:
X1 = acci_df[['WEATHER_R', 'TRAF_CON_R']]
y1 = acci_df['INJURY']
mod = MultinomialNB()
mod.fit(X1,y1)


In [29]:
y_pred = mod.predict(X1)
y_pred

array(['no', 'no', 'yes', ..., 'no', 'no', 'yes'], dtype='<U3')

In [10]:
# Create the pivot table
pivot_table = pd.pivot_table(filterData,
                             index='WEATHER_R',
                             columns='TRAF_CON_R',
                             values='INJURY',
                             aggfunc=lambda x: x.count(),
                             fill_value=0)

# Display the pivot table
print(pivot_table)

TRAF_CON_R  0  1  2
WEATHER_R          
1           3  1  1
2           6  1  0


In [11]:
# ii. Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given 	the six possible combinations of the predictors.
# Count injuries for each combination
injury_sum = pd.pivot_table(filterData,
                                index='WEATHER_R',
                                columns='TRAF_CON_R',
                                values='INJURY',
                                aggfunc=lambda x: (x == 'yes').sum(),
                                fill_value=0)
injury_sum

TRAF_CON_R,0,1,2
WEATHER_R,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,0,0
2,1,0,0


In [12]:
#  Calculate probabilities
probabilities = injury_sum / pivot_table

# Display the probabilities
probabilities

TRAF_CON_R,0,1,2
WEATHER_R,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.666667,0.0,0.0
2,0.166667,0.0,


In [13]:
# iii. Classify the 12 accidents OR RECORDS using these probabilities and a cutoff of 0.5.

pivot_table = pd.pivot_table(filterData, values='INJURY', index=['WEATHER_R', 'TRAF_CON_R'], columns='INJURY', aggfunc='size', fill_value=0)
print(pivot_table)

INJURY                no  yes
WEATHER_R TRAF_CON_R         
1         0            1    2
          1            1    0
          2            1    0
2         0            5    1
          1            1    0


In [14]:
# Calculate probabilities
total_counts = pivot_table.sum(axis=1)
prob_injury_yes = pivot_table['yes'] / total_counts
print(prob_injury_yes)

WEATHER_R  TRAF_CON_R
1          0             0.666667
           1             0.000000
           2             0.000000
2          0             0.166667
           1             0.000000
dtype: float64


In [15]:
# Using a cutoff of 0.5, classify each record:
# Classify based on probabilities
classification = prob_injury_yes.apply(lambda x: 'yes' if x > 0.5 else 'no')
print(classification)

WEATHER_R  TRAF_CON_R
1          0             yes
           1              no
           2              no
2          0              no
           1              no
dtype: object


In [64]:
# Manually compute the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1:
# Calculate naive Bayes probability

p_injury = filterData['INJURY'].value_counts(normalize=True)['yes']
p_weather = filterData['WEATHER_R'].value_counts(normalize=True)[1]
p_traf_con = filterData['TRAF_CON_R'].value_counts(normalize=True)[1]

# Assuming independence
p_injury_given_weather_traf_con = (p_injury * p_weather * p_traf_con) / (p_weather * p_traf_con)
print(f"the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1 is: \n {p_injury_given_weather_traf_con}")

the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1 is: 
 0.25


In [31]:
# Assuming that no information or initial reports about the accident itself are available at the time of
#prediction (only location characteristics, weather conditions, etc.), which predictors can we include in the analysis?

predictors = ['HOUR_I_R', 'ALCHL_I', 'ALIGN_I', 'STRATUM_R', 'WRK_ZONE', 'WKDY_I_R',
       'INT_HWY', 'LGTCON_I_R', 'MANCOL_I_R', 'PED_ACC_R', 'RELJCT_I_R',
       'REL_RWY_R', 'PROFIL_I_R', 'SPD_LIM', 'SUR_COND', 'TRAF_CON_R',
       'TRAF_WAY', 'VEH_INVL', 'WEATHER_R', 'INJURY_CRASH', 'NO_INJ_I',
       'PRPTYDMG_CRASH', 'FATALITIES', 'MAX_SEV_IR']  # Include all relevant predictors

X = acci_df[predictors].astype('category')
y = acci_df['INJURY'].astype('category')

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.4, random_state=1) # no need to split in random forest beacause it randomly selects
model = MultinomialNB()
model.fit(train_X, train_y)

# Get the conditional probabilities
feature_importance = {}

for idx, feature in enumerate(predictors):
    prob_given_yes = model.feature_log_prob_[1][idx]  # Probability given class 'yes'
    prob_given_no = model.feature_log_prob_[0][idx]   # Probability given class 'no'
    feature_importance[feature] = np.exp(prob_given_yes) / np.exp(prob_given_no)  # Likelihood ratio

# Sort by importance
importance_df = pd.DataFrame.from_dict(feature_importance, orient='index', columns=['Importance'])
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Variable Importance in Naive Bayes:")
print(importance_df)

Variable Importance in Naive Bayes:
                  Importance
NO_INJ_I        18183.330974
MAX_SEV_IR      12113.345059
INJURY_CRASH    11608.354550
FATALITIES        253.413419
PED_ACC_R         117.525064
STRATUM_R           1.120411
TRAF_CON_R          1.058627
RELJCT_I_R          1.010008
REL_RWY_R           0.998587
VEH_INVL            0.992063
WRK_ZONE            0.973515
ALIGN_I             0.962118
TRAF_WAY            0.961650
SPD_LIM             0.957028
LGTCON_I_R          0.956455
HOUR_I_R            0.951182
ALCHL_I             0.941517
WKDY_I_R            0.930095
WEATHER_R           0.929994
INT_HWY             0.929805
MANCOL_I_R          0.927263
PROFIL_I_R          0.916087
SUR_COND            0.913798
PRPTYDMG_CRASH      0.000074


In [32]:
predct_y = model.predict(test_X)
conf_matrix = confusion_matrix(test_y, predct_y)

In [35]:
print(f"Confusion Matrix by including all the variables in addition to 2 predictors is :\n {conf_matrix}")

Confusion Matrix by including all the variables in addition to 2 predictors is :
 [[8329    0]
 [   0 8545]]


In [40]:
print(f"Accuracy to predict Injuries by including all is :")
classificationSummary(test_y,predct_y)


Accuracy to predict Injuries by including all is :
Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual    0    1
     0 8329    0
     1    0 8545


In [48]:
# considering only NO_INJ_I, MAX_SEV_IR, INJURY_CRASH in addition to WEATHER_R, TRAF_CON_R
# and calculate metrics to see if the accuracy is reducing or not
good_assu_pred = ['NO_INJ_I','MAX_SEV_IR','INJURY_CRASH','WEATHER_R','TRAF_CON_R']
X_good = acci_df[ good_assu_pred].astype('category')

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_good, y, test_size=0.4, random_state=1) # no need to split in random forest beacause it randomly selects
model1 = MultinomialNB()
model1.fit(train_X, train_y)

In [57]:
y_pred = model1.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confustion matrix for high important 3 varibales with the mentioned 2 predictorsn is same as above where we included all the variables: \n {conf_matrix}")

Confustion matrix for high important 3 varibales with the mentioned 2 predictorsn is same as above where we included all the variables: 
 [[8329    0]
 [   0 8545]]


In [58]:
print(f"classificationSummary for high important 3 varibales with the mentioned 2 predictorsn is same as above where we included all the variables: \n ")
classificationSummary(y_test,y_pred)

classificationSummary for high important 3 varibales with the mentioned 2 predictorsn is same as above where we included all the variables: 
 
Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual    0    1
     0 8329    0
     1    0 8545


In [63]:
# What is overall Error for validation set
accuracy_5_var = accuracy_score(y_test,y_pred)
print(f"overall accuracy for validation set is : \n {accuracy_5_var}")
# overall Error for validation set is
error_rate = 1 - accuracy_5_var
print(f"overall Error for validation set is: \n {error_rate}")


overall accuracy for validation set is : 
 1.0
overall Error for validation set is: 
 0.0


In [None]:
# From the above data we can notice that importanat variables to include only 3 high important variables NO_INJ_I,MAX_SEV_IR, INJURY_CRASH
# there is no significant reduce in accuracy or results and it's accuracy is same as above when included all variables from data table.