In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

### Importing the dataset

In [3]:
# Read the raw collision records into a DataFrame. low_memory=False is kept
# from the original call; pandas documents it as disabling chunked parsing,
# which avoids mixed-dtype inference within a column.
df = pd.read_csv(
    "Data-Collisions.csv",
    low_memory=False,
)

In [4]:
# (rows, columns) of the raw table, before any column selection or cleaning.
df.shape

(194673, 38)

In [5]:
# List every available column; the target and features are chosen from these.
df.columns

Index(['SEVERITYCODE', 'X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO',
       'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE',
       'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SEVERITYDESC', 'COLLISIONTYPE',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INCDATE',
       'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC',
       'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

### SEVERITYCODE values are as follows:
#### 0: Little to no Probability (Clear Conditions)
#### 1: Very Low Probability - Chance of Property Damage
#### 2: Low Probability - Chance of Injury
#### 3: Mild Probability - Chance of Serious Injury
#### 4: High Probability - Chance of Fatality

Note: only codes 1 and 2 actually occur in this dataset.

#### selecting specific columns from the dataset

In [6]:
# Keep only the target (SEVERITYCODE) and the three condition columns, then
# discard every row that has a missing value in any of them.
selected_columns = ['SEVERITYCODE', 'WEATHER', 'ROADCOND', 'LIGHTCOND']
df = df[selected_columns].dropna()
df.head()

Unnamed: 0,SEVERITYCODE,WEATHER,ROADCOND,LIGHTCOND
0,2,Overcast,Wet,Daylight
1,1,Raining,Wet,Dark - Street Lights On
2,1,Overcast,Dry,Daylight
3,1,Clear,Dry,Daylight
4,2,Raining,Wet,Daylight


In [7]:
df.shape #(rows, columns) left after keeping four columns and dropping incomplete rows

(189337, 4)

In [8]:
df['WEATHER'].value_counts().to_frame() #number of records per WEATHER label, most frequent first

Unnamed: 0,WEATHER
Clear,111008
Raining,33117
Overcast,27681
Unknown,15039
Snowing,901
Other,824
Fog/Smog/Smoke,569
Sleet/Hail/Freezing Rain,113
Blowing Sand/Dirt,55
Severe Crosswind,25


In [9]:
df['ROADCOND'].value_counts().to_frame() #number of records per ROADCOND label, most frequent first

Unnamed: 0,ROADCOND
Dry,124300
Wet,47417
Unknown,15031
Ice,1206
Snow/Slush,999
Other,131
Standing Water,115
Sand/Mud/Dirt,74
Oil,64


In [10]:
df['LIGHTCOND'].value_counts().to_frame() #number of records per LIGHTCOND label, most frequent first

Unnamed: 0,LIGHTCOND
Daylight,116077
Dark - Street Lights On,48440
Unknown,13456
Dusk,5889
Dawn,2502
Dark - No Street Lights,1535
Dark - Street Lights Off,1192
Other,235
Dark - Unknown Lighting,11


In [11]:
# Report how many distinct values each column holds; these are the labels
# that the encoding dictionaries defined later must cover.
for column_name, column_values in df.items():
    distinct_count = len(column_values.unique())
    print(column_name, ": ", distinct_count, 'labels')

SEVERITYCODE :  2 labels
WEATHER :  11 labels
ROADCOND :  9 labels
LIGHTCOND :  9 labels


### Creating filters to replace the categorical variables with the numerical ones

In [12]:
# Label -> integer code lookups for the three categorical columns.
# Each list is written in code order, so a label's position IS its code
# (0 is always 'Unknown'). The codes are identifiers only: the categories are
# nominal, so the numeric order carries no meaning.

# WEATHER
filt1 = {label: code for code, label in enumerate([
    'Unknown',                   # 0
    'Clear',                     # 1
    'Raining',                   # 2
    'Overcast',                  # 3
    'Snowing',                   # 4
    'Other',                     # 5
    'Fog/Smog/Smoke',            # 6
    'Sleet/Hail/Freezing Rain',  # 7
    'Blowing Sand/Dirt',         # 8
    'Severe Crosswind',          # 9
    'Partly Cloudy',             # 10
])}

# ROADCOND
filt2 = {label: code for code, label in enumerate([
    'Unknown',         # 0
    'Dry',             # 1
    'Wet',             # 2
    'Ice',             # 3
    'Snow/Slush',      # 4
    'Other',           # 5
    'Standing Water',  # 6
    'Sand/Mud/Dirt',   # 7
    'Oil',             # 8
])}

# LIGHTCOND
filt3 = {label: code for code, label in enumerate([
    'Unknown',                   # 0
    'Daylight',                  # 1
    'Dark - Street Lights On',   # 2
    'Dusk',                      # 3
    'Dawn',                      # 4
    'Dark - No Street Lights',   # 5
    'Dark - Street Lights Off',  # 6
    'Other',                     # 7
    'Dark - Unknown Lighting',   # 8
])}

### Applying filters to existing columns and creating new ones from them

In [13]:
# Encode each text column into a new integer "<NAME>_N" column.
#
# Series.map is used instead of Series.replace: replace() with a str -> int
# dict on an object column depends on pandas silently downcasting the result
# (deprecated in recent pandas) and would leave any label missing from the
# dict as a string in the "numeric" column. The explicit check below turns
# such a gap into an immediate, readable error instead.
for source_col, mapping in (('WEATHER', filt1), ('ROADCOND', filt2), ('LIGHTCOND', filt3)):
    unmapped_labels = set(df[source_col].dropna().unique()) - set(mapping)
    if unmapped_labels:
        raise ValueError(
            f"{source_col}: no integer code defined for {sorted(map(str, unmapped_labels))}"
        )
    df[source_col + '_N'] = df[source_col].map(mapping)

In [14]:
# Preview the frame with the three encoded *_N columns next to the text columns.
df.head()

Unnamed: 0,SEVERITYCODE,WEATHER,ROADCOND,LIGHTCOND,WEATHER_N,ROADCOND_N,LIGHTCOND_N
0,2,Overcast,Wet,Daylight,3,2,1
1,1,Raining,Wet,Dark - Street Lights On,2,2,2
2,1,Overcast,Dry,Daylight,3,1,1
3,1,Clear,Dry,Daylight,1,1,1
4,2,Raining,Wet,Daylight,2,2,1


In [15]:
# Inspect column dtypes; the *_N columns need to be numeric for the models below.
df.dtypes

SEVERITYCODE     int64
WEATHER         object
ROADCOND        object
LIGHTCOND       object
WEATHER_N        int64
ROADCOND_N       int64
LIGHTCOND_N      int64
dtype: object

In [16]:
# Class balance of the target before any resampling.
df['SEVERITYCODE'].value_counts()

1    132285
2     57052
Name: SEVERITYCODE, dtype: int64

As we can see, the counts for SEVERITYCODE 1 and 2 differ widely (132,285 vs. 57,052). Such an imbalance can bias the models toward the majority class, so we downsample the majority class to the size of the minority class.

### Downsampling

In [17]:
# Split the rows by class: code 1 is the majority class, code 2 the minority.
df_majority = df[df['SEVERITYCODE']==1]
df_minority = df[df['SEVERITYCODE']==2]

# Draw, without replacement, exactly as many majority rows as there are
# minority rows. The target size is derived from the data rather than
# hard-coded, so the classes stay balanced (and resample does not raise) if
# upstream cleaning changes the class counts. random_state keeps the draw
# reproducible.
df_maj_downsample = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=123,
)

# Stack the sampled majority rows on top of all minority rows and confirm
# that both classes now have the same number of records.
balanced_df = pd.concat([df_maj_downsample, df_minority])
balanced_df.SEVERITYCODE.value_counts()

2    57052
1    57052
Name: SEVERITYCODE, dtype: int64

In [18]:
# Columns available in the balanced frame (text columns plus their *_N encodings).
balanced_df.columns

Index(['SEVERITYCODE', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'WEATHER_N',
       'ROADCOND_N', 'LIGHTCOND_N'],
      dtype='object')

## Define X and y

In [19]:
# Feature matrix: the three integer-encoded condition columns as a 2-D array.
feature_columns = ['WEATHER_N', 'ROADCOND_N', 'LIGHTCOND_N']
X = balanced_df[feature_columns].to_numpy()
X[:5]

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [3, 1, 1],
       [2, 2, 2]], dtype=int64)

In [20]:
# Target vector: the severity code of every row in the balanced frame.
y = balanced_df['SEVERITYCODE'].to_numpy()
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

## Normalize the dataset

In [22]:
# Standardize each feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on ALL rows, i.e. before the train/test
# split, so test-set statistics influence the scaling (data leakage). The
# clean fix is to split first and fit the scaler on the training rows only.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X[:5]

array([[-0.51093777, -0.37723839, -0.43749442],
       [-0.51093777, -0.37723839, -0.43749442],
       [-0.51093777, -0.37723839, -0.43749442],
       [ 1.67561451, -0.37723839, -0.43749442],
       [ 0.58233837,  1.23207889,  0.73106905]])

## Train/Test Split

In [23]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Show the shape of the feature matrix and label vector of each split.
for split_label, split_X, split_y in (
    ('Train set: ', X_train, y_train),
    ('Test set: ', X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)

Train set:  (79872, 3) (79872,)
Test set:  (34232, 3) (34232,)


## K Nearest Neighbours (KNN)

In [24]:
from sklearn.neighbors import KNeighborsClassifier
# Number of neighbours that vote on each prediction. The value is fixed by
# hand; no tuning step for it appears in this notebook.
k = 25

In [25]:
# Build the k-nearest-neighbours classifier and fit it on the training split.
neigh = KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train, y_train)

# Predict the severity code of every test row and preview the first five.
Kyhat = neigh.predict(X_test)
Kyhat[:5]

array([1, 1, 2, 1, 1], dtype=int64)

## Logistic Regression

In [26]:
# Build and fit the logistic-regression model on the training split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# C is the inverse of the regularization strength (larger C = weaker penalty).
LR = LogisticRegression(solver='liblinear', C=6)
LR.fit(X_train, y_train)
LR

LogisticRegression(C=6, solver='liblinear')

In [27]:
# Hard class predictions (severity code 1 or 2) for every test row.
LRyhat = LR.predict(X_test)
LRyhat

array([1, 2, 2, ..., 1, 1, 2], dtype=int64)

In [28]:
# Per-row class probabilities: one column per class, ordered as in LR.classes_.
yhat_prob = LR.predict_proba(X_test)
yhat_prob

array([[0.52518893, 0.47481107],
       [0.48814809, 0.51185191],
       [0.43479349, 0.56520651],
       ...,
       [0.61396317, 0.38603683],
       [0.51189912, 0.48810088],
       [0.44791851, 0.55208149]])

## Results & Evaluation

Now checking the accuracy of the models

### K-Nearest Neighbor

In [29]:
from sklearn.metrics import jaccard_score as jss
# Jaccard score of the KNN predictions. The arguments below are the function's
# defaults, spelled out to make clear that this is the score for class 1 only
# (binary averaging), unlike the macro-averaged F1 used elsewhere.
jss(y_test, Kyhat, pos_label=1, average='binary')

0.3790632777735877

In [31]:
from sklearn.metrics import f1_score
# Macro F1 of the KNN predictions: unweighted mean of the per-class F1 scores.
f1_score(y_test,Kyhat,average='macro')

0.5167337855319522

In [35]:
from sklearn.metrics import accuracy_score
# Fraction of test rows that KNN classified correctly.
# NOTE: `acc` is reassigned by the logistic-regression accuracy cell further down.
acc= accuracy_score(y_test,Kyhat)
acc

0.5189880813274129

### Logistic regression

In [37]:
# Jaccard score of the logistic-regression predictions. The arguments below are
# the function's defaults, spelled out to make clear that this is the score for
# class 1 only (binary averaging).
jss(y_test, LRyhat, pos_label=1, average='binary')

0.4127661074298604

In [38]:
# Macro F1 of the logistic-regression predictions (unweighted mean over classes).
f1_score(y_test,LRyhat,average = 'macro')

0.4986449134746715

In [39]:
# Log loss of the logistic-regression class probabilities on the test split
# (lower is better).
from sklearn.metrics import log_loss
# Same call as in the earlier predict_proba cell, repeated here so the
# probabilities are computed right next to the metric that consumes them.
yhat_prob = LR.predict_proba(X_test)
log_loss(y_test,yhat_prob)

0.6888800273105687

In [40]:
# accuracy_score was already imported in the KNN accuracy cell; the repeated
# import is harmless.
from sklearn.metrics import accuracy_score
# Fraction of test rows that logistic regression classified correctly.
# NOTE: this overwrites the `acc` value computed for KNN above.
acc= accuracy_score(y_test,LRyhat)
acc

0.5132916569291891