In [2]:
import os
import pdb
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Encode
from sklearn.preprocessing import LabelBinarizer

# ML
from sklearn.tree import DecisionTreeClassifier

# Ensemble method
from sklearn.ensemble import RandomForestClassifier

# Split
from sklearn.model_selection import train_test_split

# Metric to evaluation
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Standard Scaler
from sklearn.preprocessing import StandardScaler

In [53]:
# What causes a more severe accident?
# Data dictionary https://www.seattle.gov/Documents/Departments/SDOT/GIS/Collisions_OD.pdf
X_org = pd.read_csv("/Users/ou/Projects/traffic_collisions_ml_team2/data/Collisions.csv")

In [54]:
# Unique id identify each accident
X_org['INCKEY'].duplicated().value_counts()
X_org['COLDETKEY'].duplicated().value_counts()

False    220436
Name: COLDETKEY, dtype: int64

In [126]:
# Drop 'LOCATION'
# Drop 'REPORTNO', 'STATUS', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'ST_COLCODE'
# Convert INCDATE, INCDTTM
X = X_org[['SEVERITYDESC', 'ADDRTYPE', 'COLLISIONTYPE', 'PERSONCOUNT',
       'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES',
       'FATALITIES', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
       'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND',
       'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE',
       'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR']]

In [56]:
for i in X.columns: print(X[i].value_counts())

Property Damage Only Collision    137026
Injury Collision                   58472
Unknown                            21510
Serious Injury Collision            3082
Fatality Collision                   346
Name: SEVERITYDESC, dtype: int64
Block           144344
Intersection     71516
Alley              869
Name: ADDRTYPE, dtype: int64
Parked Car    48364
Angles        35366
Rear Ended    34545
Other         24450
Sideswipe     18812
Left Turn     14047
Pedestrian     7637
Cycles         5893
Right Turn     2998
Head On        2172
Name: COLLISIONTYPE, dtype: int64
2     118466
3      36376
0      24928
4      14965
1      13827
5       6757
6       2782
7       1178
8        546
9        226
10       133
11        59
12        35
13        22
14        22
15        11
17        11
16         8
44         6
20         6
25         6
18         6
19         6
22         5
29         4
26         4
23         3
32         3
47         3
27         3
28         3
37         3
34         3
2

In [127]:
# INCDATE
# INCDTTM
X.INCDTTM = pd.to_datetime(X.INCDTTM).copy()
X.loc[:, 'year'] = X.INCDTTM.dt.year
X.loc[:, 'month'] = X.INCDTTM.dt.month
X.loc[:, 'hour'] = X.INCDTTM.dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [128]:
# Delete INCDTTM
X = X.drop(['INCDTTM'], axis=1).copy()
X.columns

Index(['SEVERITYDESC', 'ADDRTYPE', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES', 'FATALITIES',
       'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC', 'INATTENTIONIND',
       'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT',
       'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY',
       'CROSSWALKKEY', 'HITPARKEDCAR', 'year', 'month', 'hour'],
      dtype='object')

In [65]:
# Make 'UNDERINFL' consistent
pd.crosstab(X['UNDERINFL'], X['year']) 

year,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
UNDERINFL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,11517,14621,14728,13998,13226,11331,10376,10421,9812,9687,11293,12532,10584,10212,9496,8403,2439
1,600,728,785,717,647,627,627,650,627,472,526,444,546,460,612,560,0


In [129]:
X.loc[X['UNDERINFL']=="Y", 'UNDERINFL'] = "1"
X.loc[X['UNDERINFL']=="N", 'UNDERINFL'] = "0"

In [67]:
## Drop EXCEPTRSNCODE, EXCEPTRSNDESC
#X['EXCEPTRSNCODE'].value_counts()
#X['EXCEPTRSNDESC'].value_counts()
#X[['EXCEPTRSNDESC', 'EXCEPTRSNCODE']].head()

In [130]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder
num_col = ["PERSONCOUNT", "PEDCOUNT", "PEDCYLCOUNT", "VEHCOUNT", "INJURIES",
    "SERIOUSINJURIES", "FATALITIES"]
num_mask = X.columns.isin(num_col)
cat_col = X.columns[~num_mask].tolist()

In [131]:
# Fill missing values with 0
X.loc[:, num_col] = X.loc[:, num_col].apply(lambda x: x.astype(int).fillna(0))
X.loc[:, cat_col] = X.loc[:, cat_col].apply(lambda x: x.fillna('MISSING'))

In [77]:
# std doesn't look too high to me
X[num_col].describe()

Unnamed: 0,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,FATALITIES
count,220436.0,220436.0,220436.0,220436.0,220436.0,220436.0,220436.0
mean,2.227145,0.038156,0.02731,1.730697,0.373868,0.015165,0.001674
std,1.471406,0.201881,0.164398,0.829041,0.732352,0.158077,0.044493
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,2.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,2.0,0.0,0.0,0.0
75%,3.0,0.0,0.0,2.0,1.0,0.0,0.0
max,93.0,6.0,2.0,15.0,78.0,41.0,5.0


In [132]:
# Create LabelEncoder object: le
le = LabelEncoder()
# Apply LabelEncoder to categorical columns
X.loc[:, cat_col] = X.loc[:, cat_col].apply(lambda x: le.fit_transform(x.astype(str)))

In [133]:
y = X['SEVERITYDESC'].to_frame().copy()
drop = ['INJURIES', 'SERIOUSINJURIES', 'FATALITIES', 'SEVERITYDESC']
X = X.drop(drop, axis=1).copy()

In [134]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

In [135]:
# Random forest classifier;
# This time Scikit-Learn did not have to run OvA or OvO 
# because Random Forest classifiers can directly classify instances into multiple classes.
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train.values.ravel())
y_pred = forest_clf.predict(X_test)

In [136]:
### Classification
## ask for precision, recall, accuracy, F1 Score
## Precision: precision = TP/ (TP + FP)
from sklearn.metrics import precision_score
precision_score1 = precision_score(y_test, y_pred, average=None)
print(precision_score1)

[0.         0.55202788 0.78241617 0.07586207 0.98063973]


In [137]:
# See all numbers 
np.set_printoptions(threshold=sys.maxsize)

In [138]:
# See prediction v.s. test
df = pd.DataFrame(data=y_pred, columns=['pred'])
compare = df.join(y_test.reset_index(), how='outer').drop(['index'], axis=1)
compare['diff'] = compare['pred'] - compare['SEVERITYDESC']

In [139]:
## recall/ sensitivity/ true positive rate = TP/ (TP + FN)
from sklearn.metrics import recall_score
recall_score1 = recall_score(y_test, y_pred, average=None)
print(recall_score1)

[0.         0.45552476 0.85557072 0.0110331  0.9860347 ]


In [140]:
## accuracy = (TP + TN)/ Total
from sklearn.metrics import accuracy_score
accuracy_score1 = accuracy_score(y_test, y_pred)
print(accuracy_score1)

0.7503574177939074


In [141]:
## F1 Score = F = 2/ (1/precision + 1/recall)
from sklearn.metrics import f1_score
f1_score1 = f1_score(y_test, y_pred, average=None)
print(f1_score1)

[0.         0.4991548  0.81735986 0.01926445 0.98332982]


In [119]:
# Important features: 
col = ['ADDRTYPE', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT',
       'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES', 'FATALITIES', 'JUNCTIONTYPE',
       'SDOT_COLCODE', 'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL',
       'WEATHER', 'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM',
       'SPEEDING', 'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY',
       'HITPARKEDCAR', 'year', 'month', 'hour']
rank = pd.DataFrame(data={'colname':col, 'importance':forest_clf.feature_importances_})

In [123]:
# Variables should be removed:  INJURIES, SERIOUSINJURIES, FATALITIES
rank.sort_values('importance', ascending=False) 

Unnamed: 0,colname,importance
6,INJURIES,0.599976
2,PERSONCOUNT,0.075656
5,VEHCOUNT,0.05929
13,UNDERINFL,0.052695
7,SERIOUSINJURIES,0.042817
16,LIGHTCOND,0.032401
20,ST_COLCODE,0.019784
1,COLLISIONTYPE,0.014893
21,ST_COLDESC,0.014714
11,SDOT_COLDESC,0.01336


In [125]:
X['INJURIES'].value_counts()

0     158768
1      47040
2      10644
3       2722
4        812
5        272
6        100
7         40
8         12
9         10
10         6
11         5
13         2
78         1
15         1
12         1
Name: INJURIES, dtype: int64

In [110]:
# Set threshold
mask_feature = forest_clf.feature_importances_ > 0.1
# Apply the mask to the feature dataset X
reduced_X = X.loc[:, mask_feature]

Unnamed: 0,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,JUNCTIONTYPE,SDOT_COLCODE,SDOT_COLDESC,UNDERINFL,WEATHER,LIGHTCOND,ST_COLCODE,ST_COLDESC,year,hour
0,8,2,1,0,1,1,0,1,3,3,0,7,2,17,53,1,2
1,5,2,0,0,0,0,0,5,12,11,2,5,7,0,14,9,0
2,0,4,0,0,2,0,0,1,1,7,0,9,5,23,34,9,19
3,2,2,0,0,2,0,0,1,1,7,0,7,5,34,8,9,15
4,5,0,0,0,0,0,0,5,1,7,2,5,7,0,14,9,0
5,5,2,0,0,0,1,0,5,1,7,2,5,7,0,14,9,0
6,6,2,0,0,2,0,0,5,34,10,0,7,5,9,6,9,15
7,5,2,0,0,0,0,0,5,1,7,2,5,7,0,14,9,0
8,6,2,0,0,2,0,0,4,34,10,0,1,5,7,17,1,12
9,4,4,0,0,2,0,0,1,1,7,0,7,5,15,4,1,0


In [88]:
# One-vs-All: SGDClassifier

'''
This strategy, also known as one-vs-all, is implemented in 
OneVsRestClassifier. The strategy consists in fitting one classifier 
per class. For each classifier, the class is fitted against all the 
other classes. In addition to its computational efficiency 
(only n_classes classifiers are needed), one advantage of this 
approach is its interpretability. Since each class is represented 
by one and only one classifier, it is possible to gain knowledge 
about the class by inspecting its corresponding classifier. 
This is the most commonly used strategy and is a fair default choice.
'''

'\nThis strategy, also known as one-vs-all, is implemented in \nOneVsRestClassifier. The strategy consists in fitting one classifier \nper class. For each classifier, the class is fitted against all the \nother classes. In addition to its computational efficiency \n(only n_classes classifiers are needed), one advantage of this \napproach is its interpretability. Since each class is represented \nby one and only one classifier, it is possible to gain knowledge \nabout the class by inspecting its corresponding classifier. \nThis is the most commonly used strategy and is a fair default choice.\n'

In [89]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
y_pred1 = OneVsRestClassifier(SGDClassifier(random_state=42)).fit(X_train, y_train).predict(X_test)

In [90]:
precision_score2 = precision_score(y_test, y_pred1, average=None)
print(precision_score2)
recall_score2 = recall_score(y_test, y_pred1, average=None)
print(recall_score2)
accuracy_score2 = accuracy_score(y_test, y_pred1)
print(accuracy_score2)
f1_score2 = f1_score(y_test, y_pred1, average=None)
print(f1_score2)

[0.         0.26640303 0.86678944 0.         0.16788321]
[0.         0.98284788 0.03107805 0.         0.01297785]
0.27904706917408995
[0.         0.41918505 0.06000467 0.         0.02409323]


In [91]:
# One-vs-One: SGDClassifier

'''
OneVsOneClassifier constructs one classifier per pair of classes. 
At prediction time, the class which received the most votes is selected. 
In the event of a tie (among two classes with an equal number of votes), 
it selects the class with the highest aggregate classification confidence 
by summing over the pair-wise classification 
confidence levels computed by the underlying binary classifiers.
'''

'\nOneVsOneClassifier constructs one classifier per pair of classes. \nAt prediction time, the class which received the most votes is selected. \nIn the event of a tie (among two classes with an equal number of votes), \nit selects the class with the highest aggregate classification confidence \nby summing over the pair-wise classification \nconfidence levels computed by the underlying binary classifiers.\n'

In [92]:
from sklearn.multiclass import OneVsOneClassifier

y_pred2 = OneVsOneClassifier(SGDClassifier(random_state=42)).fit(X_train, y_train.values.ravel()).predict(X_test)

In [93]:
precision_score2 = precision_score(y_test, y_pred2, average=None)
print(precision_score2)
recall_score2 = recall_score(y_test, y_pred2, average=None)
print(recall_score2)
accuracy_score2 = accuracy_score(y_test, y_pred2)
print(accuracy_score2)
f1_score2 = f1_score(y_test, y_pred2, average=None)
print(f1_score2)

[0.         0.72255193 0.67040942 0.07267645 0.58532024]
[0.         0.02546672 0.94750627 0.10431294 0.52983496]
0.6515451446167382
[0.         0.04919937 0.78522896 0.08566722 0.55619725]


In [104]:
y_test.__class__

pandas.core.frame.DataFrame

In [105]:
df.__class__

pandas.core.frame.DataFrame

In [108]:
y_pred1.__class__

numpy.ndarray