In [33]:
import os
import pdb
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Encode
from sklearn.preprocessing import LabelBinarizer

# ML
from sklearn.tree import DecisionTreeClassifier

# Ensemble method
from sklearn.ensemble import RandomForestClassifier

# Split
from sklearn.model_selection import train_test_split

# Evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Standard Scaler
from sklearn.preprocessing import StandardScaler

In [17]:
# Research question: what causes a more severe accident?
# Data dictionary: https://www.seattle.gov/Documents/Departments/SDOT/GIS/Collisions_OD.pdf
# The CSV location can be overridden with the COLLISIONS_CSV environment variable so the
# notebook can run on other machines; the default is the original hard-coded path.
COLLISIONS_CSV = os.environ.get(
    "COLLISIONS_CSV",
    "/Users/ou/Projects/traffic_collisions_ml_team2/data/Collisions.csv",
)
X_org = pd.read_csv(COLLISIONS_CSV)

In [18]:
# INCKEY and COLDETKEY are each expected to uniquely identify an accident.
# Count duplicated values per key column; a result containing only `False` means
# the column has no repeated values.
# Only the last expression of a cell is rendered, so the first check is printed
# explicitly instead of being silently discarded.
print(X_org['INCKEY'].duplicated().value_counts())
X_org['COLDETKEY'].duplicated().value_counts()

False    220436
Name: COLDETKEY, dtype: int64

In [19]:
# Keep only the columns used further down. 'LOCATION', 'REPORTNO' and 'STATUS' are
# excluded simply by not being listed here.
# NOTE(review): the original notes also planned to drop EXCEPTRSNCODE, EXCEPTRSNDESC and
# ST_COLCODE and to convert INCDATE / INCDTTM, but these columns are still selected
# as-is below - confirm which is intended.
selected_columns = [
    'SEVERITYDESC', 'ADDRTYPE', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'COLLISIONTYPE', 'PERSONCOUNT',
    'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES',
    'FATALITIES', 'INCDATE', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
    'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND',
    'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE',
    'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR',
]
# .copy() gives X its own data, so the column assignments in later cells modify X itself
# rather than a slice of X_org (avoids pandas' SettingWithCopyWarning).
X = X_org[selected_columns].copy()

In [20]:
# LabelEncoder is used further down to integer-encode the categorical columns.
from sklearn.preprocessing import LabelEncoder

# Count columns are treated as numeric; every other column of X is treated as categorical.
num_col = [
    "PERSONCOUNT",
    "PEDCOUNT",
    "PEDCYLCOUNT",
    "VEHCOUNT",
    "INJURIES",
    "SERIOUSINJURIES",
    "FATALITIES",
]
# Boolean mask over X.columns: True where the column is one of the numeric ones.
num_mask = X.columns.isin(num_col)
# Remaining column names, in their original order.
cat_col = [name for name, is_numeric in zip(X.columns, num_mask) if not is_numeric]

In [21]:
# Replace missing values: 0 for the count columns, the literal 'MISSING' for all others.
# fillna must run before astype(int): casting a column that still contains NaN to int
# raises, and a fillna placed after the cast can never find anything to fill.
X[num_col] = X[num_col].fillna(0).astype(int)
X[cat_col] = X[cat_col].fillna('MISSING')

In [22]:
# Integer-encode every categorical column.
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the integer codes
# can be mapped back to the original labels later via
# label_encoders[column].inverse_transform(...). Refitting a single shared encoder per
# column would keep only the mapping of the last column.
label_encoders = {}
for col in cat_col:
    le = LabelEncoder()
    # Values are cast to str first so each column is encoded from a single, uniform type.
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last column of cat_col.

# Print the head of the label-encoded categorical columns
print(X[cat_col].head())

   SEVERITYDESC  ADDRTYPE  EXCEPTRSNCODE  EXCEPTRSNDESC  COLLISIONTYPE  \
0             1         2              1              0              7   
1             2         1              2              1              4   
2             2         2              0              0              0   
3             2         2              0              0             10   
4             4         1              0              0              4   

   INCDATE  INCDTTM  JUNCTIONTYPE  SDOT_COLCODE  SDOT_COLDESC  ...  ROADCOND  \
0      288    16775             1            11            11  ...         0   
1     5866     7210             5             2            19  ...         2   
2     5849    12947             1             1            15  ...         9   
3     5888    59502             1             1            15  ...         0   
4     5883    69282             5             1            15  ...         2   

   LIGHTCOND  PEDROWNOTGRNT  SDOTCOLNUM  SPEEDING  ST_COLCODE  ST_COLDESC 

In [23]:
# Separate the target (encoded SEVERITYDESC, kept as a one-column DataFrame) from the features.
# Note: X is rebound here, so this cell cannot be re-run without re-running the cells above.
# NOTE(review): INJURIES, SERIOUSINJURIES and FATALITIES stay in the feature set; they
# presumably determine the severity label directly (target leakage) - confirm against the
# data dictionary before interpreting the scores.
y = X[['SEVERITYDESC']].copy()
X = X.drop(columns=['SEVERITYDESC']).copy()

In [24]:
# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [25]:
# Fit a random forest on the training split, then predict the encoded severity class
# for every row of the test split.
# y_train is a one-column DataFrame; it is flattened to a 1-D array of labels for fit().
forest_clf = RandomForestClassifier(random_state=42).fit(
    X_train,
    y_train.to_numpy().ravel(),
)
y_pred = forest_clf.predict(X_test)

In [26]:
### Classification metrics
## Precision: precision = TP / (TP + FP)
## average=None returns one precision value per class instead of a single averaged score.
## TODO: recall, accuracy and F1 score are also wanted but are not computed in this cell.
# The result is stored under its own name: assigning it to `precision_score` would shadow
# the sklearn function imported at the top of the notebook and break any later call to it.
precision_per_class = precision_score(y_test, y_pred, average=None)
print(precision_per_class)

[1.         0.99921618 0.99757843 0.99898271 0.98323944]


In [34]:
# Print NumPy arrays in full instead of summarising long ones with "...".
# `threshold` is the element count above which NumPy switches to the summarised form;
# sys.maxsize effectively disables summarisation for the rest of the session.
np.set_printoptions(threshold=sys.maxsize)

In [41]:
# Put predictions and true test labels side by side, aligned by position.
# y_test still carries the row labels of the original frame, so its index is reset to
# 0..n-1 (the old index is discarded) before joining with the positional predictions.
df = pd.DataFrame({'pred': y_pred})
compare = df.join(y_test.reset_index(drop=True), how='outer')
# diff == 0 means the prediction matches the true class.
# NOTE(review): the class codes come from LabelEncoder, so the size/sign of a non-zero
# diff is presumably not meaningful - only zero vs. non-zero is. Confirm.
compare['diff'] = compare['pred'] - compare['SEVERITYDESC']

Unnamed: 0,pred,SEVERITYDESC,diff
0,2,2,0
1,2,2,0
2,2,2,0
3,2,2,0
4,2,2,0


 0    72499
-2      122
 2      121
-1        1
 1        1
Name: diff, dtype: int64