In [126]:
import os
import pdb
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [127]:
# Encode
from sklearn.preprocessing import LabelBinarizer

# ML
from sklearn.tree import DecisionTreeClassifier

# Ensemble method
from sklearn.ensemble import RandomForestClassifier

# Split
from sklearn.model_selection import train_test_split

# Metric to evaluation
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Standard Scaler
from sklearn.preprocessing import StandardScaler

In [118]:
# What causes a more severe accident?
# Data dictionary https://www.seattle.gov/Documents/Departments/SDOT/GIS/Collisions_OD.pdf
X_org = pd.read_csv("/Users/ou/Projects/traffic_collisions_ml_team2/data/Collisions.csv")

In [119]:
# Unique id identify each accident
X_org['INCKEY'].duplicated().value_counts()
X_org['COLDETKEY'].duplicated().value_counts()

False    220436
Name: COLDETKEY, dtype: int64

In [120]:
# Drop 'LOCATION'
# Drop 'REPORTNO', 'STATUS', EXCEPTRSNCODE, EXCEPTRSNDESC ST_COLCODE
# Convert  INCDATE, INCDTTM
X = X_org[['SEVERITYDESC', 'ADDRTYPE', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'COLLISIONTYPE', 'PERSONCOUNT',
       'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES',
       'FATALITIES', 'INCDATE', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
       'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND',
       'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE',
       'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR']]

In [121]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder
num_col = ["PERSONCOUNT", "PEDCOUNT", "PEDCYLCOUNT", "VEHCOUNT", "INJURIES",
    "SERIOUSINJURIES", "FATALITIES"]
num_mask = X.columns.isin(num_col)
cat_col = X.columns[~num_mask].tolist()

In [125]:
# Fill missing values with 0
X.loc[:, num_col] = X.loc[:, num_col].apply(lambda x: x.astype(int).fillna(0))
X.loc[:, cat_col] = X.loc[:, cat_col].apply(lambda x: x.fillna('MISSING'))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


KeyError: "['SEVERITYDESC'] not in index"

In [123]:
# Create LabelEncoder object: le
le = LabelEncoder()
# Apply LabelEncoder to categorical columns
X.loc[:, cat_col] = X.loc[:, cat_col].apply(lambda x: le.fit_transform(x.astype(str)))

In [124]:
y = X['SEVERITYDESC'].to_frame().copy()
X = X.drop(['SEVERITYDESC'], axis=1).copy()

In [89]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

In [25]:
# Random forest classifier;
# This time Scikit-Learn did not have to run OvA or OvO 
# because Random Forest classifiers can directly classify instances into multiple classes.
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train.values.ravel())
y_pred = forest_clf.predict(X_test)

In [26]:
### Classification
## ask for precision, recall, accuracy, F1 Score
## Precision: precision = TP/ (TP + FP)
from sklearn.metrics import precision_score
precision_score = precision_score(y_test, y_pred, average=None)
print(precision_score)

[1.         0.99921618 0.99757843 0.99898271 0.98323944]


In [34]:
# See all numbers 
np.set_printoptions(threshold=sys.maxsize)

In [41]:
# See prediction v.s. test
df = pd.DataFrame(data=y_pred, columns=['pred'])
compare = df.join(y_test.reset_index(), how='outer').drop(['index'], axis=1)
compare['diff'] = compare['pred'] - compare['SEVERITYDESC']

In [69]:
## recall/ sensitivity/ true positive rate = TP/ (TP + FN)
from sklearn.metrics import recall_score
recall_score = recall_score(y_test, y_pred, average=None)
print(recall_score)

[0.98019802 0.99994771 0.99738082 0.98495486 0.98476513]


In [72]:
## accuracy = (TP + TN)/ Total
from sklearn.metrics import accuracy_score
accuracy_score = accuracy_score(y_test, y_pred)
print(accuracy_score)

0.9966320246343341


In [71]:
## F1 Score = F = 2/ (1/precision + 1/recall)
from sklearn.metrics import f1_score
f1_score = f1_score(y_test, y_pred, average=None)
print(f1_score)

[0.99       0.99958181 0.99747961 0.99191919 0.98400169]


In [None]:
# One-vs-All

'''
This strategy, also known as one-vs-all, is implemented in 
OneVsRestClassifier. The strategy consists in fitting one classifier 
per class. For each classifier, the class is fitted against all the 
other classes. In addition to its computational efficiency 
(only n_classes classifiers are needed), one advantage of this 
approach is its interpretability. Since each class is represented 
by one and only one classifier, it is possible to gain knowledge 
about the class by inspecting its corresponding classifier. 
This is the most commonly used strategy and is a fair default choice.
'''

In [111]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
y_pred1 = OneVsRestClassifier(SGDClassifier(random_state=42)).fit(X_train, y_train).predict(X_test)

In [112]:
precision_score = precision_score(y_test, y_pred1, average=None)
print(precision_score)
recall_score = recall_score(y_test, y_pred1, average=None)
print(recall_score)
accuracy_score = accuracy_score(y_test, y_pred1)
print(accuracy_score)
f1_score = f1_score(y_test, y_pred1, average=None)
print(f1_score)

TypeError: 'numpy.ndarray' object is not callable

In [92]:
# One-vs-One

'''
OneVsOneClassifier constructs one classifier per pair of classes. 
At prediction time, the class which received the most votes is selected. 
In the event of a tie (among two classes with an equal number of votes), 
it selects the class with the highest aggregate classification confidence 
by summing over the pair-wise classification 
confidence levels computed by the underlying binary classifiers.
'''

In [109]:
from sklearn.multiclass import OneVsOneClassifier

y_pred2 = OneVsOneClassifier(SGDClassifier(random_state=42)).fit(X_train, y_train.values.ravel()).predict(X_test)

In [100]:
precision_score = precision_score(y_test, y_pred2, average=None)
print(precision_score)
recall_score = recall_score(y_test, y_pred2, average=None)
print(recall_score)
accuracy_score = accuracy_score(y_test, y_pred2)
print(accuracy_score)
f1_score = f1_score(y_test, y_pred2, average=None)
print(f1_score)

TypeError: 'numpy.ndarray' object is not callable

In [104]:
y_test.__class__

pandas.core.frame.DataFrame

In [105]:
df.__class__

pandas.core.frame.DataFrame

In [108]:
y_pred1.__class__

numpy.ndarray