In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

import warnings
warnings.filterwarnings("ignore") 

from datetime import datetime
from IPython.display import display, Markdown
# Run this cell to display all output within each cell in Jupyter Notebook, 
# instead of just the last statement
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
#Load the data
collisions_df = pd.read_csv('../data/Collisions.csv',
                            parse_dates={'Datetime': ['INCDTTM']}, 
                            infer_datetime_format=True)
intersections_df = pd.read_csv('../data/Intersections.csv')
streets_df = pd.read_csv('../data/Seattle_Streets.csv')

In [3]:
#Look at Collisions not occuring at Intersection & Only Including pedestrians or cyclists
pedorcycle_mask = collisions_df['SDOT_COLDESC'].map(lambda s: ('PEDALCYCLIST' in s) or ('PEDESTRIAN' in s) if type(s)==str else False)
block_mask = collisions_df['ADDRTYPE'] == 'Block'

collision_streets = collisions_df[block_mask & pedorcycle_mask]['LOCATION'].apply(lambda string: string.split(' BETWEEN ')[0])

In [4]:
collision_streets.value_counts()

RAINIER AVE S      172
AURORA AVE N       138
2ND AVE             90
S JACKSON ST        75
DEXTER AVE N        75
                  ... 
INTERLAKEN DR E      1
23RD AVE SW          1
SUNNYSIDE AVE N      1
NW 47TH ST           1
MAGNOLIA BLVD W      1
Name: LOCATION, Length: 720, dtype: int64

In [108]:
streets_df = streets_df[streets_df['STATUS'] == 'INSVC']

In [109]:
sg = streets_df.groupby('STNAME_ORD')
d = {}
d['arterial_class'] = sg['ARTCLASS'].median().astype(int)
d['speed_limit'] =  sg['SPEEDLIMIT'].median().astype(int)
d['slope_percentage'] = sg['SLOPE_PCT'].median()
d['transit_class'] = sg['TRANCLASS'].median().astype(int)
d['pavement_condition'] = sg['PVMTCONDINDX1'].median()

In [110]:
columns = ['ARTCLASS', 'SPEEDLIMIT', 'SLOPE_PCT', 'TRANCLASS', 'PVMTCONDINDX']
clean_streets = pd.DataFrame(data=d)
clean_streets.drop

Unnamed: 0_level_0,arterial_class,speed_limit,slope_percentage,transit_class,pavement_condition
STNAME_ORD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1 AV-W SEA BR WB ON RP,0,20,0.0,0,0.0
10TH AVE,0,20,1.0,0,47.0
10TH AVE E,2,25,1.0,2,17.0
10TH AVE NE,0,20,2.5,0,0.0
10TH AVE NW,0,20,2.0,0,68.0
...,...,...,...,...,...
YALE PL E,0,20,2.0,0,100.0
YALE TER E,0,20,2.5,0,35.0
YESLER WAY,2,25,2.5,3,50.5
YORK RD S,0,20,3.5,0,86.0


In [111]:
clean_streets['accidents'] = collision_streets.value_counts()

In [112]:
clean_streets['accidents'].fillna(value=0.0, inplace=True)

In [113]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics

In [114]:
base_error = ((clean_streets.accidents.mean() - clean_streets.accidents)**2).mean()
base_error

63.65802683854458

In [115]:
kf = KFold(n_splits=5, shuffle=True, random_state=100)

feats = ['arterial_class', 'speed_limit', 'transit_class']
clf = CatBoostRegressor(verbose=False)

mse = []
y = clean_streets.pop('accidents')
for i, (train_index, test_index) in enumerate(kf.split(clean_streets, y)):
    X_train, X_test = clean_streets.iloc[train_index], clean_streets.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    cat_pool = Pool(X_train, y_train, cat_features=feats)
    clf.fit(cat_pool)
    y_hat = clf.predict(X_test)
    means = np.full(shape=y_test.shape, fill_value=y_test.mean())
    theor = metrics.mean_squared_error(y_test, means)
    err = metrics.mean_squared_error(y_test, y_hat)
    print (clf.feature_importances_)
    print (f'base_mse: {round(theor, 3)}, actual: {round(err, 3)}')


<catboost.core.CatBoostRegressor at 0x13e4e1050>

base_mse: 46.813, actual: 40.495


<catboost.core.CatBoostRegressor at 0x13e4e1050>

base_mse: 108.199, actual: 88.478


<catboost.core.CatBoostRegressor at 0x13e4e1050>

base_mse: 40.099, actual: 42.38


<catboost.core.CatBoostRegressor at 0x13e4e1050>

base_mse: 46.822, actual: 43.304


<catboost.core.CatBoostRegressor at 0x13e4e1050>

base_mse: 76.076, actual: 70.779


In [116]:
clf.feature_importances_

array([13.1174007 , 20.51483   , 18.86600045, 29.39703442, 18.10473444])