In [69]:
# Import Modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#modeling tools
import statsmodels.api as sm

import lightgbm as lgb

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, auc, roc_auc_score, roc_curve, log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

%matplotlib inline
plt.style.use('dark_background')

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

pd.set_option('display.min_row', 15)
pd.set_option('display.max_column', 100)
pd.set_option('display.max_colwidth', 300)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
clean_data = pd.read_pickle('../data/processed/cleaned_data.pkl')
intersection_n = pd.read_pickle('../data/processed/intersection_w_normalization.pkl')

# crosswalks = pd.read_csv('../data/Marked_Crosswalks.csv')
circles = pd.read_csv('../data/Traffic_Circles.csv')
intersections = pd.read_csv('../data/Intersections.csv')

## Intersections

In [71]:
# lower case the column names

intersections.columns = intersections.columns.map(str.lower)
clean_data.columns = clean_data.columns.map(str.lower)

In [72]:
# COMPKEY in Intersections is INTKEY

intersections.rename(columns={'compkey': 'intkey'}, inplace=True)
intersections.columns

Index(['x', 'y', 'objectid', 'intr_id', 'gis_xcoord', 'gis_ycoord', 'intkey',
       'comptype', 'unitid', 'subarea', 'unitdesc', 'arterialclasscd',
       'signal_maint_dist', 'signal_type', 'shape_lng', 'shape_lat'],
      dtype='object')

#### merge intersections with collisions

In [73]:
# Used to make sure number of rows are correct when merging

len(np.intersect1d(intersections['intkey'], clean_data['intkey']))

7576

In [74]:
# Create dataframe with only intersections that are also in the collisions data

intersections_accidents = clean_data[clean_data['intkey'].isin(intersections['intkey'])].copy()

In [75]:
intersections_accidents.head(3)

Unnamed: 0_level_0,x,y,objectid,inckey,addrtype,intkey,location,personcount,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,incdate,junctiontype,sdot_colcode,underinfl,st_colcode,crosswalkkey,speeding_y,inattentionind_y,hitparkedcar_y,pedrownotgrnt_y,weather_adverse,weather_good,weather_unknown,roadcond_adverse,roadcond_dry,roadcond_unknown,lightcond_dark,lightcond_daylight,lightcond_unknown,lightcond_verydark,severitycode_injury,severitycode_propertydamage,severitycode_unknown
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
2004-01-01,-122.32078,47.614076,14172,26463,Intersection,29745.0,BROADWAY AND E PIKE ST,4,0,0,2,1,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,0,28,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0
2004-01-01,-122.255895,47.509705,13453,27093,Intersection,38228.0,62ND AVE S AND S RYAN ST,3,0,0,2,1,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,0,10,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0
2004-01-01,-122.348859,47.619744,13420,26978,Intersection,28934.0,4TH AVE N AND BROAD ST,2,0,0,2,0,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,0,10,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [76]:
# reset index to extract year
intersections_accidents.reset_index(inplace=True)
intersections_accidents['year'] = intersections_accidents['Datetime'].apply(lambda x: x.year)
intersections_accidents['month'] = intersections_accidents['Datetime'].apply(lambda x: x.month)
intersections_accidents['hour'] = intersections_accidents['Datetime'].apply(lambda x: x.hour)

In [77]:
intersections_accidents['addrtype'].value_counts()

Intersection    65481
Name: addrtype, dtype: int64

In [78]:
intersections_accidents.columns

Index(['Datetime', 'x', 'y', 'objectid', 'inckey', 'addrtype', 'intkey',
       'location', 'personcount', 'pedcount', 'pedcylcount', 'vehcount',
       'injuries', 'seriousinjuries', 'fatalities', 'incdate', 'junctiontype',
       'sdot_colcode', 'underinfl', 'st_colcode', 'crosswalkkey', 'speeding_y',
       'inattentionind_y', 'hitparkedcar_y', 'pedrownotgrnt_y',
       'weather_adverse', 'weather_good', 'weather_unknown',
       'roadcond_adverse', 'roadcond_dry', 'roadcond_unknown',
       'lightcond_dark', 'lightcond_daylight', 'lightcond_unknown',
       'lightcond_verydark', 'severitycode_injury',
       'severitycode_propertydamage', 'severitycode_unknown', 'year', 'month',
       'hour'],
      dtype='object')

In [79]:
'''
KEEP COLUMNS: [['intkey', 'year', 'pedcount', 'pedcylcount', 
'vehcount', 'injuries', 'seriousinjuries', 'fatalities', 'inattentionind_y']]
'''

temp = intersections_accidents[['intkey', 'year', 'month', 'hour', 'pedcount', 'pedcylcount', 'vehcount', 
                         'injuries', 'seriousinjuries', 'fatalities', 'inattentionind_y']].copy()

In [80]:
# Group by the intkey, year, month and hour to sum collisions of each intersection over the years

int_acc = temp.groupby(['intkey', 'year', 'month', 'hour']).sum().reset_index()

In [81]:
inter_collisions = intersections.merge(int_acc, how='left', on='intkey')

## EDA and clean data

In [82]:
inter_collisions.shape

(72699, 26)

In [83]:
inter_collisions.columns

Index(['x', 'y', 'objectid', 'intr_id', 'gis_xcoord', 'gis_ycoord', 'intkey',
       'comptype', 'unitid', 'subarea', 'unitdesc', 'arterialclasscd',
       'signal_maint_dist', 'signal_type', 'shape_lng', 'shape_lat', 'year',
       'month', 'hour', 'pedcount', 'pedcylcount', 'vehcount', 'injuries',
       'seriousinjuries', 'fatalities', 'inattentionind_y'],
      dtype='object')

In [84]:
# is intr_id connected to another dataset?

to_drop = ['objectid', 'x', 'y', 'gis_xcoord', 'gis_ycoord', 'comptype', 'unitid', 'signal_maint_dist']

inter_collisions.drop(columns=to_drop, inplace=True)

In [85]:
inter_collisions.isnull().sum()

intr_id                0
intkey                 0
subarea                0
unitdesc               0
arterialclasscd       20
signal_type            0
shape_lng              0
shape_lat              0
year                7865
month               7865
hour                7865
pedcount            7865
pedcylcount         7865
vehcount            7865
injuries            7865
seriousinjuries     7865
fatalities          7865
inattentionind_y    7865
dtype: int64

We will fill the arterial class missing values first before apply 0 for the rest of the missing values.

In [86]:
inter_collisions[inter_collisions['arterialclasscd'].isnull()].head(5)

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,year,month,hour,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
218,18220,286961,GRDWM,DUWAMISH RIVER TRL AND S HENDERSON ST,,NONE,-122.320599,47.523059,,,,,,,,,,
4015,14981,164765,SE,31ST AVE S AND DEAD END 1,,NONE,-122.292444,47.567429,,,,,,,,,,
12166,12413,30483,DWNTN,3RD AVE AND XW JEFFERSON,,MIDXWALK,-122.330756,47.602056,2018.0,6.0,11.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
36231,46498,36645,N,NE 125TH ST AND XW 26NE-27NE,,MIDXWALK,-122.299554,47.719303,2006.0,5.0,17.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
51018,46441,288331,E,E JAMES WAY AND XW 10TH AVE,,MIDXWALK,-122.319314,47.607829,2005.0,10.0,17.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


In [87]:
inter_collisions[inter_collisions['signal_type'] == 'MIDXWALK']['arterialclasscd'].mode()

0    1.0
dtype: float64

In [88]:
# Dead ends almost always have arterialclasscd of 0
# DUWAMISH RIVER TRL all have 0
# unitdesc with XW is a 1. MIDXWALK as signal type is typically 1
# 33RD AVE E with CNTRL is 0
# FAIRVIEW is all 1

inter_collisions[inter_collisions['unitdesc'].str.contains('FAIRVIEW AVE N')].groupby(['subarea', 'arterialclasscd']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,intr_id,intkey,unitdesc,signal_type,shape_lng,shape_lat,year,month,hour,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
subarea,arterialclasscd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
LKUN,1.0,276,276,276,276,276,276,276,276,276,276,276,276,276,276,276,276


In [89]:
def create_mask(string):
    mask = (inter_collisions['unitdesc'].str.contains(string)) & (inter_collisions['arterialclasscd'].isnull())
    return mask

In [90]:
mask_de = create_mask('DEAD END')
mask_xw = create_mask('XW')
mask_duwamish = create_mask('DUWAMISH RIVER TRL')
mask_fairview = create_mask('FAIRVIEW AVE N')
mask_33 = create_mask('33RD AVE E')

inter_collisions.loc[mask_de, 'arterialclasscd'] = 0
inter_collisions.loc[mask_xw, 'arterialclasscd'] = 1
inter_collisions.loc[mask_duwamish, 'arterialclasscd'] = 0
inter_collisions.loc[mask_fairview, 'arterialclasscd'] = 1
inter_collisions.loc[mask_33, 'arterialclasscd'] = 0

In [91]:
inter_collisions[inter_collisions['arterialclasscd'].isnull()]

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,year,month,hour,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y


In [92]:
# Proceed to fill nan with 0 for all the ones that didn't have a collision

inter_collisions.fillna(0, inplace=True)

In [93]:
# inter_collisions.to_csv('../data/processed/inter_collisions.csv')

## Removing Traffic Circles

In [94]:
circles.columns = circles.columns.map(str.lower)

In [95]:
cir_inter = np.intersect1d(circles['intkey'], inter_collisions['intkey'])

In [96]:
# dataset accounting for removing ones that are traffic circles

inter_nocircles = inter_collisions[~(inter_collisions['intkey'].isin(circles['intkey']))].copy()

In [97]:
inter_nocircles.shape

(70793, 18)

In [100]:
# about 89.18% of intersections have had accidents within 2004-2020, 16 year periods
# THIS STILL CONTAINS TRAFFIC CIRCLES

print('Percent of intersections (including traffic circles) with incidents: ',
      round((inter_collisions['year'] != 0).sum() / (len(inter_collisions['year']))*100 , 2),'%')

# about 89.54% of intersections have had accidents within 2004-2020, 16 year periods
# THIS DOES NOT CONTAIN TRAFFIC CIRCLES

print('Percent of intersections (excluding traffic circles) with incidents: ',
      round((inter_nocircles['year'] != 0).sum() / (len(inter_nocircles['year']))*100 , 2),'%')

Percent of intersections (including traffic circles) with incidents:  89.18 %
Percent of intersections (excluding traffic circles) with incidents:  89.54 %


Percent of intersections having collisions went up by a small amount after removing traffic circles data from the intersections data.

In [103]:
inter_nocircles.shape

(70793, 18)

In [104]:
inter_collisions.shape

(72699, 18)

In [105]:
# inter_nocircles.to_csv('../data/processed/inter_nocircles.csv')