In [2]:

# Import Modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#modeling tools
import statsmodels.api as sm

import lightgbm as lgb

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, auc, roc_auc_score, roc_curve, log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

%matplotlib inline
plt.style.use('dark_background')

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

pd.set_option('display.min_row', 15)
pd.set_option('display.max_column', 100)
pd.set_option('display.max_colwidth', 300)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
streets = pd.read_csv('../data/Seattle_Streets.csv')
clean_data = pd.read_pickle('../data/processed/cleaned_data.pkl')
crosswalks = pd.read_csv('../data/Marked_Crosswalks.csv')

In [6]:
crosswalks.columns = crosswalks.columns.map(str.lower)
clean_data.columns = clean_data.columns.map(str.lower)

In [7]:
crosswalks.columns

Index(['x', 'y', 'objectid', 'compkey', 'unitid', 'unitdesc', 'condition',
       'condition_assessment_date', 'ownership', 'current_status',
       'primarydistrictcd', 'secondarydistrictcd', 'overrideyn', 'comptype',
       'segkey', 'unittype', 'old_id', 'approach', 'marking_type', 'school',
       'midblock_crosswalk', 'install_date', 'color', 'comments', 'category',
       'ownership_date', 'current_status_date', 'maintained_by',
       'maintenance_agreement', 'curbspaceid', 'maint_district',
       'overridecomment', 'shape_lng', 'shape_lat', 'crosswalk_control',
       'material', 'onstreet', 'xstrlow', 'meas_from_low', 'disttolow',
       'stpoint', 'xstrhi', 'meas_from_hi', 'disttohi', 'offset', 'side',
       'measurement_origin', 'attachment_1', 'attachment_2', 'attachment_3',
       'attachment_4', 'attachment_5', 'attachment_6', 'attachment_7',
       'attachment_8', 'attachment_9', 'maintenance_group', 'num_attachments'],
      dtype='object')

In [17]:
# rename so the column is named crosswalkkey in both datasets

crosswalks.rename(columns={'compkey': 'crosswalkkey'}, inplace=True)

In [18]:
print(crosswalks.shape)
crosswalks.head(3)

(5683, 58)


Unnamed: 0,x,y,objectid,crosswalkkey,unitid,unitdesc,condition,condition_assessment_date,ownership,current_status,primarydistrictcd,secondarydistrictcd,overrideyn,comptype,segkey,unittype,old_id,approach,marking_type,school,midblock_crosswalk,install_date,color,comments,category,ownership_date,current_status_date,maintained_by,maintenance_agreement,curbspaceid,maint_district,overridecomment,shape_lng,shape_lat,crosswalk_control,material,onstreet,xstrlow,meas_from_low,disttolow,stpoint,xstrhi,meas_from_hi,disttohi,offset,side,measurement_origin,attachment_1,attachment_2,attachment_3,attachment_4,attachment_5,attachment_6,attachment_7,attachment_8,attachment_9,maintenance_group,num_attachments
0,1269560.0,223140.574842,1,522399,XWK-3325,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,2012/09/30 00:00:00+00,,INSVC,DISTRICT7,,N,16,8717,XWK,Old ID # 758,NW,LADER,N,N,1970/01/01 00:00:00+00,WHT,"ALASKAN WAY & YESLER WAY, N",XWK,1970/01/01 00:00:00+00,2009/01/22 00:00:00+00,,,,CENTRAL,,-122.336656,47.6018,,THRPL,ALASKAN WAY,YESLER WAY,Y,35,0,COLUMBIA ST,N,234,0,C,XStreet Centerline,,,,,,,,,,XWK-CD7-YEAR4,0
1,1283979.0,201215.071449,2,525064,XWK-3029,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,2013/08/26 00:00:00+00,,INSVC,DISTRICT2,,N,16,20363,XWK,Old ID # 662,W,LADER,Y,N,1970/01/01 00:00:00+00,WHT,"WI #13959 8/27/98 <br> School # 8<br>45 AVE S & S HOLLY ST, W",XWK,1970/01/01 00:00:00+00,2008/11/14 00:00:00+00,,,,SOUTH,,-122.276553,47.542459,,THRPL,S HOLLY ST,44TH AVE S,N,385,0,45TH AVE S,Y,20,0,C,XStreet Centerline,,,,,,,,,,XWK-CD2-YEAR4,0
2,1268498.0,242364.672876,3,523429,XWK-416,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,2017/06/29 00:00:00+00,SDOT,INSVC,DISTRICT4,,N,16,12959,XWK,Old ID # 485,S,LADER,N,N,1970/01/01 00:00:00+00,WHT,"<p>WI #18482 <br>STONE WAY N &amp; BRIDGE WAY N, S</p><p><br></p><p>Remarked per WO 528218</p>",XWK,1970/01/01 00:00:00+00,2009/02/10 00:00:00+00,SDOT,,,NORTH,,-122.342486,47.654435,SIGNAL,THRPL,STONE WAY N,N 39TH ST,Y,115,0,BRIDGE WAY N,N,83,0,C,XStreet Centerline,,,,,,,,,,XWK-CD4-YEAR1,0


2035 distinct blocks are in the collisions data

In [19]:
len(np.intersect1d(crosswalks['crosswalkkey'], clean_data['crosswalkkey']))

2035

In [58]:
# Create dataframe with only collision locations with crosswalks

crosswalk_accidents = clean_data[clean_data['crosswalkkey'].isin(crosswalks['crosswalkkey'])].copy()

In [59]:
crosswalk_accidents.reset_index(inplace=True)
crosswalk_accidents['year'] = crosswalk_accidents['Datetime'].apply(lambda x: x.year)
crosswalk_accidents['month'] = crosswalk_accidents['Datetime'].apply(lambda x: x.month)
crosswalk_accidents['hour'] = crosswalk_accidents['Datetime'].apply(lambda x: x.hour)

In [60]:
crosswalk_accidents.head(1)

Unnamed: 0,Datetime,x,y,objectid,inckey,addrtype,intkey,location,personcount,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,incdate,junctiontype,sdot_colcode,underinfl,st_colcode,crosswalkkey,speeding_y,inattentionind_y,hitparkedcar_y,pedrownotgrnt_y,weather_adverse,weather_good,weather_unknown,roadcond_adverse,roadcond_dry,roadcond_unknown,lightcond_dark,lightcond_daylight,lightcond_unknown,lightcond_verydark,severitycode_injury,severitycode_propertydamage,severitycode_unknown,year,month,hour
0,2007-01-01,-122.355404,47.624578,62847,77063,Intersection,28919.0,1ST AVE N AND MERCER ST,3,2,0,1,2,0,0,2007/01/01 00:00:00+00,At Intersection (intersection related),24.0,0,2,523108,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,0,2007,1,0


## Clean collisions data with crosswalks

In [61]:
crosswalk_accidents.columns

Index(['Datetime', 'x', 'y', 'objectid', 'inckey', 'addrtype', 'intkey',
       'location', 'personcount', 'pedcount', 'pedcylcount', 'vehcount',
       'injuries', 'seriousinjuries', 'fatalities', 'incdate', 'junctiontype',
       'sdot_colcode', 'underinfl', 'st_colcode', 'crosswalkkey', 'speeding_y',
       'inattentionind_y', 'hitparkedcar_y', 'pedrownotgrnt_y',
       'weather_adverse', 'weather_good', 'weather_unknown',
       'roadcond_adverse', 'roadcond_dry', 'roadcond_unknown',
       'lightcond_dark', 'lightcond_daylight', 'lightcond_unknown',
       'lightcond_verydark', 'severitycode_injury',
       'severitycode_propertydamage', 'severitycode_unknown', 'year', 'month',
       'hour'],
      dtype='object')

In [89]:
'''
KEEP COLUMNS: [['crosswalkkey', 'intkey', 'year', 'pedcount', 'pedcylcount', 
'vehcount', 'injuries', 'seriousinjuries', 'fatalities', 'inattentionind_y']]
'''

temp = crosswalk_accidents[['crosswalkkey', 'intkey', 'year', 'month', 'hour', 'pedcount', 'pedcylcount', 'vehcount', 
                         'injuries', 'seriousinjuries', 'fatalities', 'inattentionind_y']].copy()

In [90]:
# Group by the intkey, year, month and hour to sum collisions of each intersection over the years

crosswalk_acc = temp.groupby(['crosswalkkey', 'year', 'month', 'hour']).sum().reset_index()

In [92]:
crosswalk_acc.head()

Unnamed: 0,crosswalkkey,year,month,hour,intkey,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
0,520756,2010,12,16,25423.0,0,1,1,1,0,0,1
1,520757,2008,1,0,25423.0,0,1,1,1,0,0,0
2,520758,2007,12,11,25405.0,1,0,1,0,0,1,0
3,520762,2020,5,0,0.0,1,0,1,1,1,0,0
4,520764,2007,12,7,31246.0,1,0,1,1,0,0,0


## Clean crosswalks data

Clean up crosswalks data before merging

In [45]:
to_drop = ['x', 'y', 'objectid', 'unitid', 'condition_assessment_date', 'ownership', 'current_status',
           'secondarydistrictcd', 'overrideyn', 'comptype', 'segkey', 'unittype', 'old_id', 'comments', 'install_date',
           'category', 'ownership_date', 'current_status_date', 'maintained_by', 'maintenance_agreement', 
           'curbspaceid', 'overridecomment','material', 'meas_from_low', 'disttolow', 'stpoint', 'meas_from_hi',
           'disttohi', 'offset', 'side', 'measurement_origin', 'attachment_1', 'attachment_2', 'attachment_3', 
           'attachment_4', 'attachment_5', 'attachment_6', 'attachment_7', 'attachment_8', 'attachment_9',
          'maintenance_group', 'num_attachments']

crosswalks.drop(columns=to_drop, inplace=True)

In [93]:
crosswalk_acc[crosswalk_acc['crosswalkkey'] == 524995]

Unnamed: 0,crosswalkkey,year,month,hour,intkey,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
3119,524995,2008,4,15,33264.0,1,0,1,1,0,0,0


In [94]:
crosswalks[crosswalks['crosswalkkey'] == 524995]

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi
176,524995,S ORCAS ST 0280 BLOCK C SIDE ( 28) 28 FT E/O 28TH AVE S,GOOD,DISTRICT2,C,LADER,Y,Y,WHT,SOUTH,-122.296768,47.550887,,S ORCAS ST,28TH AVE S,30TH AVE S


In [111]:
crosswalk_acc.isnull().sum()

crosswalkkey        0
year                0
month               0
hour                0
intkey              0
pedcount            0
pedcylcount         0
vehcount            0
injuries            0
seriousinjuries     0
fatalities          0
inattentionind_y    0
dtype: int64

In [108]:
crosswalks['crosswalkkey'].nunique()

5683

In [105]:
crosswalk_collisions = crosswalks.merge(crosswalk_acc, how='left', on='crosswalkkey')

## NaNs after Merge

Fill missing values with 0 because it represents no accidents

In [114]:
crosswalk_collisions.fillna('0', inplace=True)

In [118]:
crosswalk_collisions.isnull().sum()

crosswalkkey          0
unitdesc              0
condition             0
primarydistrictcd     0
approach              0
marking_type          0
school                0
midblock_crosswalk    0
color                 0
maint_district        0
shape_lng             0
shape_lat             0
crosswalk_control     0
onstreet              0
xstrlow               0
xstrhi                0
year                  0
month                 0
hour                  0
intkey                0
pedcount              0
pedcylcount           0
vehcount              0
injuries              0
seriousinjuries       0
fatalities            0
inattentionind_y      0
dtype: int64

In [124]:
crosswalk_collisions.nunique()

crosswalkkey          5683
unitdesc              5680
condition                4
primarydistrictcd        7
approach                 9
marking_type             5
school                   2
midblock_crosswalk       2
color                    6
maint_district           4
shape_lng             5666
shape_lat             5666
crosswalk_control        5
onstreet               927
xstrlow               1399
xstrhi                1386
year                    15
month                   13
hour                    25
intkey                1128
pedcount                 7
pedcylcount              4
vehcount                 6
injuries                 7
seriousinjuries          5
fatalities               3
inattentionind_y         3
dtype: int64

In [121]:
# crosswalk_collisions.to_csv('../data/processed/crosswalk_collisions.csv')