In [1]:
# Import Modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#modeling tools
import statsmodels.api as sm

import lightgbm as lgb

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, auc, roc_auc_score, roc_curve, log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

%matplotlib inline
plt.style.use('dark_background')

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

pd.set_option('display.min_row', 15)
pd.set_option('display.max_column', 100)
pd.set_option('display.max_colwidth', 300)

In [25]:
clean_data = pd.read_pickle('../data/processed/cleaned_data.pkl')
intersection_n = pd.read_pickle('../data/processed/intersection_w_normalization.pkl')

# crosswalks = pd.read_csv('../data/Marked_Crosswalks.csv')
circles = pd.read_csv('../data/Traffic_Circles.csv')
intersections = pd.read_csv('../data/Intersections.csv')

## Intersections

In [7]:
# lower case the column names

intersections.columns = intersections.columns.map(str.lower)
clean_data.columns = clean_data.columns.map(str.lower)

In [4]:
# COMPKEY in Intersections is INTKEY

intersections.rename(columns={'compkey': 'intkey'}, inplace=True)
intersections.columns

Index(['x', 'y', 'objectid', 'intr_id', 'gis_xcoord', 'gis_ycoord', 'intkey',
       'comptype', 'unitid', 'subarea', 'unitdesc', 'arterialclasscd',
       'signal_maint_dist', 'signal_type', 'shape_lng', 'shape_lat'],
      dtype='object')

#### merge intersections with collisions

In [8]:
# Used to make sure number of rows are correct when merging

len(np.intersect1d(intersections['intkey'], clean_data['intkey']))

7576

In [11]:
# Create dataframe with only intersections that are also in the collisions data

intersections_accidents = clean_data[clean_data['intkey'].isin(intersections['intkey'])].copy()

In [10]:
intersections_accidents.head(3)

Unnamed: 0_level_0,x,y,objectid,inckey,addrtype,intkey,location,personcount,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,incdate,junctiontype,sdot_colcode,underinfl,st_colcode,crosswalkkey,speeding_y,inattentionind_y,hitparkedcar_y,pedrownotgrnt_y,weather_adverse,weather_good,weather_unknown,roadcond_adverse,roadcond_dry,roadcond_unknown,lightcond_dark,lightcond_daylight,lightcond_unknown,lightcond_verydark,severitycode_injury,severitycode_propertydamage,severitycode_unknown
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
2004-01-01,-122.32078,47.614076,14172,26463,Intersection,29745.0,BROADWAY AND E PIKE ST,4,0,0,2,1,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,0,28,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0
2004-01-01,-122.255895,47.509705,13453,27093,Intersection,38228.0,62ND AVE S AND S RYAN ST,3,0,0,2,1,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,0,10,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0
2004-01-01,-122.348859,47.619744,13420,26978,Intersection,28934.0,4TH AVE N AND BROAD ST,2,0,0,2,0,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,0,10,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [12]:
# reset index to extract year
intersections_accidents.reset_index(inplace=True)
intersections_accidents['year'] = intersections_accidents['Datetime'].apply(lambda x: x.year)
intersections_accidents['month'] = intersections_accidents['Datetime'].apply(lambda x: x.month)
intersections_accidents['hour'] = intersections_accidents['Datetime'].apply(lambda x: x.hour)

In [14]:
intersections_accidents['addrtype'].value_counts()

Intersection    65481
Name: addrtype, dtype: int64

In [13]:
intersections_accidents.columns

Index(['Datetime', 'x', 'y', 'objectid', 'inckey', 'addrtype', 'intkey',
       'location', 'personcount', 'pedcount', 'pedcylcount', 'vehcount',
       'injuries', 'seriousinjuries', 'fatalities', 'incdate', 'junctiontype',
       'sdot_colcode', 'underinfl', 'st_colcode', 'crosswalkkey', 'speeding_y',
       'inattentionind_y', 'hitparkedcar_y', 'pedrownotgrnt_y',
       'weather_adverse', 'weather_good', 'weather_unknown',
       'roadcond_adverse', 'roadcond_dry', 'roadcond_unknown',
       'lightcond_dark', 'lightcond_daylight', 'lightcond_unknown',
       'lightcond_verydark', 'severitycode_injury',
       'severitycode_propertydamage', 'severitycode_unknown', 'year', 'month',
       'hour'],
      dtype='object')

In [15]:
'''
KEEP COLUMNS: [['intkey', 'year', 'pedcount', 'pedcylcount', 
'vehcount', 'injuries', 'seriousinjuries', 'fatalities', 'inattentionind_y']]
'''

temp = intersections_accidents[['intkey', 'year', 'month', 'hour', 'pedcount', 'pedcylcount', 'vehcount', 
                         'injuries', 'seriousinjuries', 'fatalities', 'inattentionind_y']].copy()

In [21]:
# Group by the intkey, year, month and hour to sum collisions of each intersection over the years

int_acc = temp.groupby(['intkey', 'year', 'month', 'hour']).sum().reset_index()

In [22]:
inter_collisions = intersections.merge(int_acc, how='left', on='intkey')

In [26]:
# about 89.18% of intersections have had accidents within 2004-2020, 16 year periods
# THIS STILL CONTAINS TRAFFIC CIRCLES

print('Percent of intersections (including traffic circles) with incidents: ',
      round((~inter_collisions['year'].isnull()).sum() / (len(inter_collisions['year']))*100 , 2),'%')

Percent of intersections (including traffic circles) with incidents:  89.18 %


## Removing Traffic Circles

## EDA and clean data

In [33]:
inter_collisions.columns = inter_collisions.columns.map(str.lower)

In [34]:
inter_collisions.shape

(15441, 17)

In [38]:
inter_collisions.columns

Index(['x', 'y', 'objectid', 'intr_id', 'gis_xcoord', 'gis_ycoord', 'intkey',
       'comptype', 'unitid', 'subarea', 'unitdesc', 'arterialclasscd',
       'signal_maint_dist', 'signal_type', 'shape_lng', 'shape_lat',
       'collision'],
      dtype='object')

In [39]:
# is intr_id connected to another dataset?

to_drop = ['objectid', 'x', 'y', 'gis_xcoord', 'gis_ycoord', 'comptype', 'unitid', 'signal_maint_dist']

inter_collisions.drop(columns=to_drop, inplace=True)

In [42]:
inter_collisions['collision'].fillna(0, inplace=True)

In [45]:
inter_collisions[inter_collisions['arterialclasscd'].isnull()]

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,collision
59,18220,286961,GRDWM,DUWAMISH RIVER TRL AND S HENDERSON ST,,NONE,-122.320599,47.523059,0.0
836,14981,164765,SE,31ST AVE S AND DEAD END 1,,NONE,-122.292444,47.567429,0.0
2458,12413,30483,DWNTN,3RD AVE AND XW JEFFERSON,,MIDXWALK,-122.330756,47.602056,1.0
7277,46498,36645,N,NE 125TH ST AND XW 26NE-27NE,,MIDXWALK,-122.299554,47.719303,1.0
10498,46441,288331,E,E JAMES WAY AND XW 10TH AVE,,MIDXWALK,-122.319314,47.607829,1.0
11643,47277,38473,CNTRL,S WASHINGTON ST AND DEAD END 4,,NONE,-122.317945,47.600877,0.0
13445,46997,511219,LKUN,FAIRVIEW AVE N AND SLUSC N OF WARD,,MIDXWALK,-122.331568,47.62819,0.0
13761,10717,28733,LKUN,TERRY AVE N AND DEAD END,,NONE,-122.337108,47.627394,0.0
15042,47525,615993,CNTRL,33RD AVE E AND E SPRING E ST,,NONE,-122.290347,47.611678,0.0
15061,12656,30534,DWNTN,4TH AVE S AND XW WELLER,,CITY,-122.328973,47.597603,10.0


In [99]:
inter_collisions[inter_collisions['signal_type'] == 'MIDXWALK']['arterialclasscd'].mode()

0    1.0
dtype: float64

In [185]:
# Dead ends almost always have arterialclasscd of 0
# DUWAMISH RIVER TRL all have 0
# unitdesc with XW is a 1. MIDXWALK as signal type is typically 1
# 33RD AVE E with CNTRL is 0
# FAIRVIEW is all 1

inter_collisions[inter_collisions['unitdesc'].str.contains('FAIRVIEW AVE N')].groupby(['subarea', 'arterialclasscd']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,intr_id,intkey,unitdesc,signal_type,shape_lng,shape_lat,collision
subarea,arterialclasscd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LKUN,1.0,15,15,15,15,15,15,15


In [196]:
def create_mask(string):
    mask = (inter_collisions['unitdesc'].str.contains(string)) & (inter_collisions['arterialclasscd'].isnull())
    return mask

In [200]:
mask_de = create_mask('DEAD END')
mask_xw = create_mask('XW')
mask_duwamish = create_mask('DUWAMISH RIVER TRL')
mask_fairview = create_mask('FAIRVIEW AVE N')
mask_33 = create_mask('33RD AVE E')

inter_collisions.loc[mask_de, 'arterialclasscd'] = 0
inter_collisions.loc[mask_xw, 'arterialclasscd'] = 1
inter_collisions.loc[mask_duwamish, 'arterialclasscd'] = 0
inter_collisions.loc[mask_fairview, 'arterialclasscd'] = 1
inter_collisions.loc[mask_33, 'arterialclasscd'] = 0

In [201]:
inter_collisions[inter_collisions['arterialclasscd'].isnull()]

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,collision


In [203]:
inter_collisions.head()

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,collision
0,18213,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0
1,10302,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0
2,4716,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0
3,11483,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0
4,6406,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0


In [204]:
inter_collisions.to_csv('../data/processed/inter_collisions.csv')

## Add traffic circles info to Collisions

In [205]:
inter_collisions = pd.read_csv('../data/processed/inter_collisions.csv', index_col=0)
circles_collisions = pd.read_csv('../data/processed/circles_collisions.csv', index_col=0)

In [212]:
circles_collisions.head()

Unnamed: 0,compkey,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,collision
0,509338,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,2
1,509566,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,3
2,508937,24389,47.687493,-122.320263,8TH AVE NE AND NE 81ST ST,DISTRICT4,1991,1,0,16,CRC,GOOD,1
3,509329,31416,47.586905,-122.300247,25TH AVE S AND S GRAND ST,DISTRICT3,2003,1,0,0,OTHER,GOOD,2
4,509639,37684,47.690923,-122.293392,32ND AVE NE AND NE 86TH ST,DISTRICT5,1999,1,0,0,CRC,GOOD,0


In [215]:
inter_collisions[inter_collisions['intkey'].isin(
    np.intersect1d(inter_collisions['intkey'].unique(), circles_collisions['intkey'].unique()))]

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,collision
45,14959,31889,SW,38TH AVE SW AND SW DAKOTA ST,0.0,NONE,-122.380088,47.566489,2.0
46,3897,37392,NW,FREMONT AVE N AND N 102ND ST,0.0,NONE,-122.350050,47.702894,3.0
54,5152,24389,N,8TH AVE NE AND NE 81ST ST,0.0,NONE,-122.320263,47.687493,1.0
72,13277,31416,SE,25TH AVE S AND S GRAND ST,0.0,NONE,-122.300247,47.586905,2.0
90,4898,37684,NE,32ND AVE NE AND NE 86TH ST,0.0,NONE,-122.293392,47.690923,0.0
119,9433,27904,MGNL-QA,9TH AVE W AND W HALLADAY ST,0.0,NONE,-122.368931,47.642014,0.0
136,17257,33817,SW,39TH AVE SW AND SW WEBSTER ST,0.0,NONE,-122.381959,47.535698,0.0
...,...,...,...,...,...,...,...,...,...
14957,6191,25015,BLRD,34TH AVE NW AND NW 64TH ST,0.0,NONE,-122.401070,47.674850,5.0
14958,8399,27428,NE,43RD AVE NE AND NE 38TH ST,0.0,NONE,-122.281458,47.655783,0.0


In [224]:
(inter_collisions['signal_type'] != 'NONE').sum()

1258

In [229]:
(signals['INTKEY'].isin(inter_collisions['intkey'])).sum()

1106

## Traffic Signals

In [217]:
signals = pd.read_csv('../data/Traffic_Signals.csv')

In [218]:
signals.head()

Unnamed: 0,OBJECTID,COMPKEY,COMPTYPE,UNITID,DESCRIPTION,UNITDESC,SEGKEY,INTKEY,INT_UNITID,SIGNAL_TYPE,SIGNAL_MAINT_DIST,INSTALL_DATE,ADDDTTM,MODDTTM,OWNERSHIP,CONDITION,CONDITION_ASSESSMENT_DATE,CURRENT_STATUS,CURRENT_STATUS_DATE,ARTERIAL_CLASS,MAINT_AGREE,MAINT_BY,INT_SIGNAL_TYPE_CD,PRIMARYDISTRICTCD,SECONDARYDISTRICTCD,PEDAUDIODEVICEYN,PEDSIGNALYN,PH_MODELTYPE,PP_MODELTYPE,PP_QUANTITY,PEDHDFIRSTINSTALLDT,PH_QUANTITY,PEDAUDIOINSTALLDT,BIKESIGNALHDYN,ASBLT,HALFSIGNALYN,METEREDYN,LTTURNSIGNALYN,LTPROTECTEDPERMYN,LTPROTECTEDYN,RTTURNSIGNALYN,RTPROTECTEDPERMYN,RTPROTECTEDYN,LTFIRSTINSTALLDT,LTREMOVEDT,RTFIRSTINSTALLDT,RTREMOVEDT,DETDEVSTOPBARYN,DETDEVADVANCEDYN,DETDEVSYSTEMYN,CABMODELTYPECD,COMCONNTYPECD,MMUMODELTYPECD,PE_MODELTYPE,PI_MODELTYPE,INSTALLERCD,JOINTOWNERSHIPYN,CO_MODELTYPE,MMUMANCD,MMUINSTALLDT,LASTSIGNALOPTDT,PEDPSHINSTALLDT,NUM_ATTACHMENTS
0,1,272753,74,SGL-278,0.0,GREENWOOD AVE N AND N 80TH ST,1,24282,4521,SEMI,NORTH,1970/01/01 00:00:00+00,2005/03/17 17:11:06+00,2019/02/15 00:00:00+00,SDOT,GOOD,2008/03/06 00:00:00+00,INSVC,2008/12/15 00:00:00+00,2.0,,,CITY,DISTRICT6,,N,N,LEDCD,,,1970/01/01 00:00:00+00,8,1970/01/01 00:00:00+00,N,,N,N,Y,Y,N,N,N,N,2011/10/06 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,Y,N,N,3TS1,,NSM-12,,,,N,"LC8000, LMD9200",EDI,2019/02/12 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,0
1,2,272754,74,SGL-669,0.0,7TH AVE AND WESTLAKE AVE,1,29542,39139,PRE,DOWNTOWN,1970/01/01 00:00:00+00,2005/03/17 17:11:06+00,2020/04/01 00:00:00+00,SDOT,GOOD,2008/09/18 00:00:00+00,INSVC,2008/12/15 00:00:00+00,1.0,,,CITY,DISTRICT7,,N,N,LEDCD,,,1970/01/01 00:00:00+00,8,1970/01/01 00:00:00+00,N,,N,N,N,N,N,N,N,N,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,N,N,N,2,COPPER,NSM-12,OPT,,,Y,"M34, , M52, M60-ATC",EDI,2020/03/05 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,0
2,3,272755,74,SGL-178,0.0,2ND AVE AND BROAD ST,1,29666,39521,PRE,DOWNTOWN,1970/01/01 00:00:00+00,2005/03/17 17:11:06+00,2020/04/06 00:00:00+00,SDOT,POOR,2008/08/29 00:00:00+00,INSVC,2008/12/15 00:00:00+00,1.0,,,CITY,DISTRICT7,,N,N,LEDCD,NAV-2W-PPB,8.0,1970/01/01 00:00:00+00,8,1970/01/01 00:00:00+00,N,,N,N,N,N,N,N,N,N,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,N,N,N,3TS2-Type1,FIBER,MMU-16LEip,,,,Y,"M34, M60",EDI,2020/04/04 00:00:00+00,1970/01/01 00:00:00+00,2018/01/11 00:00:00+00,0
3,5,272757,74,SGL-022,0.0,DEARBORN OFF RP AND S DEARBORN ST,1,30444,44306,SEMI,SOUTH,1970/01/01 00:00:00+00,2005/03/17 17:11:07+00,1970/01/01 00:00:00+00,WSDOT,,1970/01/01 00:00:00+00,INSVC,2008/12/15 00:00:00+00,1.0,,,STATE,DISTRICT2,,N,N,,,,1970/01/01 00:00:00+00,,1970/01/01 00:00:00+00,N,,N,N,N,N,N,N,N,N,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,N,N,N,,,,,,,,,,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,0
4,6,272758,74,SGL-75,0.0,4TH AVE AND PIKE ST,1,29589,39315,PRE,DOWNTOWN,1970/01/01 00:00:00+00,2005/03/17 17:11:07+00,2019/07/22 00:00:00+00,SDOT,FAIR,2008/09/08 00:00:00+00,INSVC,2008/12/15 00:00:00+00,1.0,,,CITY,DISTRICT7,,N,N,"LED, LEDCD",,,1970/01/01 00:00:00+00,"8, 8",1970/01/01 00:00:00+00,N,,N,N,N,N,N,N,N,N,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,N,N,N,3TS2,COPPER,MMU2-16LEi,,,,N,"M34, , , M52, M52",EDI,2019/07/15 00:00:00+00,1970/01/01 00:00:00+00,1970/01/01 00:00:00+00,0


In [221]:
signals['CONDITION'].value_counts(dropna=False)

POOR         524
FAIR         315
GOOD         172
             101
EXCELLENT      1
Name: CONDITION, dtype: int64