In [158]:
# Import Modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#modeling tools
import statsmodels.api as sm

import lightgbm as lgb

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, auc, roc_auc_score, roc_curve, log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

%matplotlib inline
plt.style.use('dark_background')

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# Display settings for rendered frames. The full option names are spelled out:
# abbreviated names only work through pandas' pattern matching and would raise
# as soon as another option with a similar name exists.
pd.set_option('display.min_rows', 15)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 300)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the processed collision records and the normalized intersection table.
# NOTE: read_pickle can execute arbitrary code while unpickling; only load
# pickle files that come from a trusted source.
clean_data = pd.read_pickle('../data/processed/cleaned_data.pkl')
intersection_n = pd.read_pickle('../data/processed/intersection_w_normalization.pkl')

# The crosswalk file is currently not loaded.
# crosswalks = pd.read_csv('../data/Marked_Crosswalks.csv')
# Raw intersection inventory (one row per intersection component).
intersections = pd.read_csv('../data/Intersections.csv')

## Intersections

In [3]:
# The intersections file calls the intersection key COMPKEY; rename it to INTKEY
# so it lines up with the key used in the collision data.
intersections = intersections.rename(columns={'COMPKEY': 'INTKEY'})
intersections.columns

Index(['X', 'Y', 'OBJECTID', 'INTR_ID', 'GIS_XCOORD', 'GIS_YCOORD', 'INTKEY',
       'COMPTYPE', 'UNITID', 'SUBAREA', 'UNITDESC', 'ARTERIALCLASSCD',
       'SIGNAL_MAINT_DIST', 'SIGNAL_TYPE', 'SHAPE_LNG', 'SHAPE_LAT'],
      dtype='object')

#### Merge intersections with collisions

In [4]:
# Number of distinct INTKEY values present in both tables; used as a reference
# when checking row counts after the merge.
np.intersect1d(intersections['INTKEY'], clean_data['INTKEY']).size

7576

In [27]:
# Keep only the collision records whose INTKEY appears in the intersections table.
at_known_intersection = clean_data['INTKEY'].isin(intersections['INTKEY'])
intersections_accidents = clean_data.loc[at_known_intersection].copy()

In [28]:
# Every row is one recorded collision: mark it with a constant 1.0 so the rows
# can be counted per intersection later.
intersections_accidents['collision'] = 1.0

In [29]:
# Count the collisions recorded for each INTKEY (one row per intersection key).
int_acc = (
    intersections_accidents
    .groupby('INTKEY')['collision']
    .count()
    .reset_index()
)

In [30]:
# Left join: keep every intersection, attaching its collision count where one exists.
inter_collisions = pd.merge(intersections, int_acc, on='INTKEY', how='left')

In [31]:
# True if any INTKEY appears more than once after the merge.
inter_collisions.duplicated(subset='INTKEY').any()

False

In [32]:
# Share of intersections with at least one recorded collision (about 49%;
# per the original note the data covers 2004-2020).
# NOTE: traffic circles are still included in this figure.
has_incident = inter_collisions['collision'].notnull()
pct_with_incident = round(has_incident.sum() / len(has_incident) * 100, 2)

print('Percent of intersections (including traffic circles) with incidents: ',
      pct_with_incident, '%')

Percent of intersections (including traffic circles) with incidents:  49.06 %


## EDA and data cleaning

In [33]:
# Normalise all column names to lower case.
inter_collisions.columns = inter_collisions.columns.str.lower()

In [34]:
# Dimensions (rows, columns) of the merged intersection/collision table.
inter_collisions.shape

(15441, 17)

In [38]:
# Inspect the column names after lower-casing.
inter_collisions.columns

Index(['x', 'y', 'objectid', 'intr_id', 'gis_xcoord', 'gis_ycoord', 'intkey',
       'comptype', 'unitid', 'subarea', 'unitdesc', 'arterialclasscd',
       'signal_maint_dist', 'signal_type', 'shape_lng', 'shape_lat',
       'collision'],
      dtype='object')

In [39]:
# Open question: is intr_id connected to another dataset? (It is kept for now.)

# Columns removed from the merged table before further cleaning.
to_drop = [
    'objectid', 'x', 'y', 'gis_xcoord', 'gis_ycoord',
    'comptype', 'unitid', 'signal_maint_dist',
]

inter_collisions = inter_collisions.drop(columns=to_drop)

In [42]:
# Intersections with no matching collision rows came out of the left merge as NaN;
# treat them as zero collisions. Assign the result back instead of calling
# fillna(inplace=True) on the column selection: that form is chained assignment and
# does not update the frame under pandas Copy-on-Write.
inter_collisions['collision'] = inter_collisions['collision'].fillna(0)

In [45]:
# Rows that still have no arterial class code.
inter_collisions.loc[inter_collisions['arterialclasscd'].isna()]

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,collision
59,18220,286961,GRDWM,DUWAMISH RIVER TRL AND S HENDERSON ST,,NONE,-122.320599,47.523059,0.0
836,14981,164765,SE,31ST AVE S AND DEAD END 1,,NONE,-122.292444,47.567429,0.0
2458,12413,30483,DWNTN,3RD AVE AND XW JEFFERSON,,MIDXWALK,-122.330756,47.602056,1.0
7277,46498,36645,N,NE 125TH ST AND XW 26NE-27NE,,MIDXWALK,-122.299554,47.719303,1.0
10498,46441,288331,E,E JAMES WAY AND XW 10TH AVE,,MIDXWALK,-122.319314,47.607829,1.0
11643,47277,38473,CNTRL,S WASHINGTON ST AND DEAD END 4,,NONE,-122.317945,47.600877,0.0
13445,46997,511219,LKUN,FAIRVIEW AVE N AND SLUSC N OF WARD,,MIDXWALK,-122.331568,47.62819,0.0
13761,10717,28733,LKUN,TERRY AVE N AND DEAD END,,NONE,-122.337108,47.627394,0.0
15042,47525,615993,CNTRL,33RD AVE E AND E SPRING E ST,,NONE,-122.290347,47.611678,0.0
15061,12656,30534,DWNTN,4TH AVE S AND XW WELLER,,CITY,-122.328973,47.597603,10.0


In [99]:
# Most common arterial class code among MIDXWALK signal types.
inter_collisions.loc[inter_collisions['signal_type'] == 'MIDXWALK', 'arterialclasscd'].mode()

0    1.0
dtype: float64

In [185]:
# Observations used to impute the missing arterialclasscd values:
#   - dead ends almost always have arterialclasscd 0
#   - DUWAMISH RIVER TRL rows all have 0
#   - unitdesc containing XW is 1; signal type MIDXWALK is typically 1
#   - 33RD AVE E in subarea CNTRL is 0
#   - FAIRVIEW rows are all 1

# Breakdown for the FAIRVIEW AVE N rows by subarea and arterial class code.
on_fairview = inter_collisions['unitdesc'].str.contains('FAIRVIEW AVE N')
inter_collisions.loc[on_fairview].groupby(['subarea', 'arterialclasscd']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,intr_id,intkey,unitdesc,signal_type,shape_lng,shape_lat,collision
subarea,arterialclasscd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LKUN,1.0,15,15,15,15,15,15,15


In [196]:
def create_mask(string, frame=None):
    """Flag rows whose `unitdesc` contains `string` and whose `arterialclasscd` is missing.

    Parameters
    ----------
    string : str
        Literal text to look for in the `unitdesc` column (not a regular expression).
    frame : pandas.DataFrame, optional
        Table with `unitdesc` and `arterialclasscd` columns. Defaults to the
        notebook-level `inter_collisions` frame.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with the frame's index.
    """
    if frame is None:
        frame = inter_collisions
    # regex=False: street descriptions are matched literally, so characters such as
    # '.' or '(' in the search text are not interpreted as regex syntax.
    # na=False: a missing description counts as "no match", which keeps the mask
    # strictly boolean and therefore usable with .loc.
    has_text = frame['unitdesc'].str.contains(string, regex=False, na=False)
    return has_text & frame['arterialclasscd'].isnull()

In [200]:
# Build every mask up front, before any value is written, so that each mask
# reflects the rows that were missing at the start of this cell.
mask_de, mask_xw, mask_duwamish, mask_fairview, mask_33 = map(
    create_mask,
    ['DEAD END', 'XW', 'DUWAMISH RIVER TRL', 'FAIRVIEW AVE N', '33RD AVE E'],
)

# Apply the imputed arterial class codes in the same order as the rules above;
# if a row matches several masks, the last assignment wins.
for row_mask, class_code in [
    (mask_de, 0),
    (mask_xw, 1),
    (mask_duwamish, 0),
    (mask_fairview, 1),
    (mask_33, 0),
]:
    inter_collisions.loc[row_mask, 'arterialclasscd'] = class_code

In [201]:
# Should display an empty frame once every missing arterial class code is filled.
inter_collisions.loc[inter_collisions['arterialclasscd'].isna()]

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,collision


In [None]:
# Export of the cleaned intersection table (currently disabled).
# NOTE(review): the next section reads '../data/processed/inter_collisions.csv' back in,
# so re-enable this line to regenerate the file after changing the cleaning steps.
# inter_collisions.to_csv('../data/processed/inter_collisions.csv')

## Add traffic circles info to Collisions

In [28]:
# Reload the intersection table from disk (this rebinds `inter_collisions`) and load
# the traffic-circle collisions; the first CSV column is used as the index for both.
inter_collisions = pd.read_csv('../data/processed/inter_collisions.csv', index_col=0)
circles_collisions = pd.read_csv('../data/processed/circles_collisions.csv', index_col=0)

In [17]:
# Preview the intersections whose INTKEY also appears in the traffic-circle data.
shared_circle_keys = np.intersect1d(intersections['INTKEY'], circles_collisions['INTKEY'])
intersections.loc[intersections['INTKEY'].isin(shared_circle_keys)].head()

Unnamed: 0,X,Y,OBJECTID,INTR_ID,GIS_XCOORD,GIS_YCOORD,INTKEY,COMPTYPE,UNITID,SUBAREA,UNITDESC,ARTERIALCLASSCD,SIGNAL_MAINT_DIST,SIGNAL_TYPE,SHAPE_LNG,SHAPE_LAT
45,1258588.0,210475.502411,46,14959,1258588.0,210475.50248,31889,13,54316,SW,38TH AVE SW AND SW DAKOTA ST,0.0,,NONE,-122.380088,47.566489
46,1266983.0,260074.185342,47,3897,1266983.0,260074.18541,37392,13,232513,NW,FREMONT AVE N AND N 102ND ST,0.0,,NONE,-122.35005,47.702894
54,1274207.0,254314.130296,55,5152,1274207.0,254314.13036,24389,13,5118,N,8TH AVE NE AND NE 81ST ST,0.0,,NONE,-122.320263,47.687493
72,1278437.0,217535.101459,73,13277,1278437.0,217535.10159,31416,13,52122,SE,25TH AVE S AND S GRAND ST,0.0,,NONE,-122.300247,47.586905
90,1280848.0,255438.451194,91,4898,1280848.0,255438.45113,37684,13,234350,NE,32ND AVE NE AND NE 86TH ST,0.0,,NONE,-122.293392,47.690923


In [29]:
# Columns available on the collision records (`clean_data`).
clean_data.columns

Index(['X', 'Y', 'OBJECTID', 'INCKEY', 'ADDRTYPE', 'INTKEY', 'LOCATION',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES',
       'SERIOUSINJURIES', 'FATALITIES', 'INCDATE', 'JUNCTIONTYPE',
       'SDOT_COLCODE', 'UNDERINFL', 'ST_COLCODE', 'CROSSWALKKEY', 'SPEEDING_Y',
       'INATTENTIONIND_Y', 'HITPARKEDCAR_Y', 'PEDROWNOTGRNT_Y',
       'WEATHER_Adverse', 'WEATHER_Good', 'WEATHER_Unknown',
       'ROADCOND_Adverse', 'ROADCOND_Dry', 'ROADCOND_Unknown',
       'LIGHTCOND_Dark', 'LIGHTCOND_Daylight', 'LIGHTCOND_Unknown',
       'LIGHTCOND_VeryDark', 'SEVERITYCODE_Injury',
       'SEVERITYCODE_PropertyDamage', 'SEVERITYCODE_Unknown'],
      dtype='object')

In [30]:
# Number of distinct traffic-circle INTKEYs that appear in the collision data
# (593 in the recorded run).
np.intersect1d(clean_data['INTKEY'], circles_collisions['INTKEY']).size

593

In [31]:
# INTKEYs shared by the collision data and the traffic-circle data.
tf_circles = np.intersect1d(clean_data['INTKEY'], circles_collisions['INTKEY'])

# Relabel collisions at those keys with their own address type.
is_traffic_circle = clean_data['INTKEY'].isin(tf_circles)
clean_data.loc[is_traffic_circle, 'ADDRTYPE'] = 'Circle'

In [32]:
# Count collisions per address type, including missing values.
# Author's note: every row with a missing ADDRTYPE also lacks GIS coordinates and
# INTKEY, so the street/intersection cannot be identified; those rows are dropped
# in the next cell.

clean_data['ADDRTYPE'].value_counts(dropna=False)

Block           126091
Intersection     64185
NaN               1824
Circle            1449
Alley              755
Name: ADDRTYPE, dtype: int64

In [33]:
# Remove the collision rows that have no address type.
clean_data = clean_data.dropna(subset=['ADDRTYPE'])

In [34]:
# Address types of the rows with no LOCATION; in the recorded run they are all alleys.
clean_data.loc[clean_data['LOCATION'].isnull(), 'ADDRTYPE'].value_counts(dropna=False)

Alley    755
Name: ADDRTYPE, dtype: int64

In [35]:
# Label missing locations as 'Alley' (the only address type that lacks a LOCATION).
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment and does not update the frame under
# pandas Copy-on-Write.
clean_data['LOCATION'] = clean_data['LOCATION'].fillna('Alley')

In [36]:
# Re-inspect the collision columns after the ADDRTYPE / LOCATION clean-up.
clean_data.columns

Index(['X', 'Y', 'OBJECTID', 'INCKEY', 'ADDRTYPE', 'INTKEY', 'LOCATION',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES',
       'SERIOUSINJURIES', 'FATALITIES', 'INCDATE', 'JUNCTIONTYPE',
       'SDOT_COLCODE', 'UNDERINFL', 'ST_COLCODE', 'CROSSWALKKEY', 'SPEEDING_Y',
       'INATTENTIONIND_Y', 'HITPARKEDCAR_Y', 'PEDROWNOTGRNT_Y',
       'WEATHER_Adverse', 'WEATHER_Good', 'WEATHER_Unknown',
       'ROADCOND_Adverse', 'ROADCOND_Dry', 'ROADCOND_Unknown',
       'LIGHTCOND_Dark', 'LIGHTCOND_Daylight', 'LIGHTCOND_Unknown',
       'LIGHTCOND_VeryDark', 'SEVERITYCODE_Injury',
       'SEVERITYCODE_PropertyDamage', 'SEVERITYCODE_Unknown'],
      dtype='object')

In [37]:
# Fill the remaining gaps according to column type. Writing the string '0' into a
# numeric column would turn it into a mixed object column, so numeric columns get
# the number 0 and only non-numeric columns get the '0' placeholder.
numeric_columns = inter_collisions.select_dtypes(include='number').columns
inter_collisions[numeric_columns] = inter_collisions[numeric_columns].fillna(0)
inter_collisions = inter_collisions.fillna('0')

In [38]:
# Collision counts are whole numbers; store them as integers.
inter_collisions = inter_collisions.astype({'collision': int})

In [39]:
# Distribution of collision counts among intersections with at least one collision.
inter_collisions.loc[inter_collisions['collision'].ne(0), 'collision'].value_counts()

1      2092
2      1169
3       709
4       527
5       374
6       302
7       229
       ... 
113       1
105       1
93        1
81        1
136       1
116       1
117       1
Name: collision, Length: 111, dtype: int64