In [114]:
# Import Modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#modeling tools
import statsmodels.api as sm

import lightgbm as lgb

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, auc, roc_auc_score, roc_curve, log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

%matplotlib inline
plt.style.use('dark_background')

%load_ext autoreload
%autoreload 2

pd.set_option('display.min_row', 15)
pd.set_option('display.max_column', 300)
pd.set_option('display.max_colwidth', 300)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [219]:
int_sev = pd.read_csv('../data/processed/int_sev_allsdot.csv')
blocks_sev = pd.read_csv('../data/processed/blocks_sev_allsdot.csv', index_col=0)
xwalks_sev = pd.read_csv('../data/processed/xwalks_sev_allsdot.csv', index_col=0)

In [220]:
int_sev.columns = int_sev.columns.map(str.lower)
blocks_sev.columns = blocks_sev.columns.map(str.lower)
xwalks_sev.columns = xwalks_sev.columns.map(str.lower)

## Traffic Circles with all_sdot Severity Metric

In [129]:
circles = pd.read_csv('../data/processed/old_circles_collisions.csv', index_col=0)

In [130]:
circles.head()

Unnamed: 0,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,year,month,hour,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
0,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,2006.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,2017.0,3.0,13.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
2,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,2005.0,5.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,2006.0,9.0,9.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
4,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,2011.0,4.0,8.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0


In [131]:
circles.columns

Index(['intkey', 'shape_lat', 'shape_lng', 'unitdesc', 'primarydistrictcd',
       'installed', 'landscaping', 'survey_monument', 'trcsize', 'trcshape',
       'condition', 'year', 'month', 'hour', 'pedcount', 'pedcylcount',
       'vehcount', 'injuries', 'seriousinjuries', 'fatalities',
       'inattentionind_y'],
      dtype='object')

Drop columns that will be included from the all_sdot severity dataset

In [132]:
drop_circles = ['year', 'month', 'hour', 'pedcount', 'pedcylcount',
       'vehcount', 'injuries', 'seriousinjuries', 'fatalities']

circles.drop(columns=drop_circles, inplace=True)

In [133]:
circles.head()

Unnamed: 0,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,inattentionind_y
0,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0
1,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0
2,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0
3,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0
4,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0


In [134]:
# merge collisions data with circles_acc to obtain dataset of all circles with labels of collisions or none

circles_collisions = circles.merge(int_sev, how='left', on='intkey')

In [135]:
circles_collisions.shape

(6290, 22)

In [136]:
circles_collisions.isnull().sum()

intkey                          0
shape_lat                       0
shape_lng                       0
unitdesc                        0
primarydistrictcd               0
installed                       0
landscaping                     0
survey_monument                 0
trcsize                         0
trcshape                        0
condition                       0
inattentionind_y                0
location                      419
year                          419
count                         419
total fatalities              419
total serious injuries        419
total evident injuries        419
total possible injuries       419
total vehicles                419
total pedestrians involved    419
total bicyclists involved     419
dtype: int64

In [137]:
# drop location, unitdesc has no nulls

circles_collisions.drop(columns='location', inplace=True)

In [138]:
# There are duplicates by intkey and year

circles_collisions.drop_duplicates(['intkey', 'year'], inplace=True)

In [139]:
circles_collisions.shape

(1798, 21)

In [140]:
# about 76.86% of traffic circles had collisions

print('Percent of traffic circles with incidents: ',
      round((~circles_collisions['year'].isnull()).sum() / (len(circles_collisions['year']))*100 , 2),'%')

Percent of traffic circles with incidents:  76.86 %


In [141]:
# fill missing values with 0 for no collision

circles_collisions.fillna(0, inplace=True)

In [143]:
circles_collisions.shape

(1798, 21)

In [144]:
circles_collisions.columns

Index(['intkey', 'shape_lat', 'shape_lng', 'unitdesc', 'primarydistrictcd',
       'installed', 'landscaping', 'survey_monument', 'trcsize', 'trcshape',
       'condition', 'inattentionind_y', 'year', 'count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved'],
      dtype='object')

In [145]:
circles_collisions[['count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries']].sum()

count                      1615.0
total fatalities              1.0
total serious injuries       23.0
total evident injuries      181.0
total possible injuries     356.0
dtype: float64

In [150]:
circles_collisions['weigh_fatalities'] = circles_collisions['total fatalities'] * 5
circles_collisions['weigh_serious'] = circles_collisions['total serious injuries'] * 4
circles_collisions['weigh_evident'] = circles_collisions['total evident injuries'] * 3
circles_collisions['weigh_possible'] = circles_collisions['total possible injuries'] * 2

In [152]:
circles_collisions.columns

Index(['intkey', 'shape_lat', 'shape_lng', 'unitdesc', 'primarydistrictcd',
       'installed', 'landscaping', 'survey_monument', 'trcsize', 'trcshape',
       'condition', 'inattentionind_y', 'year', 'count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved',
       'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible'],
      dtype='object')

In [194]:
weigh_columns = ['count', 'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible']
weigh_total = circles_collisions[weigh_columns].sum()

In [195]:
weigh_total

count               1615.0
weigh_fatalities       5.0
weigh_serious         92.0
weigh_evident        543.0
weigh_possible       712.0
dtype: float64

In [156]:
circles_collisions.head()

Unnamed: 0,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible
0,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0,2006.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0,2017.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
4,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2005.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0
5,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2006.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
6,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2011.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0


In [157]:
sev_score = circles_collisions[weigh_columns].sum(axis=1) / weigh_total.sum()
sev_score

0       0.000337
1       0.001011
4       0.001011
5       0.000337
6       0.001348
13      0.000337
14      0.000337
          ...   
6268    0.000337
6269    0.001011
6278    0.000337
6279    0.000337
6282    0.001685
6283    0.000337
6288    0.000000
Length: 1798, dtype: float64

In [159]:
circles_collisions['sev_score'] = sev_score

In [160]:
circles_collisions.head()

Unnamed: 0,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible,sev_score
0,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0,2006.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000337
1,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0,2017.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.001011
4,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2005.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.001011
5,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2006.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000337
6,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2011.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,0.001348


In [162]:
# circles_collisions.to_csv('../data/processed/circles_sev_score.csv')

## Intersections(circles removed) with all_sdot Severity Metric

In [235]:
intersections = pd.read_csv('../data/processed/inter_nocircles.csv', index_col=0)

In [236]:
intersections.head()

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,year,month,hour,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
0,18213,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10302,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4716,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11483,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6406,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [237]:
intersections.shape

(70793, 18)

In [238]:
intersections.columns

Index(['intr_id', 'intkey', 'subarea', 'unitdesc', 'arterialclasscd',
       'signal_type', 'shape_lng', 'shape_lat', 'year', 'month', 'hour',
       'pedcount', 'pedcylcount', 'vehcount', 'injuries', 'seriousinjuries',
       'fatalities', 'inattentionind_y'],
      dtype='object')

Drop columns that will be included from the all_sdot severity dataset

In [239]:
drop_inters = ['intr_id', 'year', 'month', 'hour', 'pedcount', 'pedcylcount',
       'vehcount', 'injuries', 'seriousinjuries', 'fatalities']

intersections.drop(columns=drop_inters, inplace=True)

In [240]:
intersections.head()

Unnamed: 0,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,inattentionind_y
0,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0
1,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0
2,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0
3,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0
4,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0


In [241]:
# merge collisions data with circles_acc to obtain dataset of all circles with labels of collisions or none

inters_collisions = intersections.merge(int_sev, how='left', on='intkey')

In [242]:
inters_collisions.shape

(726403, 18)

In [243]:
inters_collisions.isnull().sum()

intkey                           0
subarea                          0
unitdesc                         0
arterialclasscd                  0
signal_type                      0
shape_lng                        0
shape_lat                        0
inattentionind_y                 0
location                      7170
year                          7170
count                         7170
total fatalities              7170
total serious injuries        7170
total evident injuries        7170
total possible injuries       7170
total vehicles                7170
total pedestrians involved    7170
total bicyclists involved     7170
dtype: int64

In [244]:
# drop location, unitdesc has no nulls

inters_collisions.drop(columns='location', inplace=True)

In [245]:
# There are duplicates by intkey and year

inters_collisions.drop_duplicates(['intkey', 'year'], inplace=True)

In [246]:
# make sure all intersections are kept
intersections['intkey'].nunique()

14385

In [247]:
inters_collisions['intkey'].nunique()

14385

In [248]:
# about 83.63% of intersections(no circles) had collisions

print('Percent of intersections (excluding circles) with incidents: ',
      round((~inters_collisions['year'].isnull()).sum() / (len(inters_collisions['year']))*100 , 2),'%')

Percent of intersections (excluding circles) with incidents:  83.63 %


In [183]:
# fill missing values with 0 for no collision

inters_collisions.fillna(0, inplace=True)

In [184]:
inters_collisions.shape

(43800, 17)

In [185]:
inters_collisions.columns

Index(['intkey', 'subarea', 'unitdesc', 'arterialclasscd', 'signal_type',
       'shape_lng', 'shape_lat', 'inattentionind_y', 'year', 'count',
       'total fatalities', 'total serious injuries', 'total evident injuries',
       'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved'],
      dtype='object')

In [186]:
inters_collisions[['count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries']].sum()

count                      69860.0
total fatalities             138.0
total serious injuries      1545.0
total evident injuries      9079.0
total possible injuries    24809.0
dtype: float64

In [187]:
inters_collisions['weigh_fatalities'] = inters_collisions['total fatalities'] * 5
inters_collisions['weigh_serious'] = inters_collisions['total serious injuries'] * 4
inters_collisions['weigh_evident'] = inters_collisions['total evident injuries'] * 3
inters_collisions['weigh_possible'] = inters_collisions['total possible injuries'] * 2

In [192]:
int_weigh_columns = ['count', 'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible']
int_weigh_total = inters_collisions[int_weigh_columns].sum()

In [196]:
int_weigh_total

count               69860.0
weigh_fatalities      690.0
weigh_serious        6180.0
weigh_evident       27237.0
weigh_possible      49618.0
dtype: float64

In [197]:
inters_collisions.head()

Unnamed: 0,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible
0,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
int_sev_score = inters_collisions[int_weigh_columns].sum(axis=1) / int_weigh_total.sum()
int_sev_score

0         0.000000
1         0.000000
2         0.000000
3         0.000000
4         0.000000
5         0.000007
6         0.000026
            ...   
726392    0.000000
726393    0.000007
726394    0.000026
726399    0.000000
726400    0.000000
726401    0.000007
726402    0.000000
Length: 43800, dtype: float64

In [199]:
inters_collisions['sev_score'] = int_sev_score

In [200]:
inters_collisions.head()

Unnamed: 0,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible,sev_score
0,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [201]:
# inters_collisions.to_csv('../data/processed/intersections_sev_score.csv')

## Crosswalks with all_sdot Severity Metric

In [202]:
crosswalks = pd.read_csv('../data/processed/crosswalk_collisions.csv', index_col=0)

In [203]:
crosswalks.head()

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi,year,month,hour,intkey,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
0,522399,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,DISTRICT7,NW,LADER,N,N,WHT,CENTRAL,-122.336656,47.6018,,ALASKAN WAY,YESLER WAY,COLUMBIA ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,525064,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,DISTRICT2,W,LADER,Y,N,WHT,SOUTH,-122.276553,47.542459,,S HOLLY ST,44TH AVE S,45TH AVE S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,523429,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,DISTRICT4,S,LADER,N,N,WHT,NORTH,-122.342486,47.654435,SIGNAL,STONE WAY N,N 39TH ST,BRIDGE WAY N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,521447,35TH AVE SW 0390 BLOCK C SIDE ( 18) 18 FT S/O SW ANDOVER N ST,GOOD,DISTRICT1,S,LADER,Y,N,WHT,SOUTH,-122.376151,47.568308,,35TH AVE SW,SW ANDOVER N ST,SW ANDOVER S ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,524665,NE 145TH ST 0200 BLOCK C SIDE ( 5) 5 FT E/O 20TH AVE NE,GOOD,DISTRICT5,N,LADER,N,N,WHT,NORTH,-122.307324,47.73389,,NE 145TH ST,20TH AVE NE,22ND AVE NE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
crosswalks.shape

(7502, 27)

In [205]:
crosswalks.columns

Index(['crosswalkkey', 'unitdesc', 'condition', 'primarydistrictcd',
       'approach', 'marking_type', 'school', 'midblock_crosswalk', 'color',
       'maint_district', 'shape_lng', 'shape_lat', 'crosswalk_control',
       'onstreet', 'xstrlow', 'xstrhi', 'year', 'month', 'hour', 'intkey',
       'pedcount', 'pedcylcount', 'vehcount', 'injuries', 'seriousinjuries',
       'fatalities', 'inattentionind_y'],
      dtype='object')

Drop columns that will be included from the all_sdot severity dataset

In [206]:
drop_xwalks = ['year', 'month', 'hour', 'pedcount', 'pedcylcount',
       'vehcount', 'injuries', 'seriousinjuries', 'fatalities']

crosswalks.drop(columns=drop_xwalks, inplace=True)

In [212]:
crosswalks.head()

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi,intkey,inattentionind_y
0,522399,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,DISTRICT7,NW,LADER,N,N,WHT,CENTRAL,-122.336656,47.6018,,ALASKAN WAY,YESLER WAY,COLUMBIA ST,0.0,0.0
1,525064,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,DISTRICT2,W,LADER,Y,N,WHT,SOUTH,-122.276553,47.542459,,S HOLLY ST,44TH AVE S,45TH AVE S,0.0,0.0
2,523429,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,DISTRICT4,S,LADER,N,N,WHT,NORTH,-122.342486,47.654435,SIGNAL,STONE WAY N,N 39TH ST,BRIDGE WAY N,0.0,0.0
3,521447,35TH AVE SW 0390 BLOCK C SIDE ( 18) 18 FT S/O SW ANDOVER N ST,GOOD,DISTRICT1,S,LADER,Y,N,WHT,SOUTH,-122.376151,47.568308,,35TH AVE SW,SW ANDOVER N ST,SW ANDOVER S ST,0.0,0.0
4,524665,NE 145TH ST 0200 BLOCK C SIDE ( 5) 5 FT E/O 20TH AVE NE,GOOD,DISTRICT5,N,LADER,N,N,WHT,NORTH,-122.307324,47.73389,,NE 145TH ST,20TH AVE NE,22ND AVE NE,0.0,0.0


In [221]:
# merge collisions data with crosswalks to obtain dataset of all crosswalks with labels of collisions or none

xwalk_collisions = crosswalks.merge(xwalks_sev, how='left', on='crosswalkkey')

In [222]:
xwalk_collisions.shape

(14162, 28)

In [223]:
xwalk_collisions.isnull().sum()

crosswalkkey                     0
unitdesc                         0
condition                        0
primarydistrictcd                0
approach                         0
marking_type                     0
school                           0
midblock_crosswalk               0
color                            0
maint_district                   0
shape_lng                        0
shape_lat                        0
crosswalk_control                0
onstreet                         0
xstrlow                          0
xstrhi                           0
intkey                           0
inattentionind_y                 0
location                      3617
year                          3617
count                         3617
total fatalities              3617
total serious injuries        3617
total evident injuries        3617
total possible injuries       3617
total vehicles                3617
total pedestrians involved    3617
total bicyclists involved     3617
dtype: int64

In [224]:
# drop location, unitdesc has no nulls

xwalk_collisions.drop(columns='location', inplace=True)

In [225]:
# There are duplicates by intkey and year

xwalk_collisions.drop_duplicates(['crosswalkkey', 'year'], inplace=True)

In [229]:
# make sure all intersections are kept
crosswalks['crosswalkkey'].nunique()

5683

In [230]:
xwalk_collisions['crosswalkkey'].nunique()

5683

In [249]:
# about 49.95% of crosswalks had collisions

print('Percent of traffic circles with incidents: ',
      round((~xwalk_collisions['year'].isnull()).sum() / (len(xwalk_collisions['year']))*100 , 2),'%')

Percent of traffic circles with incidents:  49.95 %


In [250]:
# fill missing values with 0 for no collision

xwalk_collisions.fillna(0, inplace=True)

In [251]:
xwalk_collisions.shape

(7227, 27)

In [252]:
xwalk_collisions.columns

Index(['crosswalkkey', 'unitdesc', 'condition', 'primarydistrictcd',
       'approach', 'marking_type', 'school', 'midblock_crosswalk', 'color',
       'maint_district', 'shape_lng', 'shape_lat', 'crosswalk_control',
       'onstreet', 'xstrlow', 'xstrhi', 'intkey', 'inattentionind_y', 'year',
       'count', 'total fatalities', 'total serious injuries',
       'total evident injuries', 'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved'],
      dtype='object')

In [253]:
xwalk_collisions[['count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries']].sum()

count                      3917.0
total fatalities             43.0
total serious injuries      321.0
total evident injuries     1421.0
total possible injuries    1828.0
dtype: float64

In [254]:
xwalk_collisions['weigh_fatalities'] = xwalk_collisions['total fatalities'] * 5
xwalk_collisions['weigh_serious'] = xwalk_collisions['total serious injuries'] * 4
xwalk_collisions['weigh_evident'] = xwalk_collisions['total evident injuries'] * 3
xwalk_collisions['weigh_possible'] = xwalk_collisions['total possible injuries'] * 2

In [255]:
xw_weigh_columns = ['count', 'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible']
xw_weigh_total = xwalk_collisions[xw_weigh_columns].sum()

In [256]:
xw_weigh_total

count               3917.0
weigh_fatalities     215.0
weigh_serious       1284.0
weigh_evident       4263.0
weigh_possible      3656.0
dtype: float64

In [257]:
xwalk_collisions.head()

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi,intkey,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible
0,522399,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,DISTRICT7,NW,LADER,N,N,WHT,CENTRAL,-122.336656,47.6018,,ALASKAN WAY,YESLER WAY,COLUMBIA ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,525064,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,DISTRICT2,W,LADER,Y,N,WHT,SOUTH,-122.276553,47.542459,,S HOLLY ST,44TH AVE S,45TH AVE S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,523429,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,DISTRICT4,S,LADER,N,N,WHT,NORTH,-122.342486,47.654435,SIGNAL,STONE WAY N,N 39TH ST,BRIDGE WAY N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,521447,35TH AVE SW 0390 BLOCK C SIDE ( 18) 18 FT S/O SW ANDOVER N ST,GOOD,DISTRICT1,S,LADER,Y,N,WHT,SOUTH,-122.376151,47.568308,,35TH AVE SW,SW ANDOVER N ST,SW ANDOVER S ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,524665,NE 145TH ST 0200 BLOCK C SIDE ( 5) 5 FT E/O 20TH AVE NE,GOOD,DISTRICT5,N,LADER,N,N,WHT,NORTH,-122.307324,47.73389,,NE 145TH ST,20TH AVE NE,22ND AVE NE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [258]:
xw_sev_score = xwalk_collisions[xw_weigh_columns].sum(axis=1) / xw_weigh_total.sum()
xw_sev_score

0        0.000000
1        0.000000
2        0.000000
3        0.000000
4        0.000000
5        0.000000
6        0.000000
           ...   
14153    0.000000
14154    0.000000
14155    0.000000
14156    0.000075
14157    0.000075
14160    0.000300
14161    0.000000
Length: 7227, dtype: float64

In [259]:
xwalk_collisions['sev_score'] = xw_sev_score

In [260]:
xwalk_collisions.head()

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi,intkey,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible,sev_score
0,522399,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,DISTRICT7,NW,LADER,N,N,WHT,CENTRAL,-122.336656,47.6018,,ALASKAN WAY,YESLER WAY,COLUMBIA ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,525064,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,DISTRICT2,W,LADER,Y,N,WHT,SOUTH,-122.276553,47.542459,,S HOLLY ST,44TH AVE S,45TH AVE S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,523429,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,DISTRICT4,S,LADER,N,N,WHT,NORTH,-122.342486,47.654435,SIGNAL,STONE WAY N,N 39TH ST,BRIDGE WAY N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,521447,35TH AVE SW 0390 BLOCK C SIDE ( 18) 18 FT S/O SW ANDOVER N ST,GOOD,DISTRICT1,S,LADER,Y,N,WHT,SOUTH,-122.376151,47.568308,,35TH AVE SW,SW ANDOVER N ST,SW ANDOVER S ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,524665,NE 145TH ST 0200 BLOCK C SIDE ( 5) 5 FT E/O 20TH AVE NE,GOOD,DISTRICT5,N,LADER,N,N,WHT,NORTH,-122.307324,47.73389,,NE 145TH ST,20TH AVE NE,22ND AVE NE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
# xwalk_collisions.to_csv('../data/processed/crosswalks_sev_score.csv')