In [199]:
# Import Modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#modeling tools
import statsmodels.api as sm

import lightgbm as lgb

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, auc, roc_auc_score, roc_curve, log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

%matplotlib inline
plt.style.use('dark_background')

%load_ext autoreload
%autoreload 2

pd.set_option('display.min_row', 100)
pd.set_option('display.max_column', 300)
pd.set_option('display.max_colwidth', 300)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
int_sev = pd.read_csv('../data/processed/int_sev_allsdot.csv')
blocks_sev = pd.read_csv('../data/processed/blocks_sev_allsdot.csv', index_col=0)
xwalks_sev = pd.read_csv('../data/processed/xwalks_sev_allsdot.csv', index_col=0)

In [3]:
int_sev.columns = int_sev.columns.map(str.lower)
blocks_sev.columns = blocks_sev.columns.map(str.lower)
xwalks_sev.columns = xwalks_sev.columns.map(str.lower)

## Traffic Circles with all_sdot Severity Metric

In [129]:
circles = pd.read_csv('../data/processed/old_circles_collisions.csv', index_col=0)

In [130]:
circles.head()

Unnamed: 0,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,year,month,hour,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
0,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,2006.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,2017.0,3.0,13.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
2,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,2005.0,5.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,2006.0,9.0,9.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
4,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,2011.0,4.0,8.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0


In [131]:
circles.columns

Index(['intkey', 'shape_lat', 'shape_lng', 'unitdesc', 'primarydistrictcd',
       'installed', 'landscaping', 'survey_monument', 'trcsize', 'trcshape',
       'condition', 'year', 'month', 'hour', 'pedcount', 'pedcylcount',
       'vehcount', 'injuries', 'seriousinjuries', 'fatalities',
       'inattentionind_y'],
      dtype='object')

Drop columns that will be included from the all_sdot severity dataset

In [132]:
drop_circles = ['year', 'month', 'hour', 'pedcount', 'pedcylcount',
       'vehcount', 'injuries', 'seriousinjuries', 'fatalities']

circles.drop(columns=drop_circles, inplace=True)

In [133]:
circles.head()

Unnamed: 0,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,inattentionind_y
0,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0
1,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0
2,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0
3,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0
4,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0


In [134]:
# merge collisions data with circles_acc to obtain dataset of all circles with labels of collisions or none

circles_collisions = circles.merge(int_sev, how='left', on='intkey')

In [135]:
circles_collisions.shape

(6290, 22)

In [136]:
circles_collisions.isnull().sum()

intkey                          0
shape_lat                       0
shape_lng                       0
unitdesc                        0
primarydistrictcd               0
installed                       0
landscaping                     0
survey_monument                 0
trcsize                         0
trcshape                        0
condition                       0
inattentionind_y                0
location                      419
year                          419
count                         419
total fatalities              419
total serious injuries        419
total evident injuries        419
total possible injuries       419
total vehicles                419
total pedestrians involved    419
total bicyclists involved     419
dtype: int64

In [137]:
# drop location, unitdesc has no nulls

circles_collisions.drop(columns='location', inplace=True)

In [138]:
# There are duplicates by intkey and year

circles_collisions.drop_duplicates(['intkey', 'year'], inplace=True)

In [139]:
circles_collisions.shape

(1798, 21)

In [140]:
# about 76.86% of traffic circles had collisions

print('Percent of traffic circles with incidents: ',
      round((~circles_collisions['year'].isnull()).sum() / (len(circles_collisions['year']))*100 , 2),'%')

Percent of traffic circles with incidents:  76.86 %


In [141]:
# fill missing values with 0 for no collision

circles_collisions.fillna(0, inplace=True)

In [143]:
circles_collisions.shape

(1798, 21)

In [144]:
circles_collisions.columns

Index(['intkey', 'shape_lat', 'shape_lng', 'unitdesc', 'primarydistrictcd',
       'installed', 'landscaping', 'survey_monument', 'trcsize', 'trcshape',
       'condition', 'inattentionind_y', 'year', 'count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved'],
      dtype='object')

In [145]:
circles_collisions[['count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries']].sum()

count                      1615.0
total fatalities              1.0
total serious injuries       23.0
total evident injuries      181.0
total possible injuries     356.0
dtype: float64

In [150]:
circles_collisions['weigh_fatalities'] = circles_collisions['total fatalities'] * 5
circles_collisions['weigh_serious'] = circles_collisions['total serious injuries'] * 4
circles_collisions['weigh_evident'] = circles_collisions['total evident injuries'] * 3
circles_collisions['weigh_possible'] = circles_collisions['total possible injuries'] * 2

In [152]:
circles_collisions.columns

Index(['intkey', 'shape_lat', 'shape_lng', 'unitdesc', 'primarydistrictcd',
       'installed', 'landscaping', 'survey_monument', 'trcsize', 'trcshape',
       'condition', 'inattentionind_y', 'year', 'count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved',
       'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible'],
      dtype='object')

In [194]:
weigh_columns = ['count', 'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible']
weigh_total = circles_collisions[weigh_columns].sum()

In [195]:
weigh_total

count               1615.0
weigh_fatalities       5.0
weigh_serious         92.0
weigh_evident        543.0
weigh_possible       712.0
dtype: float64

In [156]:
circles_collisions.head()

Unnamed: 0,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible
0,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0,2006.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0,2017.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
4,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2005.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0
5,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2006.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
6,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2011.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0


In [157]:
sev_score = circles_collisions[weigh_columns].sum(axis=1) / weigh_total.sum()
sev_score

0       0.000337
1       0.001011
4       0.001011
5       0.000337
6       0.001348
13      0.000337
14      0.000337
          ...   
6268    0.000337
6269    0.001011
6278    0.000337
6279    0.000337
6282    0.001685
6283    0.000337
6288    0.000000
Length: 1798, dtype: float64

In [159]:
circles_collisions['sev_score'] = sev_score

In [160]:
circles_collisions.head()

Unnamed: 0,intkey,shape_lat,shape_lng,unitdesc,primarydistrictcd,installed,landscaping,survey_monument,trcsize,trcshape,condition,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible,sev_score
0,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0,2006.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000337
1,31889,47.566489,-122.380088,38TH AVE SW AND SW DAKOTA ST,DISTRICT1,2000,1,0,0,CRC,GOOD,0.0,2017.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.001011
4,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2005.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.001011
5,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2006.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000337
6,37392,47.702894,-122.35005,FREMONT AVE N AND N 102ND ST,DISTRICT5,1996,1,0,0,CRC,GOOD,0.0,2011.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,0.001348


In [162]:
# circles_collisions.to_csv('../data/processed/circles_sev_score.csv')

## Intersections(circles removed) with all_sdot Severity Metric

In [235]:
intersections = pd.read_csv('../data/processed/inter_nocircles.csv', index_col=0)

In [236]:
intersections.head()

Unnamed: 0,intr_id,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,year,month,hour,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
0,18213,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10302,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4716,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11483,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6406,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [237]:
intersections.shape

(70793, 18)

In [238]:
intersections.columns

Index(['intr_id', 'intkey', 'subarea', 'unitdesc', 'arterialclasscd',
       'signal_type', 'shape_lng', 'shape_lat', 'year', 'month', 'hour',
       'pedcount', 'pedcylcount', 'vehcount', 'injuries', 'seriousinjuries',
       'fatalities', 'inattentionind_y'],
      dtype='object')

Drop columns that will be included from the all_sdot severity dataset

In [239]:
drop_inters = ['intr_id', 'year', 'month', 'hour', 'pedcount', 'pedcylcount',
       'vehcount', 'injuries', 'seriousinjuries', 'fatalities']

intersections.drop(columns=drop_inters, inplace=True)

In [240]:
intersections.head()

Unnamed: 0,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,inattentionind_y
0,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0
1,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0
2,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0
3,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0
4,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0


In [241]:
# merge collisions data with circles_acc to obtain dataset of all circles with labels of collisions or none

inters_collisions = intersections.merge(int_sev, how='left', on='intkey')

In [242]:
inters_collisions.shape

(726403, 18)

In [243]:
inters_collisions.isnull().sum()

intkey                           0
subarea                          0
unitdesc                         0
arterialclasscd                  0
signal_type                      0
shape_lng                        0
shape_lat                        0
inattentionind_y                 0
location                      7170
year                          7170
count                         7170
total fatalities              7170
total serious injuries        7170
total evident injuries        7170
total possible injuries       7170
total vehicles                7170
total pedestrians involved    7170
total bicyclists involved     7170
dtype: int64

In [244]:
# drop location, unitdesc has no nulls

inters_collisions.drop(columns='location', inplace=True)

In [245]:
# There are duplicates by intkey and year

inters_collisions.drop_duplicates(['intkey', 'year'], inplace=True)

In [246]:
# make sure all intersections are kept
intersections['intkey'].nunique()

14385

In [247]:
inters_collisions['intkey'].nunique()

14385

In [248]:
# about 83.63% of intersections(no circles) had collisions

print('Percent of intersections (excluding circles) with incidents: ',
      round((~inters_collisions['year'].isnull()).sum() / (len(inters_collisions['year']))*100 , 2),'%')

Percent of intersections (excluding circles) with incidents:  83.63 %


In [183]:
# fill missing values with 0 for no collision

inters_collisions.fillna(0, inplace=True)

In [184]:
inters_collisions.shape

(43800, 17)

In [185]:
inters_collisions.columns

Index(['intkey', 'subarea', 'unitdesc', 'arterialclasscd', 'signal_type',
       'shape_lng', 'shape_lat', 'inattentionind_y', 'year', 'count',
       'total fatalities', 'total serious injuries', 'total evident injuries',
       'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved'],
      dtype='object')

In [186]:
inters_collisions[['count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries']].sum()

count                      69860.0
total fatalities             138.0
total serious injuries      1545.0
total evident injuries      9079.0
total possible injuries    24809.0
dtype: float64

In [187]:
inters_collisions['weigh_fatalities'] = inters_collisions['total fatalities'] * 5
inters_collisions['weigh_serious'] = inters_collisions['total serious injuries'] * 4
inters_collisions['weigh_evident'] = inters_collisions['total evident injuries'] * 3
inters_collisions['weigh_possible'] = inters_collisions['total possible injuries'] * 2

In [192]:
int_weigh_columns = ['count', 'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible']
int_weigh_total = inters_collisions[int_weigh_columns].sum()

In [196]:
int_weigh_total

count               69860.0
weigh_fatalities      690.0
weigh_serious        6180.0
weigh_evident       27237.0
weigh_possible      49618.0
dtype: float64

In [197]:
inters_collisions.head()

Unnamed: 0,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible
0,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
int_sev_score = inters_collisions[int_weigh_columns].sum(axis=1) / int_weigh_total.sum()
int_sev_score

0         0.000000
1         0.000000
2         0.000000
3         0.000000
4         0.000000
5         0.000007
6         0.000026
            ...   
726392    0.000000
726393    0.000007
726394    0.000026
726399    0.000000
726400    0.000000
726401    0.000007
726402    0.000000
Length: 43800, dtype: float64

In [199]:
inters_collisions['sev_score'] = int_sev_score

In [200]:
inters_collisions.head()

Unnamed: 0,intkey,subarea,unitdesc,arterialclasscd,signal_type,shape_lng,shape_lat,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible,sev_score
0,340313,GRDWM,4TH AVE S AND S HENDERSON N ST,0.0,NONE,-122.329732,47.523051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,157936,E,WOODROW PL E AND E GARFIELD ST,0.0,NONE,-122.284745,47.633387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,37264,BLRD,12TH AVE NW AND NW 87TH ST,0.0,NONE,-122.371401,47.692058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30231,CNTRL,34TH AVE AND E HOWELL ST,3.0,NONE,-122.289176,47.617639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25752,NE,NE 63RD ST AND NE RAVENNA EB BV,1.0,NONE,-122.320648,47.674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [201]:
# inters_collisions.to_csv('../data/processed/intersections_sev_score.csv')

## Crosswalks with all_sdot Severity Metric

In [202]:
crosswalks = pd.read_csv('../data/processed/crosswalk_collisions.csv', index_col=0)

In [203]:
crosswalks.head()

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi,year,month,hour,intkey,pedcount,pedcylcount,vehcount,injuries,seriousinjuries,fatalities,inattentionind_y
0,522399,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,DISTRICT7,NW,LADER,N,N,WHT,CENTRAL,-122.336656,47.6018,,ALASKAN WAY,YESLER WAY,COLUMBIA ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,525064,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,DISTRICT2,W,LADER,Y,N,WHT,SOUTH,-122.276553,47.542459,,S HOLLY ST,44TH AVE S,45TH AVE S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,523429,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,DISTRICT4,S,LADER,N,N,WHT,NORTH,-122.342486,47.654435,SIGNAL,STONE WAY N,N 39TH ST,BRIDGE WAY N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,521447,35TH AVE SW 0390 BLOCK C SIDE ( 18) 18 FT S/O SW ANDOVER N ST,GOOD,DISTRICT1,S,LADER,Y,N,WHT,SOUTH,-122.376151,47.568308,,35TH AVE SW,SW ANDOVER N ST,SW ANDOVER S ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,524665,NE 145TH ST 0200 BLOCK C SIDE ( 5) 5 FT E/O 20TH AVE NE,GOOD,DISTRICT5,N,LADER,N,N,WHT,NORTH,-122.307324,47.73389,,NE 145TH ST,20TH AVE NE,22ND AVE NE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
crosswalks.shape

(7502, 27)

In [205]:
crosswalks.columns

Index(['crosswalkkey', 'unitdesc', 'condition', 'primarydistrictcd',
       'approach', 'marking_type', 'school', 'midblock_crosswalk', 'color',
       'maint_district', 'shape_lng', 'shape_lat', 'crosswalk_control',
       'onstreet', 'xstrlow', 'xstrhi', 'year', 'month', 'hour', 'intkey',
       'pedcount', 'pedcylcount', 'vehcount', 'injuries', 'seriousinjuries',
       'fatalities', 'inattentionind_y'],
      dtype='object')

Drop columns that will be included from the all_sdot severity dataset

In [206]:
drop_xwalks = ['year', 'month', 'hour', 'pedcount', 'pedcylcount',
       'vehcount', 'injuries', 'seriousinjuries', 'fatalities']

crosswalks.drop(columns=drop_xwalks, inplace=True)

In [212]:
crosswalks.head()

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi,intkey,inattentionind_y
0,522399,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,DISTRICT7,NW,LADER,N,N,WHT,CENTRAL,-122.336656,47.6018,,ALASKAN WAY,YESLER WAY,COLUMBIA ST,0.0,0.0
1,525064,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,DISTRICT2,W,LADER,Y,N,WHT,SOUTH,-122.276553,47.542459,,S HOLLY ST,44TH AVE S,45TH AVE S,0.0,0.0
2,523429,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,DISTRICT4,S,LADER,N,N,WHT,NORTH,-122.342486,47.654435,SIGNAL,STONE WAY N,N 39TH ST,BRIDGE WAY N,0.0,0.0
3,521447,35TH AVE SW 0390 BLOCK C SIDE ( 18) 18 FT S/O SW ANDOVER N ST,GOOD,DISTRICT1,S,LADER,Y,N,WHT,SOUTH,-122.376151,47.568308,,35TH AVE SW,SW ANDOVER N ST,SW ANDOVER S ST,0.0,0.0
4,524665,NE 145TH ST 0200 BLOCK C SIDE ( 5) 5 FT E/O 20TH AVE NE,GOOD,DISTRICT5,N,LADER,N,N,WHT,NORTH,-122.307324,47.73389,,NE 145TH ST,20TH AVE NE,22ND AVE NE,0.0,0.0


In [221]:
# merge collisions data with crosswalks to obtain dataset of all crosswalks with labels of collisions or none

xwalk_collisions = crosswalks.merge(xwalks_sev, how='left', on='crosswalkkey')

In [222]:
xwalk_collisions.shape

(14162, 28)

In [223]:
xwalk_collisions.isnull().sum()

crosswalkkey                     0
unitdesc                         0
condition                        0
primarydistrictcd                0
approach                         0
marking_type                     0
school                           0
midblock_crosswalk               0
color                            0
maint_district                   0
shape_lng                        0
shape_lat                        0
crosswalk_control                0
onstreet                         0
xstrlow                          0
xstrhi                           0
intkey                           0
inattentionind_y                 0
location                      3617
year                          3617
count                         3617
total fatalities              3617
total serious injuries        3617
total evident injuries        3617
total possible injuries       3617
total vehicles                3617
total pedestrians involved    3617
total bicyclists involved     3617
dtype: int64

In [224]:
# drop location, unitdesc has no nulls

xwalk_collisions.drop(columns='location', inplace=True)

In [225]:
# There are duplicates by intkey and year

xwalk_collisions.drop_duplicates(['crosswalkkey', 'year'], inplace=True)

In [229]:
# make sure all intersections are kept
crosswalks['crosswalkkey'].nunique()

5683

In [230]:
xwalk_collisions['crosswalkkey'].nunique()

5683

In [249]:
# about 49.95% of crosswalks had collisions

print('Percent of traffic circles with incidents: ',
      round((~xwalk_collisions['year'].isnull()).sum() / (len(xwalk_collisions['year']))*100 , 2),'%')

Percent of traffic circles with incidents:  49.95 %


In [250]:
# fill missing values with 0 for no collision

xwalk_collisions.fillna(0, inplace=True)

In [251]:
xwalk_collisions.shape

(7227, 27)

In [252]:
xwalk_collisions.columns

Index(['crosswalkkey', 'unitdesc', 'condition', 'primarydistrictcd',
       'approach', 'marking_type', 'school', 'midblock_crosswalk', 'color',
       'maint_district', 'shape_lng', 'shape_lat', 'crosswalk_control',
       'onstreet', 'xstrlow', 'xstrhi', 'intkey', 'inattentionind_y', 'year',
       'count', 'total fatalities', 'total serious injuries',
       'total evident injuries', 'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved'],
      dtype='object')

In [253]:
xwalk_collisions[['count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries']].sum()

count                      3917.0
total fatalities             43.0
total serious injuries      321.0
total evident injuries     1421.0
total possible injuries    1828.0
dtype: float64

In [254]:
xwalk_collisions['weigh_fatalities'] = xwalk_collisions['total fatalities'] * 5
xwalk_collisions['weigh_serious'] = xwalk_collisions['total serious injuries'] * 4
xwalk_collisions['weigh_evident'] = xwalk_collisions['total evident injuries'] * 3
xwalk_collisions['weigh_possible'] = xwalk_collisions['total possible injuries'] * 2

In [255]:
xw_weigh_columns = ['count', 'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible']
xw_weigh_total = xwalk_collisions[xw_weigh_columns].sum()

In [256]:
xw_weigh_total

count               3917.0
weigh_fatalities     215.0
weigh_serious       1284.0
weigh_evident       4263.0
weigh_possible      3656.0
dtype: float64

In [257]:
xwalk_collisions.head()

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi,intkey,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible
0,522399,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,DISTRICT7,NW,LADER,N,N,WHT,CENTRAL,-122.336656,47.6018,,ALASKAN WAY,YESLER WAY,COLUMBIA ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,525064,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,DISTRICT2,W,LADER,Y,N,WHT,SOUTH,-122.276553,47.542459,,S HOLLY ST,44TH AVE S,45TH AVE S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,523429,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,DISTRICT4,S,LADER,N,N,WHT,NORTH,-122.342486,47.654435,SIGNAL,STONE WAY N,N 39TH ST,BRIDGE WAY N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,521447,35TH AVE SW 0390 BLOCK C SIDE ( 18) 18 FT S/O SW ANDOVER N ST,GOOD,DISTRICT1,S,LADER,Y,N,WHT,SOUTH,-122.376151,47.568308,,35TH AVE SW,SW ANDOVER N ST,SW ANDOVER S ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,524665,NE 145TH ST 0200 BLOCK C SIDE ( 5) 5 FT E/O 20TH AVE NE,GOOD,DISTRICT5,N,LADER,N,N,WHT,NORTH,-122.307324,47.73389,,NE 145TH ST,20TH AVE NE,22ND AVE NE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [258]:
xw_sev_score = xwalk_collisions[xw_weigh_columns].sum(axis=1) / xw_weigh_total.sum()
xw_sev_score

0        0.000000
1        0.000000
2        0.000000
3        0.000000
4        0.000000
5        0.000000
6        0.000000
           ...   
14153    0.000000
14154    0.000000
14155    0.000000
14156    0.000075
14157    0.000075
14160    0.000300
14161    0.000000
Length: 7227, dtype: float64

In [259]:
xwalk_collisions['sev_score'] = xw_sev_score

In [260]:
xwalk_collisions.head()

Unnamed: 0,crosswalkkey,unitdesc,condition,primarydistrictcd,approach,marking_type,school,midblock_crosswalk,color,maint_district,shape_lng,shape_lat,crosswalk_control,onstreet,xstrlow,xstrhi,intkey,inattentionind_y,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible,sev_score
0,522399,ALASKAN WAY 0060 BLOCK C SIDE ( 35) 35 FT NW/O YESLER WAY,GOOD,DISTRICT7,NW,LADER,N,N,WHT,CENTRAL,-122.336656,47.6018,,ALASKAN WAY,YESLER WAY,COLUMBIA ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,525064,S HOLLY ST 0440 BLOCK C SIDE ( 385) 20 FT W/O 45TH AVE S,GOOD,DISTRICT2,W,LADER,Y,N,WHT,SOUTH,-122.276553,47.542459,,S HOLLY ST,44TH AVE S,45TH AVE S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,523429,STONE WAY N 0390 BLOCK C SIDE ( 115) 115 FT N/O N 39TH ST,GOOD,DISTRICT4,S,LADER,N,N,WHT,NORTH,-122.342486,47.654435,SIGNAL,STONE WAY N,N 39TH ST,BRIDGE WAY N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,521447,35TH AVE SW 0390 BLOCK C SIDE ( 18) 18 FT S/O SW ANDOVER N ST,GOOD,DISTRICT1,S,LADER,Y,N,WHT,SOUTH,-122.376151,47.568308,,35TH AVE SW,SW ANDOVER N ST,SW ANDOVER S ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,524665,NE 145TH ST 0200 BLOCK C SIDE ( 5) 5 FT E/O 20TH AVE NE,GOOD,DISTRICT5,N,LADER,N,N,WHT,NORTH,-122.307324,47.73389,,NE 145TH ST,20TH AVE NE,22ND AVE NE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
# xwalk_collisions.to_csv('../data/processed/crosswalks_sev_score.csv')

## Blocks with all_sdot Severity Metric

Need to clean seattle_streets data first

In [123]:
blocks = pd.read_csv('../data/Seattle_Streets.csv')

In [124]:
blocks.columns = blocks.columns.map(str.lower)

In [125]:
blocks.head()

Unnamed: 0,objectid,artclass,compkey,unitid,unitid2,unitidsort,unitdesc,stname_ord,xstrlo,xstrhi,artdescript,owner,status,blocknbr,speedlimit,segdir,oneway,onewaydir,flow,seglength,surfacewidth,surfacetype_1,surfacetype_2,intrlo,dirlo,intkeylo,intrhi,dirhi,nationhwysys,streettype,pvmtcondindx1,pvmtcondindx2,tranclass,trandescript,slope_pct,pvmtcategory,parkboulevard,shape_length
0,1,2.0,1006,10.0,120.0,100120.0,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,1ST AVE,SENECA ST,UNIVERSITY ST,Minor Arterial,,INSVC,1200.0,25.0,NW,N,,,306.0,48.0,PCC,AC/PCC,1ST AVE AND SENECA ST,NW,29611.0,1ST AVE AND UNIVERSITY ST,SE,N,Downtown Neighborhood,87.0,62.0,1,PRINCIPAL TRANSIT ROUTE,4.0,ART,N,305.96605
1,2,2.0,1009,10.0,150.0,100150.0,1ST AVE BETWEEN PIKE ST AND PINE ST,1ST AVE,PIKE ST,PINE ST,Minor Arterial,,INSVC,1500.0,25.0,NW,N,,,426.0,104.0,AC/PCC,PCC,1ST AVE AND PIKE ST,NW,29593.0,1ST AVE AND PINE ST,SE,N,Downtown Neighborhood,57.0,58.0,1,PRINCIPAL TRANSIT ROUTE,5.0,ART,N,426.031562
2,3,0.0,1032,15.0,80.0,150080.0,1ST AVE N BETWEEN VALLEY UPPER ST AND ALOHA ST,1ST AVE N,VALLEY UPPER ST,ALOHA ST,Not Designated,,INSVC,800.0,20.0,N,N,,,297.0,0.0,PCC,,1ST AVE N AND VALLEY UPPER ST,N,28897.0,1ST AVE N AND ALOHA ST,S,N,Neighborhood Yield Street,0.0,0.0,0,NOT DESIGNATED,17.0,NON-ART,N,297.147592
3,4,0.0,1051,15.0,230.0,150230.0,1ST AVE N BETWEEN LYNN ST AND MCGRAW S ST,1ST AVE N,LYNN ST,MCGRAW S ST,Not Designated,,INSVC,2200.0,20.0,N,N,,,175.0,25.0,AC,,1ST AVE N AND LYNN ST,N,28113.0,1ST AVE N AND MCGRAW S ST,S,N,Neighborhood Yield Street,9.0,0.0,0,NOT DESIGNATED,3.0,NON-ART,N,174.804983
4,5,0.0,1060,15.0,282.0,150282.0,1ST AVE N BETWEEN FULTON S ST AND FULTON N ST,1ST AVE N,FULTON S ST,FULTON N ST,Not Designated,,INSVC,2800.0,20.0,N,N,,,73.0,0.0,PCC,,1ST AVE N AND FULTON S ST,N,28051.0,1ST AVE N AND FULTON N ST,S,N,Neighborhood Yield Street,0.0,0.0,0,NOT DESIGNATED,5.0,NON-ART,N,73.110708


In [126]:
blocks.shape

(23806, 38)

In [127]:
# seems to be 6 rows with many missing column values
blocks.isnull().sum()

objectid             0
artclass             6
compkey              0
unitid               6
unitid2              6
unitidsort           6
unitdesc             6
stname_ord           0
xstrlo               6
xstrhi               6
artdescript          6
owner                6
status               6
blocknbr             6
speedlimit           7
segdir               6
oneway              11
onewaydir            6
flow                 6
seglength            6
surfacewidth         9
surfacetype_1     1198
surfacetype_2    23123
intrlo               9
dirlo                6
intkeylo             9
intrhi              11
dirhi                7
nationhwysys         6
streettype         793
pvmtcondindx1        6
pvmtcondindx2        6
tranclass            0
trandescript       187
slope_pct           18
pvmtcategory         6
parkboulevard        6
shape_length         0
dtype: int64

In [128]:
blocks['shape_length'].value_counts(dropna=False)

331.023148    2
331.561297    2
263.999888    2
290.993997    2
264.022730    2
263.999574    2
264.041689    2
             ..
331.872799    1
873.465661    1
229.249706    1
322.387703    1
320.219064    1
336.349619    1
201.871136    1
Name: shape_length, Length: 23798, dtype: int64

In [129]:
blocks['shape_length'].nunique()

23798

In [130]:
blocks.columns

Index(['objectid', 'artclass', 'compkey', 'unitid', 'unitid2', 'unitidsort',
       'unitdesc', 'stname_ord', 'xstrlo', 'xstrhi', 'artdescript', 'owner',
       'status', 'blocknbr', 'speedlimit', 'segdir', 'oneway', 'onewaydir',
       'flow', 'seglength', 'surfacewidth', 'surfacetype_1', 'surfacetype_2',
       'intrlo', 'dirlo', 'intkeylo', 'intrhi', 'dirhi', 'nationhwysys',
       'streettype', 'pvmtcondindx1', 'pvmtcondindx2', 'tranclass',
       'trandescript', 'slope_pct', 'pvmtcategory', 'parkboulevard',
       'shape_length'],
      dtype='object')

Drop columns that is not necessary and will be included from the all_sdot severity dataset

In [131]:
drop_blocks = ['objectid', 'artclass', 'compkey', 'unitid', 'unitid2', 'unitidsort','stname_ord', 'xstrlo', 
               'xstrhi', 'owner','status', 'blocknbr', 'onewaydir', 'flow', 'seglength', 'surfacewidth',
               'surfacetype_2', 'intrlo', 'dirlo', 'intkeylo', 'intrhi', 'dirhi', 'pvmtcondindx1',
               'pvmtcondindx2', 'pvmtcategory', 'shape_length']

blocks.drop(columns=drop_blocks, inplace=True)

In [132]:
blocks.head()

Unnamed: 0,unitdesc,artdescript,speedlimit,segdir,oneway,surfacetype_1,nationhwysys,streettype,tranclass,trandescript,slope_pct,parkboulevard
0,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N
1,1ST AVE BETWEEN PIKE ST AND PINE ST,Minor Arterial,25.0,NW,N,AC/PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,5.0,N
2,1ST AVE N BETWEEN VALLEY UPPER ST AND ALOHA ST,Not Designated,20.0,N,N,PCC,N,Neighborhood Yield Street,0,NOT DESIGNATED,17.0,N
3,1ST AVE N BETWEEN LYNN ST AND MCGRAW S ST,Not Designated,20.0,N,N,AC,N,Neighborhood Yield Street,0,NOT DESIGNATED,3.0,N
4,1ST AVE N BETWEEN FULTON S ST AND FULTON N ST,Not Designated,20.0,N,N,PCC,N,Neighborhood Yield Street,0,NOT DESIGNATED,5.0,N


## Deal with Blocks missing values

In [133]:
blocks.isnull().sum()

unitdesc            6
artdescript         6
speedlimit          7
segdir              6
oneway             11
surfacetype_1    1198
nationhwysys        6
streettype        793
tranclass           0
trandescript      187
slope_pct          18
parkboulevard       6
dtype: int64

In [134]:
blocks[blocks['surfacetype_1'].isnull()].groupby(['artdescript', 'streettype']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,unitdesc,speedlimit,segdir,oneway,surfacetype_1,nationhwysys,tranclass,trandescript,slope_pct,parkboulevard
artdescript,streettype,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Collector Arterial,Neighborhood Corridor,12,12,12,12,0,12,12,12,12,12
Collector Arterial,Urban Village Neighborhood,1,1,1,1,0,1,1,1,1,1
Minor Arterial,Downtown Neighborhood,2,2,2,2,0,2,2,2,2,2
Minor Arterial,Industrial Access,8,8,8,8,0,8,8,7,8,8
Minor Arterial,Neighborhood Corridor,3,3,3,3,0,3,3,3,3,3
Minor Arterial,Urban Center Connector,3,3,3,3,0,3,3,3,3,3
Minor Arterial,Urban Village Neighborhood,2,2,2,2,0,2,2,2,2,2
Not Designated,Alley,4,4,4,4,0,4,4,3,4,4
Not Designated,Downtown Neighborhood Access,20,20,20,20,0,20,20,18,20,20
Not Designated,Minor Industrial Access,26,26,26,26,0,26,26,25,26,26


- most Collector Arterial has surfacetype PCC
- most Minor Arterial has surfacetype AC/PCC
- most Not Designated has surfacetype PCC
- most Principal Arterial has surfacetype AC/PCC

In [135]:
blocks['surfacetype_1'].fillna(' ', inplace=True)

In [138]:
 for i, x in enumerate(blocks['surfacetype_1']):
        if x != ' ':
            x = x
        elif (blocks.loc[i, 'artdescript']=='Collector Arterial') | (blocks.loc[i, 'artdescript']=='Not Designated'):
            blocks.loc[i, 'surfacetype_1'] = 'PCC'
        else:
            blocks.loc[i, 'surfacetype_1'] = 'AC/PCC'

In [142]:
blocks['surfacetype_1'].value_counts(dropna=False)

PCC       11202
AC/PCC     4394
ST         4179
AC         3698
AC/AC       204
GRAVEL      129
Name: surfacetype_1, dtype: int64

- most 'Principal Arterial', 'Interstate/Freeway', 'State Route/Freeway' are Urban Center Connector streettype
- most 'Collector Arterial', 'Minor Arterial' is Neighborhood Corridor
- most 'Not Designated' is Neighborhood Yield Street

In [173]:
blocks['streettype'].fillna(' ', inplace=True)

In [183]:
 for i, x in enumerate(blocks['streettype']):
        if x != ' ':
            x = x
        elif (blocks.loc[i, 'artdescript'] in ['Principal Arterial', 'Interstate/Freeway', 'State Route/Freeway', 'County Arterial']):
            blocks.loc[i, 'streettype'] = 'Urban Center Connector'
        elif (blocks.loc[i, 'artdescript'] in ['Collector Arterial', 'Minor Arterial']):
            blocks.loc[i, 'streettype'] = 'Neighborhood Corridor'
        else:
            blocks.loc[i, 'streettype'] = 'Neighborhood Yield Street'

In [185]:
blocks['streettype'].value_counts(dropna=False)

Neighborhood Yield Street            13072
Urban Village Neighborhood Access     3074
Neighborhood Corridor                 2267
Urban Center Connector                1632
Urban Village Neighborhood            1144
Urban Village Main                     811
Minor Industrial Access                594
Industrial Access                      460
Downtown                               298
Downtown Neighborhood                  227
Downtown Neighborhood Access           197
Alley                                   30
Name: streettype, dtype: int64

- Average slope_pct for neighborhood streets is 4
- Average slope_pct for urban center connector is 3

In [208]:
blocks['slope_pct'].fillna(' ', inplace=True)

In [209]:
for i, x in enumerate(blocks['slope_pct']):
        if x != ' ':
            x = x
        elif (blocks.loc[i, 'streettype'] in ['Neighborhood Corridor', 'Neighborhood Yield Street']):
            blocks.loc[i, 'slope_pct'] = 4
        else:
            blocks.loc[i, 'slope_pct'] = 3

In [216]:
# drop the rest of missing values due to the whole row being mostly missing

blocks.dropna(axis=0, inplace=True)

In [217]:
blocks.isnull().sum()

unitdesc         0
artdescript      0
speedlimit       0
segdir           0
oneway           0
surfacetype_1    0
nationhwysys     0
streettype       0
tranclass        0
trandescript     0
slope_pct        0
parkboulevard    0
dtype: int64

In [218]:
blocks.head()

Unnamed: 0,unitdesc,artdescript,speedlimit,segdir,oneway,surfacetype_1,nationhwysys,streettype,tranclass,trandescript,slope_pct,parkboulevard
0,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4,N
1,1ST AVE BETWEEN PIKE ST AND PINE ST,Minor Arterial,25.0,NW,N,AC/PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,5,N
2,1ST AVE N BETWEEN VALLEY UPPER ST AND ALOHA ST,Not Designated,20.0,N,N,PCC,N,Neighborhood Yield Street,0,NOT DESIGNATED,17,N
3,1ST AVE N BETWEEN LYNN ST AND MCGRAW S ST,Not Designated,20.0,N,N,AC,N,Neighborhood Yield Street,0,NOT DESIGNATED,3,N
4,1ST AVE N BETWEEN FULTON S ST AND FULTON N ST,Not Designated,20.0,N,N,PCC,N,Neighborhood Yield Street,0,NOT DESIGNATED,5,N


In [223]:
blocks.rename(columns={'unitdesc':'location'}, inplace=True)

In [224]:
# merge collisions data with blocks to obtain dataset of all blocks with labels of collisions or none

blocks_collisions = blocks.merge(blocks_sev, how='left', on='location')

In [225]:
blocks_collisions.shape

(87248, 21)

In [226]:
blocks_collisions.isnull().sum()

location                         0
artdescript                      0
speedlimit                       0
segdir                           0
oneway                           0
surfacetype_1                    0
nationhwysys                     0
streettype                       0
tranclass                        0
trandescript                     0
slope_pct                        0
parkboulevard                    0
year                          6503
count                         6503
total fatalities              6503
total serious injuries        6503
total evident injuries        6503
total possible injuries       6503
total vehicles                6503
total pedestrians involved    6503
total bicyclists involved     6503
dtype: int64

In [227]:
# There are duplicates by intkey and year

blocks_collisions.drop_duplicates(['location', 'year'], inplace=True)

In [229]:
# make sure all intersections are kept
blocks['location'].nunique()

23619

In [231]:
blocks_collisions['location'].nunique()

23619

In [234]:
# about 92.55% of blocks had collisions

print('Percent of blocks with incidents: ',
      round((~blocks_collisions['year'].isnull()).sum() / (len(blocks_collisions['year']))*100 , 2),'%')

Percent of blocks with incidents:  92.55 %


In [235]:
# fill missing values with 0 for no collision

blocks_collisions.fillna(0, inplace=True)

In [236]:
blocks_collisions.shape

(87248, 21)

In [237]:
blocks_collisions.columns

Index(['location', 'artdescript', 'speedlimit', 'segdir', 'oneway',
       'surfacetype_1', 'nationhwysys', 'streettype', 'tranclass',
       'trandescript', 'slope_pct', 'parkboulevard', 'year', 'count',
       'total fatalities', 'total serious injuries', 'total evident injuries',
       'total possible injuries', 'total vehicles',
       'total pedestrians involved', 'total bicyclists involved'],
      dtype='object')

In [238]:
blocks_collisions[['count', 'total fatalities',
       'total serious injuries', 'total evident injuries',
       'total possible injuries']].sum()

count                      141939.0
total fatalities              175.0
total serious injuries       1574.0
total evident injuries       8258.0
total possible injuries     26801.0
dtype: float64

In [239]:
blocks_collisions['weigh_fatalities'] = blocks_collisions['total fatalities'] * 5
blocks_collisions['weigh_serious'] = blocks_collisions['total serious injuries'] * 4
blocks_collisions['weigh_evident'] = blocks_collisions['total evident injuries'] * 3
blocks_collisions['weigh_possible'] = blocks_collisions['total possible injuries'] * 2

In [240]:
bl_weigh_columns = ['count', 'weigh_fatalities', 'weigh_serious', 'weigh_evident', 'weigh_possible']
bl_weigh_total = blocks_collisions[bl_weigh_columns].sum()

In [241]:
bl_weigh_total

count               141939.0
weigh_fatalities       875.0
weigh_serious         6296.0
weigh_evident        24774.0
weigh_possible       53602.0
dtype: float64

In [242]:
blocks_collisions.head()

Unnamed: 0,location,artdescript,speedlimit,segdir,oneway,surfacetype_1,nationhwysys,streettype,tranclass,trandescript,slope_pct,parkboulevard,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible
0,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2004.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2005.0,2.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0
2,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2006.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2007.0,4.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2008.0,3.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0


In [243]:
bl_sev_score = blocks_collisions[bl_weigh_columns].sum(axis=1) / bl_weigh_total.sum()
bl_sev_score

0        0.000004
1        0.000018
2        0.000013
3        0.000018
4        0.000013
5        0.000018
6        0.000022
7        0.000004
8        0.000022
9        0.000022
10       0.000013
11       0.000053
12       0.000026
13       0.000035
14       0.000013
15       0.000009
16       0.000035
17       0.000066
18       0.000062
19       0.000026
20       0.000053
21       0.000013
22       0.000070
23       0.000057
24       0.000084
25       0.000035
26       0.000053
27       0.000031
28       0.000044
29       0.000057
           ...   
87218    0.000009
87219    0.000013
87220    0.000004
87221    0.000004
87222    0.000004
87223    0.000018
87224    0.000031
87225    0.000018
87226    0.000004
87227    0.000004
87228    0.000004
87229    0.000009
87230    0.000004
87231    0.000004
87232    0.000004
87233    0.000000
87234    0.000004
87235    0.000009
87236    0.000048
87237    0.000004
87238    0.000004
87239    0.000013
87240    0.000013
87241    0.000022
87242    0

In [244]:
blocks_collisions['sev_score'] = bl_sev_score

In [245]:
blocks_collisions.head()

Unnamed: 0,location,artdescript,speedlimit,segdir,oneway,surfacetype_1,nationhwysys,streettype,tranclass,trandescript,slope_pct,parkboulevard,year,count,total fatalities,total serious injuries,total evident injuries,total possible injuries,total vehicles,total pedestrians involved,total bicyclists involved,weigh_fatalities,weigh_serious,weigh_evident,weigh_possible,sev_score
0,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2004.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4e-06
1,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2005.0,2.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0,1.8e-05
2,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2006.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.3e-05
3,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2007.0,4.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.8e-05
4,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,Minor Arterial,25.0,NW,N,PCC,N,Downtown Neighborhood,1,PRINCIPAL TRANSIT ROUTE,4.0,N,2008.0,3.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.3e-05


In [246]:
# blocks_collisions.to_csv('../data/processed/blocks_sev_score.csv')