In [47]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Plot theme plus DataFrame display limits for the rest of the notebook
plt.style.use('ggplot')
display_limits = {
    'display.max_columns': 50,
    'display.max_rows': 15,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [48]:
# Load the raw collisions CSV, parsing the INCDTTM timestamp column and
# exposing it under the name 'Datetime'.
#
# The nested-dict form of `parse_dates` and the `infer_datetime_format`
# keyword are both deprecated in recent pandas releases, so the column is
# parsed by name and then renamed. This form works on old and new pandas.
# Note: the renamed column keeps INCDTTM's original position instead of
# being moved to the front of the frame.
collisions = (
    pd.read_csv('../data/external/Collisions.csv', parse_dates=['INCDTTM'])
    .rename(columns={'INCDTTM': 'Datetime'})
)

In [49]:
# Index the frame by collision timestamp, order it by that index, and
# discard the columns that are not carried forward in this notebook
unused_columns = ['EXCEPTRSNDESC', 'EXCEPTRSNCODE', 'REPORTNO', 'STATUS']
collisions = collisions.set_index('Datetime').sort_index().drop(columns=unused_columns)

In [50]:
# Summarize the index range, column dtypes and non-null counts
collisions.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 220436 entries, 2003-10-06 00:00:00 to 2020-07-14 00:00:00
Data columns (total 35 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   X                212975 non-null  float64
 1   Y                212975 non-null  float64
 2   OBJECTID         220436 non-null  int64  
 3   INCKEY           220436 non-null  int64  
 4   COLDETKEY        220436 non-null  int64  
 5   ADDRTYPE         216729 non-null  object 
 6   INTKEY           71516 non-null   float64
 7   LOCATION         215860 non-null  object 
 8   SEVERITYCODE     220435 non-null  object 
 9   SEVERITYDESC     220436 non-null  object 
 10  COLLISIONTYPE    194284 non-null  object 
 11  PERSONCOUNT      220436 non-null  int64  
 12  PEDCOUNT         220436 non-null  int64  
 13  PEDCYLCOUNT      220436 non-null  int64  
 14  VEHCOUNT         220436 non-null  int64  
 15  INJURIES         220436 non-null  int64  
 16  SERI

In [51]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
collisions.describe()

Unnamed: 0,X,Y,OBJECTID,INCKEY,COLDETKEY,INTKEY,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,FATALITIES,SDOT_COLCODE,SDOTCOLNUM,SEGLANEKEY,CROSSWALKKEY
count,212975.0,212975.0,220436.0,220436.0,220436.0,71516.0,220436.0,220436.0,220436.0,220436.0,220436.0,220436.0,220436.0,220435.0,127205.0,220436.0,220436.0
mean,-122.330748,47.620232,110218.5,143893.625937,144116.360395,37614.72079,2.227145,0.038156,0.02731,1.730697,0.373868,0.015165,0.001674,13.382362,7971063.0,262.425969,9617.439
std,0.030042,0.056009,63634.536307,88451.01328,88818.314232,51915.702891,1.471406,0.201881,0.164398,0.829041,0.732352,0.158077,0.044493,7.305444,2611523.0,3254.921395,71609.64
min,-122.419091,47.495573,1.0,1001.0,1001.0,23807.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1007024.0,0.0,0.0
25%,-122.34928,47.577331,55109.75,71395.75,71395.75,28652.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,11.0,6007029.0,0.0,0.0
50%,-122.330348,47.616106,110218.5,126707.5,126707.5,29973.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,11.0,8033011.0,0.0,0.0
75%,-122.311998,47.664261,165327.25,208526.25,208726.25,33983.0,3.0,0.0,0.0,2.0,1.0,0.0,0.0,14.0,10181010.0,0.0,0.0
max,-122.238949,47.734142,220436.0,332640.0,334140.0,757580.0,93.0,6.0,2.0,15.0,78.0,41.0,5.0,87.0,13072020.0,525241.0,5239700.0


In [40]:
# Build the pandas-profiling report; minimal=True is used because the
# dataset is large
profile = ProfileReport(
    collisions,
    title='Collisions Profile Report',
    minimal=True,
)
# HTML export is left disabled; uncomment to write the report to disk
#profile.to_file('../reports/collisions_profiling.html')


HBox(children=(FloatProgress(value=0.0, description='variables', max=39.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




### Notes after initial exploration
1. Use 'INTKEY' to identify intersections.
2. 'SEVERITYCODE' can be used to cross-reference the intersection with severity. How do we reconcile this with 'SERIOUSINJURIES' and 'FATALITIES'?
3. 'SPEEDING' is missing a lot, but this could be a helpful feature.
4. 'JUNCTIONTYPE' or 'ADDRTYPE' will need one-hot encoding (OHE) for logistic regression.
5. Target variable? Create 'Dangerous' based on what criteria?



ML algorithms: we can use GBC or RF if we don't want to worry about multicollinearity or preprocessing (OHE/normalization).

In [52]:
# Preview the first rows of the cleaned, datetime-indexed frame
collisions.head()

Unnamed: 0_level_0,X,Y,OBJECTID,INCKEY,COLDETKEY,ADDRTYPE,INTKEY,LOCATION,SEVERITYCODE,SEVERITYDESC,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,FATALITIES,INCDATE,JUNCTIONTYPE,SDOT_COLCODE,SDOT_COLDESC,INATTENTIONIND,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
2003-10-06,-122.320755,47.608073,1680,3663,3663,Intersection,29797.0,BROADWAY AND CHERRY ST,0,Unknown,,0,0,0,0,0,0,0,2003/10/06 00:00:00+00,,,,,,,,,Y,3279003.0,,,,0,0,N
2004-01-01,,,11627,25040,25040,Block,,BATTERY ST TUNNEL NB BETWEEN ALASKAN WY VI NB ...,2,Injury Collision,Rear Ended,2,0,0,2,1,0,0,2004/01/01 00:00:00+00,Mid-Block (not related to intersection),14.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, REAR END",,0.0,Raining,Dry,Dark - Street Lights On,,4001002.0,,13.0,From same direction - both going straight - bo...,0,0,N
2004-01-01,-122.31876,47.604359,12525,24635,24635,Block,,E ALDER ST BETWEEN 10TH AVE AND 11TH AVE,1,Property Damage Only Collision,Parked Car,2,0,0,2,0,0,0,2004/01/01 00:00:00+00,Mid-Block (not related to intersection),16.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, LEFT SIDE ...",,0.0,Snowing,Snow/Slush,Unknown,,4001026.0,,32.0,One parked--one moving,0,0,N
2004-01-01,-122.32078,47.614076,14172,26463,26463,Intersection,29745.0,BROADWAY AND E PIKE ST,2,Injury Collision,Left Turn,4,0,0,2,1,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END ...",,0.0,Overcast,Wet,Dark - Street Lights On,,4001015.0,,28.0,From opposite direction - one left turn - one ...,0,0,N
2004-01-01,,,16515,29248,29248,,,,0,Unknown,,0,0,0,0,0,0,0,2004/01/01 00:00:00+00,,0.0,NOT ENOUGH INFORMATION / NOT APPLICABLE,,,,,,,4001030.0,,,,0,0,N


In [62]:
# Intersections ranked by number of recorded collisions.
# count() gives the non-null entries per column for each INTKEY; OBJECTID is
# fully populated, so its count is the number of collisions at that intersection.
per_intersection = collisions.groupby('INTKEY').count()
per_intersection.sort_values('OBJECTID', ascending=False)


Unnamed: 0_level_0,X,Y,OBJECTID,INCKEY,COLDETKEY,ADDRTYPE,LOCATION,SEVERITYCODE,SEVERITYDESC,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,FATALITIES,INCDATE,JUNCTIONTYPE,SDOT_COLCODE,SDOT_COLDESC,INATTENTIONIND,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
INTKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
29973.0,272,272,272,272,272,272,272,272,272,249,272,272,272,272,272,272,272,272,272,272,272,22,249,248,248,248,2,159,6,262,249,272,272,272
29933.0,173,173,173,173,173,173,173,173,173,161,173,173,173,173,173,173,173,173,173,173,173,15,161,158,160,161,18,105,1,166,161,173,173,173
29913.0,145,145,145,145,145,145,145,145,145,137,145,145,145,145,145,145,145,145,145,145,145,14,137,137,136,137,10,80,2,142,137,145,145,145
29549.0,143,143,143,143,143,143,143,143,143,136,143,143,143,143,143,143,143,143,143,143,143,10,136,136,136,136,1,107,0,138,136,143,143,143
29761.0,136,136,136,136,136,136,136,136,136,126,136,136,136,136,136,136,136,136,136,136,136,18,126,126,126,126,16,76,2,132,126,136,136,136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31621.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1
31633.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,1,1,1,1,1
31644.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1
31646.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1
