In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline

# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

import os
import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore") 

from datetime import datetime
from IPython.display import display, Markdown

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the cleaned collision records and keep only those located at an intersection.
collisions_df = (
    pd.read_pickle('../data/processed/cleaned_data.pkl')
    .loc[lambda frame: frame['ADDRTYPE'] == 'Intersection']
)

In [3]:
# Load the intersection table and expose its COMPKEY column under the name
# INTKEY, which is the key the collision records are joined on.
intersections_df = (
    pd.read_csv('../data/Intersections.csv')
    .rename(columns={'COMPKEY': 'INTKEY'})
)

In [4]:
# Attach the intersection attributes to every collision (inner join on INTKEY).
collisions_df = collisions_df.merge(intersections_df, on='INTKEY')

# INCDATE is a string whose first four characters are the year.
collisions_df['YEAR'] = collisions_df.INCDATE.str[:4].astype('int64')

# UNITDESC has the form "<street> AND <street>". Split it once and pick the
# first two parts; a description without ' AND ' (or a missing one) yields NaN
# for the absent part instead of aborting the whole cell with an exception.
street_parts = collisions_df.UNITDESC.str.split(' AND ')
collisions_df['name1'] = street_parts.str[0]
collisions_df['name2'] = street_parts.str[1]

In [5]:
def load_unique(df, yr):
    """Average ``AAWDT`` per ``STNAME`` and tag the result with a year.

    Parameters
    ----------
    df : DataFrame with at least the columns ``STNAME`` and ``AAWDT``.
    yr : value convertible with ``int()``; stored in the ``YEAR`` column.

    Returns
    -------
    DataFrame with columns ``STNAME``, ``AAWDT`` (group mean) and ``YEAR``,
    one row per distinct ``STNAME``, on a fresh 0..n-1 index.
    """
    street_means = df.groupby('STNAME')['AAWDT'].mean()
    return street_means.reset_index().assign(YEAR=int(yr))
# The street-name and traffic-count column labels differ between the yearly files,
# so the helpers below pick whichever known label is present.
def name_check(lst):
    """Pick the street-name column label to use from ``lst``.

    Returns ``'STNAME'`` if it is in ``lst``, otherwise ``'FIRST_STNAME_ORD'``
    if that is in ``lst``; in every other case returns ``'STNAME_ORD'``
    (without checking that it is actually present).
    """
    for candidate in ('STNAME', 'FIRST_STNAME_ORD'):
        if candidate in lst:
            return candidate
    return 'STNAME_ORD'

def metric_check(lst):
    """Pick the traffic-count column label to use from ``lst``.

    Preference order is ``'AAWDT'``, then ``'COUNTAAWDT'``; when neither is
    in ``lst`` the function returns ``'AWDT'`` (without checking that it is
    actually present).
    """
    preferred = ('AAWDT', 'COUNTAAWDT')
    return next((label for label in preferred if label in lst), 'AWDT')

In [6]:
folder = '../data/counts/'

# Consider only the CSV files in the folder. Anything else that happens to live
# there (e.g. `.ipynb_checkpoints`, `.DS_Store`) is ignored rather than being
# skipped by position, so the result no longer depends on what sorts first.
count_files = sorted(entry for entry in os.listdir(folder)
                     if entry.lower().endswith('.csv'))
if not count_files:
    raise FileNotFoundError('no traffic count CSV files found in ' + folder)

# One averaged frame per file; the first four characters of the file name are the year.
yearly_counts = []
for count_file in count_files:
    raw_counts = pd.read_csv(os.path.join(folder, count_file))

    # Normalise whichever street-name / count labels this file uses.
    name_column = name_check(raw_counts.columns)
    metric_column = metric_check(raw_counts.columns)
    raw_counts = raw_counts.rename(columns={name_column: 'STNAME', metric_column: 'AAWDT'})

    yearly_counts.append(load_unique(raw_counts[['STNAME', 'AAWDT']], count_file[:4]))

# We'll only take the streets we can consistently use for now: those that
# appear in every file.
common_streets = set.intersection(*(set(counts.STNAME) for counts in yearly_counts))

# Concatenate once (instead of growing the frame inside the loop), in file order.
traffic_df = pd.concat(
    [counts[counts.STNAME.isin(common_streets)] for counts in yearly_counts],
    sort=False,
)
traffic_df = traffic_df.drop_duplicates().reset_index(drop=True)

In [8]:
# Display the assembled traffic-count table (columns STNAME, AAWDT, YEAR).
traffic_df

Unnamed: 0,STNAME,AAWDT,YEAR
0,10TH AVE E,15300.0,2007
1,11TH AVE NE,10100.0,2007
2,12TH AVE,12900.0,2007
3,12TH AVE E,9100.0,2007
4,12TH AVE NE,8800.0,2007
5,12TH AVE S,18800.0,2007
6,14TH AVE,8100.0,2007
7,14TH AVE S,16700.0,2007
8,15TH AVE NE,11960.0,2007
9,15TH AVE NW,32900.0,2007


In [9]:
# Join the traffic counts onto the collisions twice: once using the first
# street of the intersection (name1) as STNAME, once using the second (name2).
# Both are default (inner) merges on STNAME and YEAR.
concat1, concat2 = (
    collisions_df.rename(columns={street_column: 'STNAME'})
                 .merge(traffic_df, on=['STNAME', 'YEAR'])
    for street_column in ('name1', 'name2')
)

In [10]:
# Stack the two merges (first-street matches, then second-street matches) on a fresh index.
merge_df = pd.concat([concat1, concat2], ignore_index=True)

In [11]:
# List the columns of the stacked merge result.
merge_df.columns

Index(['X_x', 'Y_x', 'OBJECTID_x', 'INCKEY', 'ADDRTYPE', 'INTKEY', 'LOCATION',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES',
       'SERIOUSINJURIES', 'FATALITIES', 'INCDATE', 'JUNCTIONTYPE',
       'SDOT_COLCODE', 'UNDERINFL', 'ST_COLCODE', 'CROSSWALKKEY', 'SPEEDING_Y',
       'INATTENTIONIND_Y', 'HITPARKEDCAR_Y', 'PEDROWNOTGRNT_Y',
       'WEATHER_Adverse', 'WEATHER_Good', 'WEATHER_Unknown',
       'ROADCOND_Adverse', 'ROADCOND_Dry', 'ROADCOND_Unknown',
       'LIGHTCOND_Dark', 'LIGHTCOND_Daylight', 'LIGHTCOND_Unknown',
       'LIGHTCOND_VeryDark', 'SEVERITYCODE_Injury',
       'SEVERITYCODE_PropertyDamage', 'SEVERITYCODE_Unknown', 'X_y', 'Y_y',
       'OBJECTID_y', 'INTR_ID', 'GIS_XCOORD', 'GIS_YCOORD', 'COMPTYPE',
       'UNITID', 'SUBAREA', 'UNITDESC', 'ARTERIALCLASSCD', 'SIGNAL_MAINT_DIST',
       'SIGNAL_TYPE', 'SHAPE_LNG', 'SHAPE_LAT', 'YEAR', 'STNAME', 'name2',
       'AAWDT', 'name1'],
      dtype='object')

In [13]:
# Columns carried forward, in output order.
columns = [
    # keys and traffic volume
    'INCKEY', 'INTKEY', 'AAWDT', 'YEAR',
    # intersection attributes
    'ARTERIALCLASSCD', 'SIGNAL_TYPE', 'LOCATION', 'SUBAREA',
    # weather / road / light indicators
    'WEATHER_Adverse', 'WEATHER_Good', 'WEATHER_Unknown',
    'ROADCOND_Adverse', 'ROADCOND_Dry', 'ROADCOND_Unknown',
    'LIGHTCOND_Dark', 'LIGHTCOND_Daylight', 'LIGHTCOND_Unknown', 'LIGHTCOND_VeryDark',
    # participants and outcomes
    'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'SEVERITYCODE_Injury',
    'FATALITIES', 'SERIOUSINJURIES', 'INJURIES', 'UNDERINFL',
]
# A collision can appear once per matched street; keep its first occurrence only.
merge_df = merge_df.drop_duplicates(subset='INCKEY')[columns]

In [14]:
# Build one record per (intersection, year):
#   [INTKEY, LOCATION, year, SIGNAL_TYPE, ARTERIALCLASSCD, SUBAREA, risk]
# where risk is the number of collision rows for that intersection and year
# divided by the AAWDT of the first such row, or 0 when there are no rows.
intersection_dictionary = defaultdict(list)
years = range(2007, 2019)
c = 0  # running record number, used as the dictionary key

# Group once instead of re-filtering the whole frame for every intersection and
# every year. sort=False keeps intersections in order of first appearance.
for intersection, inter_slice in merge_df.groupby('INTKEY', sort=False):

    # Descriptive attributes are taken from the intersection's first row.
    name = inter_slice['LOCATION'].iloc[0]
    subarea = inter_slice['SUBAREA'].iloc[0]
    signal_type = inter_slice['SIGNAL_TYPE'].iloc[0]
    art_class = inter_slice['ARTERIALCLASSCD'].iloc[0]

    for year in years:
        yr_slice = inter_slice[inter_slice.YEAR == year]
        num_accidents = yr_slice.shape[0]

        if num_accidents == 0:
            pct = 0
        else:
            # NOTE(review): an AAWDT of 0 or NaN propagates as inf/NaN here - confirm
            # whether such rows can occur and how they should be treated.
            pct = num_accidents / yr_slice['AAWDT'].iloc[0]

        intersection_dictionary[c].extend(
            [intersection, name, year, signal_type, art_class, subarea, pct])
        c += 1

In [15]:
# Turn the collected records into a frame: one row per dictionary entry,
# indexed by the entry's key.
record_columns = ['intkey', 'name', 'year', 'signal_type',
                  'arterial_class', 'subarea', 'risk']
final_df = pd.DataFrame(
    data=list(intersection_dictionary.values()),
    index=list(intersection_dictionary.keys()),
    columns=record_columns,
)

In [16]:
# Save final_df as a pickle in the processed-data folder.
final_df.to_pickle('../data/processed/intersection_w_normalization.pkl')

In [17]:
# Collisions whose INCKEY did not make it into merge_df.
collisions_df.loc[~collisions_df['INCKEY'].isin(merge_df['INCKEY'])]

Unnamed: 0,X_x,Y_x,OBJECTID_x,INCKEY,ADDRTYPE,INTKEY,LOCATION,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,FATALITIES,INCDATE,JUNCTIONTYPE,SDOT_COLCODE,UNDERINFL,ST_COLCODE,CROSSWALKKEY,SPEEDING_Y,INATTENTIONIND_Y,HITPARKEDCAR_Y,PEDROWNOTGRNT_Y,WEATHER_Adverse,WEATHER_Good,WEATHER_Unknown,ROADCOND_Adverse,ROADCOND_Dry,ROADCOND_Unknown,LIGHTCOND_Dark,LIGHTCOND_Daylight,LIGHTCOND_Unknown,LIGHTCOND_VeryDark,SEVERITYCODE_Injury,SEVERITYCODE_PropertyDamage,SEVERITYCODE_Unknown,X_y,Y_y,OBJECTID_y,INTR_ID,GIS_XCOORD,GIS_YCOORD,COMPTYPE,UNITID,SUBAREA,UNITDESC,ARTERIALCLASSCD,SIGNAL_MAINT_DIST,SIGNAL_TYPE,SHAPE_LNG,SHAPE_LAT,YEAR,name1,name2
0,-122.320780,47.614076,14172,26463,Intersection,29745.0,BROADWAY AND E PIKE ST,4,0,0,2,1,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,0,28,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,-122.320780,47.614076,5948,11682,1.273562e+06,227541.54870,13,40120,E,BROADWAY AND E PIKE ST,2.0,,CITY,-122.320780,47.614076,2004,BROADWAY,E PIKE ST
1,-122.320780,47.614076,17447,30147,Intersection,29745.0,BROADWAY AND E PIKE ST,2,1,0,1,1,0,0,2004/01/29 00:00:00+00,At Intersection (intersection related),24.0,0,2,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,0,-122.320780,47.614076,5948,11682,1.273562e+06,227541.54870,13,40120,E,BROADWAY AND E PIKE ST,2.0,,CITY,-122.320780,47.614076,2004,BROADWAY,E PIKE ST
2,-122.320780,47.614076,16960,29864,Intersection,29745.0,BROADWAY AND E PIKE ST,7,0,0,4,0,0,0,2004/02/14 00:00:00+00,At Intersection (intersection related),11.0,0,28,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,-122.320780,47.614076,5948,11682,1.273562e+06,227541.54870,13,40120,E,BROADWAY AND E PIKE ST,2.0,,CITY,-122.320780,47.614076,2004,BROADWAY,E PIKE ST
3,-122.320780,47.614076,9145,22338,Intersection,29745.0,BROADWAY AND E PIKE ST,5,0,0,2,0,0,0,2004/04/06 00:00:00+00,At Intersection (intersection related),11.0,0,29,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,-122.320780,47.614076,5948,11682,1.273562e+06,227541.54870,13,40120,E,BROADWAY AND E PIKE ST,2.0,,CITY,-122.320780,47.614076,2004,BROADWAY,E PIKE ST
4,-122.320780,47.614076,12144,24415,Intersection,29745.0,BROADWAY AND E PIKE ST,2,1,0,1,1,0,0,2004/06/15 00:00:00+00,At Intersection (intersection related),24.0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,-122.320780,47.614076,5948,11682,1.273562e+06,227541.54870,13,40120,E,BROADWAY AND E PIKE ST,2.0,,CITY,-122.320780,47.614076,2004,BROADWAY,E PIKE ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65476,-122.386261,47.522416,220104,331420,Intersection,34735.0,41ST AVE SW AND SW BARTON ST,1,0,0,1,1,0,0,2020/05/10 00:00:00+00,At Intersection (intersection related),28.0,0,50,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,-122.386261,47.522416,12742,18202,1.256740e+06,194433.29600,13,75325,SW,41ST AVE SW AND SW BARTON ST,2.0,,NONE,-122.386261,47.522416,2020,41ST AVE SW,SW BARTON ST
65477,-122.294896,47.711999,219654,331475,Intersection,36651.0,31ST AVE NE AND NE 115TH ST,2,0,0,2,1,0,0,2020/05/12 00:00:00+00,At Intersection (intersection related),11.0,0,10,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,-122.294896,47.711999,9808,3426,1.280624e+06,263131.93415,13,227304,N,31ST AVE NE AND NE 115TH ST,3.0,,NONE,-122.294896,47.711999,2020,31ST AVE NE,NE 115TH ST
65478,-122.282843,47.566194,219143,331484,Intersection,32701.0,CASCADIA AVE S AND S DAKOTA ST,2,0,0,2,1,0,0,2020/05/13 00:00:00+00,At Intersection (intersection related),11.0,0,10,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,-122.282843,47.566194,10986,15062,1.282589e+06,209900.60993,13,59721,SE,CASCADIA AVE S AND S DAKOTA ST,0.0,,NONE,-122.282843,47.566194,2020,CASCADIA AVE S,S DAKOTA ST
65479,-122.339120,47.688723,218592,331575,Intersection,24221.0,ASHWORTH AVE N AND N 82ND ST,2,0,0,2,0,0,0,2020/05/21 00:00:00+00,At Intersection (intersection related),11.0,0,10,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,-122.339120,47.688723,11961,5027,1.269572e+06,254853.19698,13,4107,NW,ASHWORTH AVE N AND N 82ND ST,0.0,,NONE,-122.339120,47.688723,2020,ASHWORTH AVE N,N 82ND ST


In [18]:
# Sanity check: rows whose INCKEY occurs more than once in merge_df, ordered by INCKEY.
inckey_counts = merge_df.INCKEY.value_counts()
repeated_inckeys = inckey_counts[inckey_counts > 1].index
merge_df[merge_df.INCKEY.isin(repeated_inckeys)].sort_values(by='INCKEY')

Unnamed: 0,INCKEY,INTKEY,AAWDT,YEAR,ARTERIALCLASSCD,SIGNAL_TYPE,LOCATION,SUBAREA,WEATHER_Adverse,WEATHER_Good,WEATHER_Unknown,ROADCOND_Adverse,ROADCOND_Dry,ROADCOND_Unknown,LIGHTCOND_Dark,LIGHTCOND_Daylight,LIGHTCOND_Unknown,LIGHTCOND_VeryDark,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,SEVERITYCODE_Injury,FATALITIES,SERIOUSINJURIES,INJURIES,UNDERINFL
