In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline

pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

import os
import warnings
warnings.filterwarnings("ignore") 

from datetime import datetime
from IPython.display import display, Markdown

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one. Any bare expression in later cells will therefore be rendered.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the cleaned collision records and keep only those whose address type
# is an intersection. (Pickle files should only be loaded from trusted sources.)
collisions_df = (
    pd.read_pickle('../data/processed/cleaned_data.pkl')
    .loc[lambda frame: frame['ADDRTYPE'] == 'Intersection']
)

In [3]:
# Read the intersection inventory and expose its COMPKEY column as INTKEY,
# the key name used by the collision records.
intersections_df = (
    pd.read_csv('../data/Intersections.csv')
    .rename(columns={'COMPKEY': 'INTKEY'})
)

In [4]:
# Attach intersection attributes to every collision (inner join on INTKEY).
# NOTE: this reassigns collisions_df, so re-running the cell merges the
# intersection columns a second time; re-run from cell 2 instead.
collisions_df = collisions_df.merge(intersections_df, on='INTKEY')

# INCDATE strings start with the four-digit year (e.g. "2003/10/06 ...").
collisions_df['YEAR'] = collisions_df.INCDATE.str[:4].astype('int64')

# UNITDESC has the form "<street> AND <street>". Split once and pick the
# parts with the vectorized .str accessor: a description without " AND "
# yields NaN for name2 instead of raising IndexError mid-apply.
street_parts = collisions_df.UNITDESC.str.split(' AND ')
collisions_df['name1'] = street_parts.str[0]
collisions_df['name2'] = street_parts.str[1]

In [5]:
def load_unique(df, yr):
    """Average AAWDT per street name and tag the result with a year.

    Parameters
    ----------
    df : DataFrame with at least the columns 'STNAME' and 'AAWDT'.
    yr : value convertible with int(); stored in every row as 'YEAR'.

    Returns
    -------
    DataFrame with columns 'STNAME', 'AAWDT' (the per-street mean) and
    'YEAR', one row per distinct street name.
    """
    street_means = df.groupby('STNAME')['AAWDT'].mean().reset_index()
    return street_means.assign(YEAR=int(yr))
# The street-name column has been labelled differently across yearly exports.
def name_check(lst):
    """Return the street-name column label to use for a collection of labels.

    'STNAME' is preferred, then 'FIRST_STNAME_ORD'; if neither is present
    the function returns 'STNAME_ORD' without checking that it exists.
    """
    preferred = ('STNAME', 'FIRST_STNAME_ORD')
    return next((label for label in preferred if label in lst), 'STNAME_ORD')

def metric_check(lst):
    """Return the traffic-volume column label to use for a collection of labels.

    'AAWDT' is preferred, then 'COUNTAAWDT'; if neither is present the
    function returns 'AWDT' without checking that it exists.
    """
    preferred = ('AAWDT', 'COUNTAAWDT')
    return next((label for label in preferred if label in lst), 'AWDT')

In [6]:
folder = '../data/counts/'
baseline_file = '2007_Traffic_Flow_Counts.csv'

# 2007 is the baseline year: its export already uses the STNAME / AAWDT labels.
traffic_df = pd.read_csv(os.path.join(folder, baseline_file))[['STNAME', 'COMPKEY', 'AAWDT']]
traffic_df = load_unique(traffic_df, '2007')

# Pick the remaining yearly exports by name rather than by position in the
# directory listing, so stray entries (e.g. .DS_Store) can neither shift the
# slice nor be handed to read_csv.
yearly_files = sorted(
    entry for entry in os.listdir(folder)
    if entry.lower().endswith('.csv') and entry != baseline_file
)

for f in yearly_files:
    d = pd.read_csv(os.path.join(folder, f))

    # Normalise this year's column labels to STNAME / AAWDT.
    name_column, metric_column = name_check(d.columns), metric_check(d.columns)
    d = d.rename(columns={name_column: 'STNAME', metric_column: 'AAWDT'})

    # The first four characters of the file name are the year.
    d = load_unique(d[['STNAME', 'AAWDT']], f[:4])

    # Keep only the streets present in every year seen so far, on both sides.
    traffic_df = traffic_df[traffic_df.STNAME.isin(d.STNAME)]
    d = d[d.STNAME.isin(traffic_df.STNAME)]

    traffic_df = pd.concat([traffic_df, d], sort=False)

# Drop exact duplicate rows and renumber from zero.
traffic_df = traffic_df.drop_duplicates().reset_index(drop=True)

In [7]:
# Display the combined table of mean AAWDT per street name and year.
traffic_df

Unnamed: 0,STNAME,AAWDT,YEAR
0,10TH AVE E,15300.000000,2007
1,11TH AVE NE,10100.000000,2007
2,12TH AVE,12900.000000,2007
3,12TH AVE E,9100.000000,2007
4,12TH AVE NE,8800.000000,2007
...,...,...,...
1759,W DRAVUS ST,13267.400000,2018
1760,W EMERSON PL,16567.500000,2018
1761,WEST MARGINAL WAY SW,8420.333333,2018
1762,WESTERN AVE,11058.384615,2018


In [8]:
# Join traffic volumes onto collisions twice: once matching the first street
# of the intersection (name1) and once matching the second (name2).
join_keys = ['STNAME', 'YEAR']
concat1 = pd.merge(
    collisions_df.rename(columns={'name1': 'STNAME'}),
    traffic_df,
    on=join_keys,
)
concat2 = pd.merge(
    collisions_df.rename(columns={'name2': 'STNAME'}),
    traffic_df,
    on=join_keys,
)

In [9]:
# Stack both street matches into one frame with a fresh 0..n-1 index.
merge_df = pd.concat([concat1, concat2], ignore_index=True)

In [10]:
# List the columns available after stacking the two street matches.
merge_df.columns

Index(['AAWDT', 'ADDRTYPE', 'ARTERIALCLASSCD', 'COMPTYPE', 'CROSSWALKKEY',
       'FATALITIES', 'GIS_XCOORD', 'GIS_YCOORD', 'HITPARKEDCAR_Y',
       'INATTENTIONIND_Y', 'INCDATE', 'INCKEY', 'INJURIES', 'INTKEY',
       'INTR_ID', 'JUNCTIONTYPE', 'LIGHTCOND', 'LOCATION', 'OBJECTID_x',
       'OBJECTID_y', 'PEDCOUNT', 'PEDCYLCOUNT', 'PEDROWNOTGRNT_Y',
       'PERSONCOUNT', 'ROADCOND', 'SDOT_COLCODE', 'SERIOUSINJURIES',
       'SEVERITYCODE', 'SHAPE_LAT', 'SHAPE_LNG', 'SIGNAL_MAINT_DIST',
       'SIGNAL_TYPE', 'SPEEDING_Y', 'STNAME', 'ST_COLCODE', 'ST_COLDESC',
       'SUBAREA', 'UNDERINFL', 'UNITDESC', 'UNITID', 'VEHCOUNT', 'WEATHER',
       'X_x', 'X_y', 'YEAR', 'Y_x', 'Y_y', 'name1', 'name2'],
      dtype='object')

In [11]:
# Columns kept for the per-intersection risk table.
columns = [
    'INCKEY', 'INTKEY', 'AAWDT', 'YEAR', 'ARTERIALCLASSCD', 'SIGNAL_TYPE',
    'LOCATION', 'SUBAREA', 'LIGHTCOND', 'WEATHER',
    'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'FATALITIES', 'INJURIES', 'UNDERINFL',
]

# A collision can appear twice (once per matched street); keep only the first
# row seen for each INCKEY, then restrict to the columns above.
first_per_incident = ~merge_df['INCKEY'].duplicated()
merge_df = merge_df.loc[first_per_incident, columns]

In [12]:
# Build one record per (intersection, year):
#   [intkey, name, year, signal_type, arterial_class, subarea, risk]
# keyed by a running row number.
intersection_dictionary = defaultdict(list)
years = range(2007, 2019)
c = 0

# groupby(sort=False) visits intersections in order of first appearance (the
# same order as .unique()) and hands over each intersection's rows directly,
# so the full frame is no longer re-scanned for every intersection and year.
for intersection, inter_slice in merge_df.groupby('INTKEY', sort=False):

    # Static attributes are taken from the intersection's first collision row.
    name = inter_slice['LOCATION'].iloc[0]
    subarea = inter_slice['SUBAREA'].iloc[0]
    signal_type = inter_slice['SIGNAL_TYPE'].iloc[0]
    art_class = inter_slice['ARTERIALCLASSCD'].iloc[0]

    for year in years:
        # Filter only this intersection's rows, not the whole frame.
        yr_slice = inter_slice[inter_slice.YEAR == year]
        num_accidents = yr_slice.shape[0]

        # Years without collisions get risk 0; otherwise risk is the number
        # of collisions divided by the first AAWDT value recorded that year.
        # NOTE(review): an AAWDT of 0 would produce inf here - confirm the
        # traffic counts never contain zeros.
        if num_accidents == 0:
            risk = 0
        else:
            risk = num_accidents / yr_slice['AAWDT'].iloc[0]

        intersection_dictionary[c].extend(
            [intersection, name, year, signal_type, art_class, subarea, risk]
        )
        c += 1

In [13]:
# Turn the row-number -> record mapping into a frame: keys become the index,
# each seven-element record becomes one row.
final_columns = ['intkey', 'name', 'year', 'signal_type',
                 'arterial_class', 'subarea', 'risk']
final_df = pd.DataFrame(
    data=list(intersection_dictionary.values()),
    index=list(intersection_dictionary.keys()),
    columns=final_columns,
)

In [14]:
# Persist the per-intersection, per-year risk table for later use.
final_df.to_pickle('../data/processed/intersection_w_normalization.pkl')

In [15]:
# Diagnostic: collisions whose INCKEY does not appear in merge_df, i.e. the
# records that were dropped when joining against the traffic counts.
collisions_df[~collisions_df.INCKEY.isin(merge_df.INCKEY)]

Unnamed: 0,X_x,Y_x,OBJECTID_x,INCKEY,ADDRTYPE,INTKEY,LOCATION,SEVERITYCODE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,FATALITIES,INCDATE,JUNCTIONTYPE,SDOT_COLCODE,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,ST_COLCODE,ST_COLDESC,CROSSWALKKEY,SPEEDING_Y,INATTENTIONIND_Y,HITPARKEDCAR_Y,PEDROWNOTGRNT_Y,X_y,Y_y,OBJECTID_y,INTR_ID,GIS_XCOORD,GIS_YCOORD,COMPTYPE,UNITID,SUBAREA,UNITDESC,ARTERIALCLASSCD,SIGNAL_MAINT_DIST,SIGNAL_TYPE,SHAPE_LNG,SHAPE_LAT,YEAR,name1,name2
0,-122.320755,47.608073,909,3663,Intersection,29797.0,BROADWAY AND CHERRY ST,0,0,0,0,0,0,0,0,2003/10/06 00:00:00+00,,,0,0,0,0,,,0,0,0,0,1,1.273526e+06,225352.442139,6970,12046,1.273526e+06,225352.44227,13,40323,E,BROADWAY AND CHERRY ST,2.0,,CITY,-122.320755,47.608073,2003,BROADWAY,CHERRY ST
2,-122.320755,47.608073,20937,36038,Intersection,29797.0,BROADWAY AND CHERRY ST,1,2,0,0,2,0,0,0,2005/11/12 00:00:00+00,At Intersection (intersection related),11.0,0,2,2,3,28,From opposite direction - one left turn - one ...,0,0,0,0,0,1.273526e+06,225352.442139,6970,12046,1.273526e+06,225352.44227,13,40323,E,BROADWAY AND CHERRY ST,2.0,,CITY,-122.320755,47.608073,2005,BROADWAY,CHERRY ST
3,-122.320755,47.608073,43215,57020,Intersection,29797.0,BROADWAY AND CHERRY ST,2,2,1,0,1,1,0,0,2006/12/15 00:00:00+00,At Intersection (intersection related),24.0,0,1,1,1,2,Vehicle turning left hits pedestrian,0,0,0,0,1,1.273526e+06,225352.442139,6970,12046,1.273526e+06,225352.44227,13,40323,E,BROADWAY AND CHERRY ST,2.0,,CITY,-122.320755,47.608073,2006,BROADWAY,CHERRY ST
12,-122.337454,47.615057,13901,27177,Intersection,29540.0,7TH AVE AND VIRGINIA ST,0,0,0,0,2,0,0,0,2004/01/01 00:00:00+00,At Intersection (intersection related),11.0,0,0,0,0,,,0,0,0,0,0,1.269457e+06,227979.276125,8515,11597,1.269457e+06,227979.27619,13,39137,DWNTN,7TH AVE AND VIRGINIA ST,2.0,,CITY,-122.337454,47.615057,2004,7TH AVE,VIRGINIA ST
13,-122.337454,47.615057,8744,21878,Intersection,29540.0,7TH AVE AND VIRGINIA ST,0,0,0,0,2,0,0,0,2004/12/25 00:00:00+00,At Intersection (intersection related),11.0,0,0,0,0,,,0,0,0,0,0,1.269457e+06,227979.276125,8515,11597,1.269457e+06,227979.27619,13,39137,DWNTN,7TH AVE AND VIRGINIA ST,2.0,,CITY,-122.337454,47.615057,2004,7TH AVE,VIRGINIA ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71299,-122.384573,47.642928,211828,331737,Intersection,27583.0,22ND AVE W AND THORNDYKE AVE W,2,1,0,0,0,1,0,0,2020/06/01 00:00:00+00,At Intersection (intersection related),28.0,0,0,0,0,,,0,0,0,0,0,1.258043e+06,238374.284840,8390,9313,1.258043e+06,238374.28497,13,27118,MGNL-QA,22ND AVE W AND THORNDYKE AVE W,2.0,,NONE,-122.384573,47.642928,2020,22ND AVE W,THORNDYKE AVE W
71300,-122.400316,47.633792,213279,331867,Intersection,27535.0,34TH CT W AND MAGNOLIA BLVD W,1,3,0,0,2,0,0,0,2020/06/05 00:00:00+00,At Intersection (intersection related),14.0,0,1,1,1,32,One parked--one moving,0,0,0,1,0,1.254095e+06,235121.140453,15245,10150,1.254095e+06,235121.14045,13,26356,MGNL-QA,34TH CT W AND MAGNOLIA BLVD W,3.0,,NONE,-122.400316,47.633792,2020,34TH CT W,MAGNOLIA BLVD W
71301,-122.393039,47.688685,210086,332217,Intersection,23960.0,28TH AVE NW AND NW 83RD S ST,2,2,0,0,0,1,0,0,2020/06/13 00:00:00+00,At Intersection (intersection related),11.0,0,0,0,0,,,0,0,0,0,0,1.256295e+06,255103.671353,7765,4978,1.256295e+06,255103.67123,13,2506,BLRD,28TH AVE NW AND NW 83RD S ST,0.0,,NONE,-122.393039,47.688685,2020,28TH AVE NW,NW 83RD S ST
71302,-122.332522,47.695938,209027,332314,Intersection,37572.0,MERIDIAN PL N AND N 92ND ST,2,2,0,0,0,1,0,0,2020/06/18 00:00:00+00,At Intersection (but not related to intersection),55.0,0,0,0,0,,,0,0,0,0,0,1.271248e+06,257452.468796,6783,4443,1.271248e+06,257452.46892,13,233707,NW,MERIDIAN PL N AND N 92ND ST,2.0,,NONE,-122.332522,47.695938,2020,MERIDIAN PL N,N 92ND ST


In [16]:
# Diagnostic: show any rows whose INCKEY occurs more than once in merge_df,
# ordered by INCKEY.
inckey_counts = merge_df.INCKEY.value_counts()
repeated_inckeys = inckey_counts[inckey_counts > 1].index
merge_df[merge_df.INCKEY.isin(repeated_inckeys)].sort_values(by='INCKEY')

Unnamed: 0,INCKEY,INTKEY,AAWDT,YEAR,ARTERIALCLASSCD,SIGNAL_TYPE,LOCATION,SUBAREA,LIGHTCOND,WEATHER,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,FATALITIES,INJURIES,UNDERINFL
