In [51]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [52]:
# Show the current working directory; the relative paths used in later cells
# ('data/...', 'collisions_clean.csv') are resolved against it.
pwd

'/Users/Monika/Projects'

In [53]:
# Load data/collisions_orig.csv into df, asking pandas to parse the INCDTTM
# column as datetimes.
df = pd.read_csv('data/collisions_orig.csv', parse_dates = ['INCDTTM'])

In [54]:
# Preview the first rows of the frame that was just loaded.
df.head()

Unnamed: 0,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,LOCATION,...,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,-122.340472,47.608629,1,18600,18600,1785104,Matched,Intersection,29598.0,PIKE PL AND PIKE ST,...,Dry,Dark - Street Lights On,,4288030.0,,3.0,Vehicle backing hits pedestrian,0,0,N
1,-122.251788,47.508176,2,328272,329772,EA07021,Unmatched,Block,,S PRENTICE ST BETWEEN 65TH AVE S AND 66TH AVE S,...,,,,,,,,0,0,Y
2,-122.328526,47.70318,3,328374,329874,EA09347,Matched,Intersection,37555.0,1ST AVE NE AND NE 103RD ST,...,Wet,Daylight,,,,10.0,Entering at angle,0,0,N
3,-122.320383,47.539432,4,328985,330485,3857045,Matched,Intersection,34194.0,ELLIS AVE S AND S MYRTLE ST,...,Dry,Daylight,,,,11.0,From same direction - both going straight - bo...,0,0,N
4,-122.366178,47.669041,5,328852,330352,EA12640,Unmatched,Block,,8TH AVE NW BETWEEN NW MARKET ST AND NW 56TH ST,...,,,,,,,,0,0,Y


In [55]:
# make a dictionary of the LOCATIONs that do have X, Y
def make_location_dict(df,location,xcol,ycol):
    """Build a lookup from a location description to its coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the location and coordinate columns.
    location : str
        Name of the column whose values become the dictionary keys.
    xcol, ycol : str
        Names of the columns holding the X and Y coordinates.

    Returns
    -------
    dict
        ``{location_value: {'X': x, 'Y': y}}``. The first coordinates seen
        for a location are kept; a later row for the same location with a
        different X or Y is reported on stdout and otherwise ignored.

    Rows whose ``xcol`` value or location value is null are skipped.
    """
    # Filter on the caller's coordinate column rather than a hard-coded 'X',
    # so frames that name their coordinates differently work too. Null
    # locations are dropped because a null is not a meaningful lookup key.
    usable = df[df[xcol].notnull() & df[location].notnull()]

    location_dict = dict()
    for this_loc, x_val, y_val in zip(usable[location], usable[xcol], usable[ycol]):
        known = location_dict.get(this_loc)
        if known is None:
            # First sighting of this location: remember its coordinates.
            location_dict[this_loc] = {'X': x_val, 'Y': y_val}
            continue

        # Location already known: keep the stored values, but flag conflicts.
        if known['X'] != x_val:
            print(f"Same loc, new X: {this_loc}, {x_val}")
        if known['Y'] != y_val:
            print(f"Same loc, new Y: {this_loc}, {y_val}")

    return location_dict

In [56]:
def find_X(location_dict,location_string,old_x, debug=False):
    """Return an X coordinate for a row, filling it from the lookup if missing.

    ``old_x`` is returned unchanged unless it is NaN and ``location_string``
    is a key of ``location_dict``; in that case the stored ``'X'`` value for
    that location is returned. With ``debug`` set, each fill is printed.
    """
    # Nothing to do when X is already present or the location is unknown.
    if not math.isnan(old_x) or location_string not in location_dict:
        return old_x

    filled_x = location_dict[location_string]['X']
    if debug:
        print(f"Filling in X for {location_string}")
    return filled_x

def find_Y(location_dict,location_string,old_y, debug=False):
    """Return a Y coordinate for a row, filling it from the lookup if missing.

    ``old_y`` is returned unchanged unless it is NaN and ``location_string``
    is a key of ``location_dict``; in that case the stored ``'Y'`` value for
    that location is returned. With ``debug`` set, each fill is printed.
    """
    # Nothing to do when Y is already present or the location is unknown.
    if not math.isnan(old_y) or location_string not in location_dict:
        return old_y

    filled_y = location_dict[location_string]['Y']
    if debug:
        print(f"Filling in Y for {location_string}")
    return filled_y

In [57]:
# Load data/intersections.csv into dfi; it is used below as a second source
# of location -> coordinate pairs.
dfi = pd.read_csv('data/intersections.csv')

In [58]:
# Build a LOCATION -> {'X', 'Y'} lookup from df's own LOCATION / X / Y columns.
# make_location_dict prints a line whenever a location reappears with a
# different X or Y than the one first stored.
self_loc_dict = make_location_dict(df,'LOCATION','X','Y') 

Same loc, new X: 2ND AVE S AND S SPOKANE NR ST, -122.331939324729
Same loc, new Y: 2ND AVE S AND S SPOKANE NR ST, 47.5717239239888


In [59]:
# Build the same kind of lookup from the intersections file: UNITDESC values
# are the keys, and SHAPE_LNG / SHAPE_LAT are stored as 'X' / 'Y'.
intersection_dict = make_location_dict(dfi,'UNITDESC','SHAPE_LNG','SHAPE_LAT')

In [60]:
# Baseline: count the rows with no X coordinate before any filling is done.
starting_nulls = df['X'].isna().sum()
starting_nulls

7461

In [61]:
# Compare the key sets of the two lookups.
# only_in_self is computed here so that the cell runs on a fresh kernel
# instead of relying on a variable left over from an earlier session.
only_in_self = [x for x in self_loc_dict.keys() if x not in intersection_dict.keys()]
only_in_inter = [x for x in intersection_dict.keys() if x not in self_loc_dict.keys()]
print(f"Only in self: {len(only_in_self)}, only in intersections {len(only_in_inter)}")
# Keys of intersection_dict that also appear in self_loc_dict.
print(f"Overlap: {len(intersection_dict.keys())-len(only_in_inter)}")

# Note: most of the items that are only in self include "BETWEEN"

Only in self: 17157, only in intersections 7662
Overlap: 7779


In [62]:
# First pass: fill missing coordinates from locations that carry coordinates
# on other collision rows. debug=True prints every X that gets filled in;
# this pass only solves 4 rows.
df['X'] = df.apply(
    lambda row: find_X(self_loc_dict, row['LOCATION'], row['X'], debug=True),
    axis=1,
)
df['Y'] = df.apply(
    lambda row: find_Y(self_loc_dict, row['LOCATION'], row['Y']),
    axis=1,
)

Filling in X for 6TH AVE N AND MERCER ST
Filling in X for 5TH AVE AND MARION ST
Filling in X for 5TH AVE AND MARION ST
Filling in X for 6TH AVE N AND MERCER ST


In [63]:
# Second pass: fill coordinates that are still missing, this time using the
# lookup built from the intersection file.
df['X'] = df.apply(
    lambda row: find_X(intersection_dict, row['LOCATION'], row['X']),
    axis=1,
)
df['Y'] = df.apply(
    lambda row: find_Y(intersection_dict, row['LOCATION'], row['Y']),
    axis=1,
)

In [64]:
# Recount the rows without an X coordinate now that both fill passes have run,
# and report how many were resolved relative to the starting count.
current_nulls = df['X'].isna().sum()
print(f"Starting nulls {starting_nulls} --> current nulls {current_nulls} meaning {starting_nulls-current_nulls} resolved")

Starting nulls 7461 --> current nulls 7269 meaning 192 resolved


In [65]:
# Lower-case every column name.
# NOTE: the fill cells above index df by upper-case names ('X', 'Y',
# 'LOCATION'), so they must run before this cell and cannot be re-run after it.
df.columns = df.columns.str.lower()

In [66]:
# Preview the first rows again to confirm the column names are now lower-case.
df.head()

Unnamed: 0,x,y,objectid,inckey,coldetkey,reportno,status,addrtype,intkey,location,...,roadcond,lightcond,pedrownotgrnt,sdotcolnum,speeding,st_colcode,st_coldesc,seglanekey,crosswalkkey,hitparkedcar
0,-122.340472,47.608629,1,18600,18600,1785104,Matched,Intersection,29598.0,PIKE PL AND PIKE ST,...,Dry,Dark - Street Lights On,,4288030.0,,3.0,Vehicle backing hits pedestrian,0,0,N
1,-122.251788,47.508176,2,328272,329772,EA07021,Unmatched,Block,,S PRENTICE ST BETWEEN 65TH AVE S AND 66TH AVE S,...,,,,,,,,0,0,Y
2,-122.328526,47.70318,3,328374,329874,EA09347,Matched,Intersection,37555.0,1ST AVE NE AND NE 103RD ST,...,Wet,Daylight,,,,10.0,Entering at angle,0,0,N
3,-122.320383,47.539432,4,328985,330485,3857045,Matched,Intersection,34194.0,ELLIS AVE S AND S MYRTLE ST,...,Dry,Daylight,,,,11.0,From same direction - both going straight - bo...,0,0,N
4,-122.366178,47.669041,5,328852,330352,EA12640,Unmatched,Block,,8TH AVE NW BETWEEN NW MARKET ST AND NW 56TH ST,...,,,,,,,,0,0,Y


In [67]:
# Niwako - dropping exceptrsncode and exceptrsndesc, deemed unnecessary.
# Reassign instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=['exceptrsncode', 'exceptrsndesc'], errors='ignore')

In [68]:
# Share of missing values per column (a 0-1 fraction, not a percentage),
# i.e. what is still null after Jamie's X,Y coordinate fill-in.
# Consider FE flags for the X,Y values that remain null.
df.isna().mean()

x                  0.032976
y                  0.032976
objectid           0.000000
inckey             0.000000
coldetkey          0.000000
reportno           0.000000
status             0.000000
addrtype           0.016817
intkey             0.675570
location           0.020759
severitycode       0.000005
severitydesc       0.000000
collisiontype      0.118638
personcount        0.000000
pedcount           0.000000
pedcylcount        0.000000
vehcount           0.000000
injuries           0.000000
seriousinjuries    0.000000
fatalities         0.000000
incdate            0.000000
incdttm            0.000000
junctiontype       0.054147
sdot_colcode       0.000005
sdot_coldesc       0.000005
inattentionind     0.863058
underinfl          0.118547
weather            0.119490
roadcond           0.119128
lightcond          0.119894
pedrownotgrnt      0.976501
sdotcolnum         0.422939
speeding           0.955198
st_colcode         0.042702
st_coldesc         0.118638
seglanekey         0

In [69]:
# Export the current frame to collisions_clean.csv in the working directory.
# The file is overwritten each time this cell runs, e.g. as more people fill
# in their nulls.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; confirm that downstream readers expect it (or pass index=False).
df.to_csv('collisions_clean.csv')