# Seattle Traffic Collision Data
### Rebecca Stewart

## Add census area, neighborhood and impute weather, lightcond and roadcond

In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Keeps our notebooks clean 
import warnings
warnings.filterwarnings("ignore") 

# New imports
from datetime import datetime
from collections import namedtuple

In [2]:
# I installed geopandas from a conda prompt after having problem using !pip install geopandas from a notebook
import geopandas as gpd

#### The only change between the master and the following is that my data files are in a data directory not within the notebook folder

In [3]:
# READ IN THE DATA FROM THE LOCAL CSV FILE
local_file_name  = "../data/collisions_orig.csv"
df_collisions  = pd.read_csv(local_file_name , parse_dates=["INCDTTM"])
df_locations_xy = pd.read_csv('../data/location_xy.csv')


### From Master

In [4]:
# Check the starting shape
df_collisions.shape

(220436, 40)

In [5]:
# Save column list to restore column order after merge
column_list = df_collisions.columns
column_list = column_list.append(pd.Index(['fe_exists']))

In [6]:
# Drop the old X,Y and replace with the ones in location_xy.csv if they exist, otherwise, will be nan
df_collisions.drop(columns=['X','Y'],inplace=True,errors='ignore')   # get these from the location_xy.csv file
df_collisions.shape

(220436, 38)

In [7]:
df_new = pd.merge(df_collisions, df_locations_xy, on='LOCATION', how='left')

In [8]:
# Do some sanity checks
df_new.shape

(220436, 41)

In [9]:
# Restore the original order of columns
df_new = df_new[column_list]

In [10]:
df_new.drop_duplicates(inplace=True)

In [11]:
df_new['fe_exists'].value_counts()

1    216059
0      4377
Name: fe_exists, dtype: int64

In [12]:
new_keys = list(df_new['INCKEY'])
col_keys = list(df_collisions['INCKEY'])
print(f"{len(new_keys)}   {len(col_keys)}")

220436   220436


In [13]:
#Dropping columns deemed unnecessary
df_new.drop(['EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'STATUS', 'REPORTNO', 'SEGLANEKEY', 'CROSSWALKKEY', 'INCKEY','COLDETKEY'],axis=1, inplace=True)

In [14]:
#to lowercase
df_new.columns = df_new.columns.str.lower()

In [15]:
df_new.columns

Index(['x', 'y', 'objectid', 'addrtype', 'intkey', 'location', 'severitycode',
       'severitydesc', 'collisiontype', 'personcount', 'pedcount',
       'pedcylcount', 'vehcount', 'injuries', 'seriousinjuries', 'fatalities',
       'incdate', 'incdttm', 'junctiontype', 'sdot_colcode', 'sdot_coldesc',
       'inattentionind', 'underinfl', 'weather', 'roadcond', 'lightcond',
       'pedrownotgrnt', 'sdotcolnum', 'speeding', 'st_colcode', 'st_coldesc',
       'hitparkedcar', 'fe_exists'],
      dtype='object')

In [16]:
print(df_new['x'].isnull().sum())
print(df_new['y'].isnull().sum())

7189
7189


In [17]:
#FE time, total_injuries and total_person_count columns
df_new["time"]=df_new['incdttm'].dt.strftime('%H:%M')
df_new["total_injuries"]=df_new['injuries'] + df_new['seriousinjuries']  + df_new['fatalities']
df_new["total_person_count"]=df_new['personcount'] + df_new['pedcount']  + df_new['pedcylcount']

In [18]:
fe_emd_crit = (df_new['weather'].isnull() &
    df_new['lightcond'].isnull() &
    df_new['roadcond'].isnull() &
    df_new['collisiontype'].isnull() &
    df_new['st_coldesc'].isnull() &
    df_new['underinfl'].isnull() &
    df_new['inattentionind'].isnull() &
    df_new['speeding'].isnull() &
    df_new['pedrownotgrnt'].isnull() &
    (df_new['vehcount'] == 0))
df_new['fe_emd'] = fe_emd_crit

In [19]:
#converting incdate to datetime
df_new["incdate"] = df_new["incdate"].astype("datetime64")

In [20]:
df_new.shape

(220436, 37)

### Merge with Census Areas GEOJSON

We can use geopandus to join our collisions dataset to the Census file using our x/y coordinates.

In [21]:
# Create a dataframe that includes a new feature called geometry, which is basically a point made from x & y           
gdf_collisions = gpd.GeoDataFrame(df_new, geometry=gpd.points_from_xy(df_new.x, df_new.y))
# This will show the newly created geometry geo feature
gdf_collisions.head(1)

Unnamed: 0,x,y,objectid,addrtype,intkey,location,severitycode,severitydesc,collisiontype,personcount,...,speeding,st_colcode,st_coldesc,hitparkedcar,fe_exists,time,total_injuries,total_person_count,fe_emd,geometry
0,-122.340472,47.608629,1,Intersection,29598.0,PIKE PL AND PIKE ST,2,Injury Collision,Pedestrian,2,...,,3,Vehicle backing hits pedestrian,N,1,18:36,1,3,False,POINT (-122.34047 47.60863)


In [22]:
# Open the Census Tracts Geo file
geojson_file = "../data/Census_Tracts_2010.geojson"
census_tracts = gpd.read_file(geojson_file)
census_tracts.head(1)

Unnamed: 0,OBJECTID,TRACT,TRACTCE10,GEOID10,NAME10,NAMELSAD10,ACRES_TOTAL,WATER,SHAPE_Length,SHAPE_Area,geometry
0,1,2500,2500,53033002500,25,Census Tract 25,243.219083,0,16442.968402,10594620.0,"POLYGON ((-122.29602 47.69023, -122.29608 47.6..."


In [23]:
# Join the two - a left join means we wont loose records from our collision dataset. within means the x/y pont must be contained within the census polygon
df_with_geo = gpd.sjoin(gdf_collisions, census_tracts, how="left", op="within")

In [24]:
# Make sure it was a left join and we didn't loose any records
print("Original df shape:", df_new.shape)
print("New df shape:", df_with_geo.shape)

Original df shape: (220436, 38)
New df shape: (220436, 49)


In [25]:
# Drop un-needed columns
df_with_geo.drop(['OBJECTID', 'TRACT', 'TRACTCE10', 'GEOID10', 'NAME10', 'WATER', 'geometry', 'SHAPE_Area', 'SHAPE_Length', 'ACRES_TOTAL', 'index_right', 'SHAPE_Length', 'SHAPE_Area'],axis=1, inplace=True)
df_with_geo.head(1)

Unnamed: 0,x,y,objectid,addrtype,intkey,location,severitycode,severitydesc,collisiontype,personcount,...,speeding,st_colcode,st_coldesc,hitparkedcar,fe_exists,time,total_injuries,total_person_count,fe_emd,NAMELSAD10
0,-122.340472,47.608629,1,Intersection,29598.0,PIKE PL AND PIKE ST,2,Injury Collision,Pedestrian,2,...,,3,Vehicle backing hits pedestrian,N,1,18:36,1,3,False,Census Tract 81


In [26]:
df_with_geo.columns = df_with_geo.columns.str.lower()

### Merge with Neighborhood CSV

We can merge with this dataset (created using Tableau) which contains the neighborhood for each census group

In [27]:
# Now we can pull neighborhood from the csv I created from tableau

df_neighborhood = pd.read_csv('../data/location_neighborhood_census_area.csv')
df_neighborhood.shape

(135, 2)

In [28]:
df_with_geo_neighborhood = pd.merge(df_with_geo, df_neighborhood, on='namelsad10', how='left')

In [29]:
df_with_geo_neighborhood.head(1)

Unnamed: 0,x,y,objectid,addrtype,intkey,location,severitycode,severitydesc,collisiontype,personcount,...,st_colcode,st_coldesc,hitparkedcar,fe_exists,time,total_injuries,total_person_count,fe_emd,namelsad10,neighborhood
0,-122.340472,47.608629,1,Intersection,29598.0,PIKE PL AND PIKE ST,2,Injury Collision,Pedestrian,2,...,3,Vehicle backing hits pedestrian,N,1,18:36,1,3,False,Census Tract 81,Downtown / Waterfront


In [30]:
# Rename census area column to something that makes more sense
df_with_geo_neighborhood=df_with_geo_neighborhood.rename(columns = {'namelsad10':'census_area'})

In [31]:
# Make sure it was a left join and we didn't loose any records
print("Original df shape:", df_new.shape)
print("New df shape:", df_with_geo_neighborhood.shape)

Original df shape: (220436, 38)
New df shape: (220436, 39)


In [32]:
# rename dataframe back to what we were using before
df_new=df_with_geo_neighborhood

### Impute missing weather with date and census area

If that doesn't work, then we will try neighborhood

In [33]:
# Define a Named Tuple Here so that we can use it in the rest of the notebook
DateArea = namedtuple("DateArea", ["date", "area"])

def make_date_area_env_dict(df, date, area, env_condition):
    # Creating a dictionary that has a named tuple as the key, basically (date,area) and a dict of environmental (like weather) conditions conditions as the value
    # My intent is to count the number of environmental (like weather) conditions for the date/area combination so that the one with the highst count wins

    #Only use those records with values for all three features, date, area and environmental (like weather) conditions
    df_with_values = df[df["incdate"].notnull() & df[area].notnull() & df[env_condition].notnull()  ]
    
    # Overall Dictionary Object to track all this informaiton
    loc_date_dict = dict()
    for idx, row in df_with_values.iterrows():
        # This is my date/area tuple
        dl = DateArea(date=str(row[date]), area=row[area])
        if dl in loc_date_dict.keys():
            env_dict = loc_date_dict[dl]
            if row[env_condition] in env_dict.keys():
                env_dict[row[env_condition]] +=1
            else:
                env_dict[row[env_condition]]=1
                loc_date_dict[dl]=env_dict
        else:
            env_dict = dict()
            env_dict[row[env_condition]]  = 1
            loc_date_dict[dl]= env_dict
    return loc_date_dict


In [34]:
# Create Dictionary using date, census area for the named tuple (key) and a dictionary of the most common weather conditions for the value
dict_date_census_weather = make_date_area_env_dict(df_new, "incdate", "census_area", "weather")
count_imputed=0

In [35]:
# How many records do we have where weather is null, but date and census area are not
df_no_weather = df_new[df_new["incdate"].notnull() & df_new["census_area"].notnull() & ((df_new[ "weather"]=="Unknown") | (df_new[ "weather"].isnull())) ]
df_no_weather.shape


(37777, 39)

In [36]:
# Take a look at the first 15 items of our imputing dictionary
first_5=0
for key, value in dict_date_census_weather.items() :
    if first_5<5: print(key)
    if first_5<5: print(value)    
    first_5+=1

DateArea(date='2004-10-14 00:00:00', area='Census Tract 81')
{'Overcast': 1, 'Clear': 1}
DateArea(date='2020-01-05 00:00:00', area='Census Tract 12')
{'Raining': 1}
DateArea(date='2020-02-13 00:00:00', area='Census Tract 109')
{'Overcast': 1, 'Clear': 1}
DateArea(date='2020-01-21 00:00:00', area='Census Tract 50')
{'Overcast': 1}
DateArea(date='2004-09-21 00:00:00', area='Census Tract 109')
{'Clear': 1}


In [37]:
# This function will be used in our lambda expression below. It will try to fill weather with the most common weather condition for the date/census area combo

def find_env_val(dict_env,area_string, date_string, old_val, debug=False):
    dl = DateArea(date=str(date_string), area=area_string)
    if (str(old_val)=="nan" or str(old_val)=="Unknown") and dl in dict_env.keys():  
        env_dict = dict_env[dl]
        # Now we want to find the key of the item in env_dic that has the greatest value (count) and use it to replace our NaN value
        # If they all have equal, then the value selected is kind of random (order that it was added to dictionary)
        new_val = max(env_dict, key=env_dict.get)
        if (debug) and  len(env_dict.keys()) >1: print(f"Filling in env for {area_string}, and {date_string} with {new_val} using this info {env_dict}")
    else:
        new_val = old_val
    #print(f"{area_string}  {newX},{newY}")
    return new_val

In [38]:
df_new['weather'] = df_new.apply(lambda x: find_env_val(dict_date_census_weather,area_string=x['census_area'],date_string=str(x['incdate']), old_val=x['weather'],debug=False), axis=1)

In [39]:
df_no_weather = df_new[df_new["incdate"].notnull() & df_new["census_area"].notnull() & ((df_new[ "weather"]=="Unknown") | (df_new[ "weather"].isnull())) ]
df_no_weather.shape

(29251, 39)

### We imputed about a quarter of the null and unknown weather values

Now let's try it with neighborhood

In [40]:
dict_date_neighborhood_weather = make_date_area_env_dict(df_new, "incdate", "neighborhood", "weather")

In [41]:
# Take a look at the first 15 items of our imputing dictionary
first_5=0
for key, value in dict_date_neighborhood_weather.items() :
    if first_5<5: print(key)
    if first_5<5: print(value)    
    first_5+=1
            

DateArea(date='2004-10-14 00:00:00', area='Downtown / Waterfront')
{'Overcast': 3, 'Clear': 4}
DateArea(date='2020-01-05 00:00:00', area='Northgate')
{'Raining': 1}
DateArea(date='2020-02-13 00:00:00', area='Georgetown')
{'Overcast': 1, 'Clear': 1}
DateArea(date='2020-01-21 00:00:00', area='Wallingford')
{'Overcast': 1, 'Raining': 1, 'Clear': 1}
DateArea(date='2004-09-21 00:00:00', area='Georgetown')
{'Clear': 2}


In [42]:
df_new['weather'] = df_new.apply(lambda x: find_env_val(dict_date_neighborhood_weather,area_string=x['neighborhood'],date_string=str(x['incdate']), old_val=x['weather'],debug=False), axis=1)

In [43]:
df_no_weather = df_new[df_new["incdate"].notnull() & df_new["census_area"].notnull() & ((df_new[ "weather"]=="Unknown") | (df_new[ "weather"].isnull())) ]
df_no_weather.shape

(18403, 39)

### We imputed about a third of the remaining null and unknown weather values

Next let's just use date. We can pass a constant value feature for area. We will make one called city that contains only "seattle"

In [44]:
df_new['city']="seattle"

In [45]:
dict_date_city_weather = make_date_area_env_dict(df_new, "incdate", "city", "weather")

In [46]:
df_new['weather'] = df_new.apply(lambda x: find_env_val(dict_date_city_weather,area_string=x['city'],date_string=str(x['incdate']), old_val=x['weather'],debug=False), axis=1)

In [47]:
df_no_weather = df_new[df_new["incdate"].notnull() & df_new["census_area"].notnull() & ((df_new[ "weather"]=="Unknown") | (df_new[ "weather"].isnull())) ]
df_no_weather.shape

(167, 40)

### Now that we have null and unknowns down to such a small amount, let's fill the rest with the most common  value for weather

In [48]:
most_common_value=df_new['weather'].value_counts().index[0]
print(most_common_value)

Clear


In [49]:
df_new['weather'].fillna(most_common_value, inplace=True)
df_new['weather'].replace(['Unknown'], most_common_value, inplace=True)

In [50]:
df_no_weather = df_new[df_new["incdate"].notnull() & df_new["census_area"].notnull() & ((df_new[ "weather"]=="Unknown") | (df_new[ "weather"].isnull())) ]
df_no_weather.shape

(0, 40)

#### Now we can do the same thing for Road Conditions

In [51]:
# Create Dictionary using date, census area for the named tuple (key) and a dictionary of the most common weather conditions for the value
dict_date_census_roadcond = make_date_area_env_dict(df_new, "incdate", "census_area", "roadcond")

In [52]:
df_new['roadcond'] = df_new.apply(lambda x: find_env_val(dict_date_census_roadcond,area_string=x['census_area'],date_string=str(x['incdate']), old_val=x['roadcond'],debug=False), axis=1)
df_no_weather = df_new[df_new["incdate"].notnull() & df_new["census_area"].notnull() & ((df_new[ "roadcond"]=="Unknown") | (df_new[ "roadcond"].isnull())) ]
df_no_weather.shape


(29118, 40)

In [53]:
dict_date_neighborhood_roadcond = make_date_area_env_dict(df_new, "incdate", "neighborhood", "roadcond")


In [54]:
df_new['roadcond'] = df_new.apply(lambda x: find_env_val(dict_date_neighborhood_weather,area_string=x['neighborhood'],date_string=str(x['incdate']), old_val=x['roadcond'],debug=False), axis=1)
df_no_weather = df_new[df_new["incdate"].notnull() & df_new["census_area"].notnull() & ((df_new[ "roadcond"]=="Unknown") | (df_new[ "roadcond"].isnull())) ]
df_no_weather.shape

(17539, 40)

In [55]:
dict_date_city_roadcond = make_date_area_env_dict(df_new, "incdate", "city", "roadcond")

In [56]:
df_new['roadcond'] = df_new.apply(lambda x: find_env_val(dict_date_city_weather,area_string=x['city'],date_string=str(x['incdate']), old_val=x['roadcond'],debug=False), axis=1)
df_no_weather = df_new[df_new["incdate"].notnull() & df_new["census_area"].notnull() & ((df_new[ "roadcond"]=="Unknown") | (df_new[ "roadcond"].isnull())) ]
df_no_weather.shape

(161, 40)

In [57]:
most_common_value=df_new['roadcond'].value_counts().index[0]
df_new['roadcond'].fillna(most_common_value, inplace=True)
df_new['roadcond'].replace(['Unknown'], most_common_value, inplace=True)

And finally, save our new dataframe as a clean (new) version

In [59]:

df_new.to_csv('../data/collisions_clean_new.csv',index_label="id")