### <b>Washington State Crash Event Analysis</b>
#### --- by 

In [186]:
import pandas as pd
import numpy as np
import regex as re

import requests
import asyncio
import json as js

import time

import os

pd.set_option('display.max_rows', 9)

#### <b>Load Datasets</b></br>
##### Introduction to Datasets</br>
- <b>df_data_main</b>: The original data set with an expanded column that stores the zipcode of the place where the accident occured</br>
- <b> df_metadata_crashtype</b>: Derived from the table 6 of the original dataset. This dataframe contains all the metainfo about a category of crashtype</br>

In [187]:
dir = os.path.abspath(os.path.dirname(os.getcwd())) + '/data/'

df_data_main = pd.read_csv(dir + '/output/data_with_zipcode.csv').drop(axis=1, labels='Unnamed: 0') 
df_data_main.event_zipcode = df_data_main.event_zipcode.astype(str)   # convert the default float type values into str

df_metadata_crashtype = pd.read_csv(dir + '/output/crash_type.csv').set_index(keys='type_index')

df_metadata_driver_factor = pd.read_csv(dir + '/output/driver_behavioral_factors.csv').drop(labels='Unnamed: 0', axis=1)

df_data_main.shape

  df_data_main = pd.read_csv(dir + '/output/data_with_zipcode.csv').drop(axis=1, labels='Unnamed: 0')


(4132, 306)

##### <b>Data Cleaning</b>

- The following blocks drop rows which do not have valid zipcodes (i.e. rows that do not have either a driver zipcode or an accident zipcode)

In [188]:
# drop rows which do not have an event zipcode

has_no_zipcode = df_data_main.event_zipcode.map(lambda v : v == 'nan')
df_data_main = df_data_main[df_data_main.event_zipcode != 'nan']
df_data_main.shape

(4132, 306)

In [189]:
# drop rows which do not have a person zipcode

df_data_main.dzip = df_data_main.dzip.map(
    lambda n: 0 if n ==0 else 0 if pd.isna(n) else int(n)
)
df_data_main = df_data_main[df_data_main.dzip > 10000]     # valid zip codes are all 5 digit so we filter out those with less than 5 digits
df_data_main.dzip = df_data_main.dzip.astype(str)
df_data_main.shape

(4100, 306)

- The following block cleans the <b>age</b> column. <br/>
- After observation we found that there are invalid age values such as 999 or 998, which, after cleaning, are replaced with the column mean (calculation of the mean is based on the column being filtered out of the abnomral values.)

In [190]:
age_filter = filter(lambda v: v > 0 and v < 100, df_data_main.age)
age_mean = round( np.mean(list(age_filter), dtype=float),0)

df_data_main.age = df_data_main.age.map(
    lambda v : age_mean if v < 0 or v >= 100 else v
)

##### <b>Among drivers involved in fatal crashes, what proportion are involved in crashes in communities where they live?</b>
<br/>
- <b>Visualization note</b>: a barchart / pie chart to show the proportion of non-resident and resident crash cases.

In [191]:
df_data_main['is_resident'] = df_data_main.index.map(
    lambda i: df_data_main.event_zipcode[i] == df_data_main.dzip[i]
)

df_data_main['is_driver'] = df_data_main.ptype.map(
    lambda t: t == 1
)

prop = len(df_data_main[(df_data_main.is_resident == True) & (
    df_data_main.is_driver == True)]) / float(len(df_data_main[df_data_main.is_driver == True]))

print('{prop:.4f}% of the drivers are from the community where the accident occured'.format(prop = prop * 100))

23.7805% of the drivers are from the community where the accident occured


Based on our analysis, <b>23.7805%</b> of the drivers are from the community where the accident occured.

##### <b>Are there differences in the types of crashes and behavior factors in those crashes among “residents” versus those deemed to be not “from” the area?</b>
- We will first take a look at the types of crashes among residents versus non-residents drivers. To that end, we load the metadata regarding crash types.

In [192]:
df_metadata_crashtype.head()     # this dataframe stores the meta info of the variable crashtype

Unnamed: 0_level_0,crash_type,category
type_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,No Impact,NOT CATEGORIZED
1,Drive Off Road,SINGLE DRIVER
2,Control/Traction Loss,SINGLE DRIVER
3,"Avoid Collision with Vehicle, Pedestrian, Animal",SINGLE DRIVER
4,Specifics Other,SINGLE DRIVER


In [193]:
map_crashtype_category = {  # maps a crashtype to its category
    k:v for k,v in zip(df_metadata_crashtype.index, df_metadata_crashtype.category)
}

map_crashtype_eng = {   # maps a crashtype index to its actual meaning
    k:v for k,v in zip(df_metadata_crashtype.index, df_metadata_crashtype['crash_type'])
}

df_data_main['crash_category'] = df_data_main.crashtype.map(map_crashtype_category)
df_data_main['crashtype_eng'] = df_data_main.crashtype.map(map_crashtype_eng)

In [194]:
df_data_crash = df_data_main.groupby(by=['crash_category', 'is_resident']).agg(
    case_count=pd.NamedAgg(column='par', aggfunc=len),
).reset_index()

df_temp = df_data_main.groupby('is_resident').par.agg('count')  # temporary dataframe for calculating total by is_resident
# print(df_temp)
non_resident_event_count = float(df_temp.iloc[0])
resident_event_count = float(df_temp.iloc[1])
del df_temp


def get_case_proportion(case_index:int, crash_dataframe: pd.DataFrame) -> float:
    is_resident = crash_dataframe.loc[case_index, 'is_resident']
    case_count = crash_dataframe.loc[case_index, 'case_count']
    if is_resident:
        ratio = case_count / resident_event_count
    else:
        ratio = case_count / non_resident_event_count
    return ratio


df_data_crash['case_proportion'] = df_data_crash.index.map(lambda i : get_case_proportion(i, df_data_crash))

df_data_crash[:6]

Unnamed: 0,crash_category,is_resident,case_count,case_proportion
0,"CHANGING TRAFFICWAY, VEHICLE TURNING",False,273,0.087388
1,"CHANGING TRAFFICWAY, VEHICLE TURNING",True,135,0.138462
2,INTERSECTING PATHS (VEHICLE DAMAGE),False,219,0.070102
3,INTERSECTING PATHS (VEHICLE DAMAGE),True,82,0.084103
4,MISCELLANEOUS,False,575,0.184059
5,MISCELLANEOUS,True,136,0.139487


- The following pivot table compares the proportion of different crash categories among the resident and non-resident groups.
- <b>Visualization note</b> There should be a paired bar chart to show the proportional differences of case categories across resident and non-resident groups

In [195]:
df_crash_pivoted = df_data_crash.pivot(index='crash_category', columns='is_resident', values=['case_proportion'])

df_crash_pivoted['diff'] = df_crash_pivoted[('case_proportion', False)] - df_crash_pivoted[('case_proportion', True)]
df_crash_pivoted.sort_values(by = 'diff', ascending = False, inplace = True)
df_crash_pivoted.drop(axis = 1, labels='diff', inplace= True)
df_crash_pivoted

Unnamed: 0_level_0,case_proportion,case_proportion
is_resident,False,True
crash_category,Unnamed: 1_level_2,Unnamed: 2_level_2
MISCELLANEOUS,0.184059,0.139487
"SAME TRAFFICWAY, OPPOSITE DIRECTION",0.190141,0.16
"SAME TRAFFICWAY, SAME DIRECTION",0.081306,0.068718
NOT CATEGORIZED,0.005762,0.004103
INTERSECTING PATHS (VEHICLE DAMAGE),0.070102,0.084103
SINGLE DRIVER,0.381562,0.405128
"CHANGING TRAFFICWAY, VEHICLE TURNING",0.087388,0.138462


- According to the pivot table, the largest difference of case proportion appears in the <u>MISCELLANEOUS</u> category, followed by <u>SAME TRAFFICWAY, OPPOSITE DIRECTION</u> category.</br>

- We conclude that <b>there is a significantly high proportion of non-resident drivers who caused MISCELLANEOUS crash events</b>.</br>

Next, we load the matadata of driver behavioral factors and analyze those factors.

In [196]:
df_metadata_driver_factor

Unnamed: 0,factor_index,driver_factor,effect_start,effect_end,factor_category
0,0,No factors identified for this driver,0,9999,default
1,1,"Drowsy, Asleep",0,2009,Physical/Mental Condition
2,2,"Ill, Passed Out, Blacked Out",0,2009,Physical/Mental Condition
3,3,"Emotional (e.g., depressed, angry, disturbed)",0,2009,Physical/Mental Condition
...,...,...,...,...,...
103,94,Emergency Medical Service Personnel,0,2019,Possible Distraction Inside Vehicle
104,95,Fire Personnel,0,2019,Possible Distraction Inside Vehicle
105,96,Tow Operator,0,2019,Possible Distraction Inside Vehicle
106,97,"Transportation i.e. maintenance workers, safte...",0,2019,Possible Distraction Inside Vehicle


In [197]:
df_metadata_driver_factor = df_metadata_driver_factor.loc[
    df_metadata_driver_factor.effect_end >= df_data_main.year.min(), :
]

df_metadata_driver_factor.sort_values(by = 'factor_index', ascending=True)

Unnamed: 0,factor_index,driver_factor,effect_start,effect_end,factor_category
0,0,No factors identified for this driver,0,9999,default
4,4,Reaction to/Failure to take Drugs/Medication,0,9999,Physical/Mental Condition
7,6,Careless Driving,2012,9999,Physical/Mental Condition
10,8,Aggressive Driving Road Rage,2004,9999,Physical/Mental Condition
...,...,...,...,...,...
103,94,Emergency Medical Service Personnel,0,2019,Possible Distraction Inside Vehicle
104,95,Fire Personnel,0,2019,Possible Distraction Inside Vehicle
105,96,Tow Operator,0,2019,Possible Distraction Inside Vehicle
106,97,"Transportation i.e. maintenance workers, safte...",0,2019,Possible Distraction Inside Vehicle


In [199]:
def sum_driver_factor(index:int, df:pd.DataFrame):
    joint = '|'.join(df.loc[index, 'drf1':'drf4'].astype(int).astype(str))   # first conerted to int to remove trailing decimal zero
    return re.sub(pattern=r'\|0', string=joint, repl='')


df_data_main.loc[:, 'drf1':'drf4'] = df_data_main.loc[:, 'drf1':'drf4'].fillna(0)     # in this case, na suggests no factor rather than the missing of value
df_data_main['driver_factor_summed'] = df_data_main.index.map( lambda i:
    sum_driver_factor(i, df_data_main)
)

df_data_main.drop(axis=1, labels=['drf1','drf2','drf3','drf4'], inplace= True)  # drop the original component columns as we already acquired the summed up one
df_data_main.driver_factor_summed

0          87
1           0
2           0
3       87|91
        ...  
4128        0
4129        0
4130        0
4131        0
Name: driver_factor_summed, Length: 4100, dtype: object

In [200]:
df_derive_drf = df_data_main[['is_resident', 'driver_factor_summed']]

map_driver_index_factor = {     # map a driver factor index to its actual meaning
    k:v for k,v in zip(df_metadata_driver_factor.factor_index, df_metadata_driver_factor.driver_factor)
}



##### Output Cleaned Dataset for Visualization

In [201]:
df_data_main.to_csv(dir + 'output/data_vis.csv')

##### Analysis of Behavioral Factors

- The following columns are thought to indicate whether an involved person conducted risky behavior in the crash event.
- - restraintmisuse: valued 1 when there was a restraint misuse
- - helmetmisuse: valued 1 when there was a helmet misuse
- - 

##### Predictive Analysis of Risky Drivers