This notebook explores other metrics for determining that a location is dangerous, 
e.g. normalizing the number of severe accidents by the total number of accidents.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/collisions_clean.csv')

# Analysis from original EDA notebook

In [3]:
# Metric #1: Count of all accidents
locdf = df['location'].value_counts().to_frame('count')
locdf = locdf.head(7)
locdf

Unnamed: 0,count
BATTERY ST TUNNEL NB BETWEEN ALASKAN WY VI NB AND AURORA AVE N,298
N NORTHGATE WAY BETWEEN MERIDIAN AVE N AND CORLISS AVE N,295
BATTERY ST TUNNEL SB BETWEEN AURORA AVE N AND ALASKAN WY VI SB,291
AURORA AVE N BETWEEN N 117TH PL AND N 125TH ST,280
6TH AVE AND JAMES ST,272
AURORA AVE N BETWEEN N 130TH ST AND N 135TH ST,269
ALASKAN WY VI NB BETWEEN S ROYAL BROUGHAM WAY ON RP AND SENECA ST OFF RP,256


In [4]:
# Re-labeling serverity so that the categories are self-explanatory
df['severitycode'] = df['severitycode'].map({
    '0': 'Unknown', 
    '1': 'Prop_damage_only', 
    '2': 'Injury',
    '2b': 'Serious_injury', '3': 'Fatality'
})

In [5]:
df['severitycode'].value_counts(dropna=False).to_frame('count')

Unnamed: 0,count
Prop_damage_only,137026
Injury,58472
Unknown,21509
Serious_injury,3082
Fatality,346
,1


In [6]:
# Metric #2: Count of accidents involving serious injuries (NOT normalized by total accidents)

# From the EDA notebook
# Select incidents that involved injuries and deaths 
severity = ['Fatality', 'Serious_injury', 'Injury']
sevdf = df[(df['severitycode'].isin(severity))]
# Group the dataframe by location to rank locations by the number of severe incidents
sevcdf = sevdf['location'].value_counts().to_frame('count')

# Select the top 7 locations
most_sev = sevcdf.head(7)
most_sev

Unnamed: 0,count
AURORA AVE N BETWEEN N 117TH PL AND N 125TH ST,126
6TH AVE AND JAMES ST,112
RAINIER AVE S BETWEEN S BAYVIEW ST AND S MCCLELLAN ST,97
N NORTHGATE WAY BETWEEN MERIDIAN AVE N AND CORLISS AVE N,96
AURORA AVE N BETWEEN N 130TH ST AND N 135TH ST,96
BATTERY ST TUNNEL SB BETWEEN AURORA AVE N AND ALASKAN WY VI SB,93
WEST SEATTLE BR EB BETWEEN ALASKAN WY VI NB ON RP AND DELRIDGE-W SEATTLE BR EB ON RP,82


In [7]:
sorted(most_sev.index)

['6TH AVE AND JAMES ST',
 'AURORA AVE N BETWEEN N 117TH PL AND N 125TH ST',
 'AURORA AVE N BETWEEN N 130TH ST AND N 135TH ST',
 'BATTERY ST TUNNEL SB BETWEEN AURORA AVE N AND ALASKAN WY VI SB',
 'N NORTHGATE WAY BETWEEN MERIDIAN AVE N AND CORLISS AVE N',
 'RAINIER AVE S BETWEEN S BAYVIEW ST AND S MCCLELLAN ST',
 'WEST SEATTLE BR EB BETWEEN ALASKAN WY VI NB ON RP AND DELRIDGE-W SEATTLE BR EB ON RP']

In [8]:
# Metric 3: Same as Metric 2 but in alpha order
# These are the locations with highest number of severe incidents

mostsevlocations = list(most_sev.index)
sorted(mostsevlocations, key=str.lower)

['6TH AVE AND JAMES ST',
 'AURORA AVE N BETWEEN N 117TH PL AND N 125TH ST',
 'AURORA AVE N BETWEEN N 130TH ST AND N 135TH ST',
 'BATTERY ST TUNNEL SB BETWEEN AURORA AVE N AND ALASKAN WY VI SB',
 'N NORTHGATE WAY BETWEEN MERIDIAN AVE N AND CORLISS AVE N',
 'RAINIER AVE S BETWEEN S BAYVIEW ST AND S MCCLELLAN ST',
 'WEST SEATTLE BR EB BETWEEN ALASKAN WY VI NB ON RP AND DELRIDGE-W SEATTLE BR EB ON RP']

In [9]:
# These are the locations with highest total number of incidents (we found them earlier)
manyaccident_locations = list(locdf.index)
sorted(manyaccident_locations, key=str.lower) # alpha sort

['6TH AVE AND JAMES ST',
 'ALASKAN WY VI NB BETWEEN S ROYAL BROUGHAM WAY ON RP AND SENECA ST OFF RP',
 'AURORA AVE N BETWEEN N 117TH PL AND N 125TH ST',
 'AURORA AVE N BETWEEN N 130TH ST AND N 135TH ST',
 'BATTERY ST TUNNEL NB BETWEEN ALASKAN WY VI NB AND AURORA AVE N',
 'BATTERY ST TUNNEL SB BETWEEN AURORA AVE N AND ALASKAN WY VI SB',
 'N NORTHGATE WAY BETWEEN MERIDIAN AVE N AND CORLISS AVE N']

# Alternate metrics 
- percent of accidents that are severe (considering only locations with N accidents)
- 'baseline' model of the average #severe/#total...which ones are out of line? (not shown here)

Just out of curiosity
- highest number of non-severe accidents (Unknown and also Prop_damage_only)
- highest number of property damage only

In [10]:
# Metric 3: rank locations based on percent severe, a.k.a. #severe/#total 
# top totals

# Option to include only the ones that still exist
topdf = df.loc[df['fe_exists']==1,
#topdf = df.loc[:,
               'location'].value_counts().to_frame('total')

# severe ones are in 
severe_df = sevcdf.rename(columns={'count':'severe'})

# merge total and severe
mdf = pd.merge(topdf,severe_df,how='left',
               left_index=True, right_index=True)


# Create pct_severe column
mdf['pct_severe'] = mdf.apply(lambda x: x['severe']/x['total'],axis=1)

# Rank by percent severe, where there have been at least N accidents

num_accidents = 25

mdf.loc[mdf['total']>num_accidents,:].sort_values(by='pct_severe',
                                              ascending=False).head(20)

Unnamed: 0,total,severe,pct_severe
BOYLSTON AVE AND E PINE ST,46,34.0,0.73913
14TH AVE NW AND NW MARKET ST,46,33.0,0.717391
14TH AVE AND E YESLER WAY,31,22.0,0.709677
18TH AVE S AND S JACKSON ST,31,22.0,0.709677
8TH AVE S AND S DEARBORN ST,33,23.0,0.69697
AIRPORT WAY S BETWEEN S OTHELLO ST AND MILITARY RD S,26,18.0,0.692308
NE 45TH ST BETWEEN 1ST AVE NE AND 2ND AVE NE,42,29.0,0.690476
EAST MARGINAL WAY S AND S ALASKA ST,62,41.0,0.66129
EASTLAKE AVE E AND FUHRMAN AVE E,50,33.0,0.66
1ST AVE S AND S SPOKANE NR ST,35,23.0,0.657143


In [11]:
# Revisit metric #2, ranked by severe
num_accidents = 25
mdf.loc[mdf['total']>num_accidents,:].sort_values(by='severe',
                                              ascending=False).head(20)

# Locations with a high number of severe accidents also rank high for accidents in general,
#   the resulting pct_severe is often lower (e.g. < 50%)

Unnamed: 0,total,severe,pct_severe
AURORA AVE N BETWEEN N 117TH PL AND N 125TH ST,280,126.0,0.45
6TH AVE AND JAMES ST,272,112.0,0.411765
RAINIER AVE S BETWEEN S BAYVIEW ST AND S MCCLELLAN ST,255,97.0,0.380392
N NORTHGATE WAY BETWEEN MERIDIAN AVE N AND CORLISS AVE N,295,96.0,0.325424
AURORA AVE N BETWEEN N 130TH ST AND N 135TH ST,269,96.0,0.356877
5TH AVE AND SPRING ST,173,77.0,0.445087
BOREN AVE AND PIKE ST,134,75.0,0.559701
12TH AVE AND E MADISON ST,136,69.0,0.507353
RAINIER AVE S AND S ORCAS ST,136,69.0,0.507353
AURORA BR BETWEEN RAYE ST AND BRIDGE WAY N,217,67.0,0.308756


In [12]:
# 'Baseline' -- #severe/#total
total_severe = mdf['severe'].sum()
total        = mdf['total'].sum()
print(f"For ALL accidents: Severe {total_severe:.0f} / Total: {total}  = Percent_Severe {100*total_severe/total:.0f}% ")

print(f"For accidents grouped by location, looking at Percent_Severe:")
print(f"   Average Percent_Severe {100*mdf['pct_severe'].mean():.0f}%")
print(f"   Median Percent_Severe {100*mdf['pct_severe'].median():.0f}%")

For ALL accidents: Severe 60249 / Total: 211483  = Percent_Severe 28% 
For accidents grouped by location, looking at Percent_Severe:
   Average Percent_Severe 43%
   Median Percent_Severe 36%


In [13]:
# Metric A1: most non-severe accidents
low_severity = ['Unknown','Prop_damage_only']

lowsevdf = df[df['severitycode'].isin(low_severity)]

lowsevcdf = lowsevdf['location'].value_counts().to_frame('count')
# Select the top 7 locations
low_sev = lowsevcdf.head(7)
low_sev

# Note that the only change is that we've swapped
#    add: RAINIER AVE S BETWEEN S BAYVIEW ST AND S MCCLELLAN ST
#   drop: AURORA AVE N BETWEEN N 117TH PL AND N 125TH ST

Unnamed: 0,count
BATTERY ST TUNNEL NB BETWEEN ALASKAN WY VI NB AND AURORA AVE N,217
N NORTHGATE WAY BETWEEN MERIDIAN AVE N AND CORLISS AVE N,199
BATTERY ST TUNNEL SB BETWEEN AURORA AVE N AND ALASKAN WY VI SB,198
ALASKAN WY VI NB BETWEEN S ROYAL BROUGHAM WAY ON RP AND SENECA ST OFF RP,188
AURORA AVE N BETWEEN N 130TH ST AND N 135TH ST,173
6TH AVE AND JAMES ST,160
RAINIER AVE S BETWEEN S BAYVIEW ST AND S MCCLELLAN ST,158


In [14]:
# Metric A2: most property-only accidents (could still be expensive, just not in terms of 
#                                          human life)
property_only = ['Prop_damage_only']

propsevdf = df[df['severitycode'].isin(property_only)]

propsevcdf = propsevdf['location'].value_counts().to_frame('count')
# Select the top 7 locations
prop_sev = propsevcdf.head(7)
prop_sev

# Now we see
#    add: ALASKAN WY VI SB BETWEEN COLUMBIA ST ON RP AND ALASKAN WY VI SB EFR OFF RP
#  

Unnamed: 0,count
BATTERY ST TUNNEL NB BETWEEN ALASKAN WY VI NB AND AURORA AVE N,198
BATTERY ST TUNNEL SB BETWEEN AURORA AVE N AND ALASKAN WY VI SB,185
ALASKAN WY VI NB BETWEEN S ROYAL BROUGHAM WAY ON RP AND SENECA ST OFF RP,175
N NORTHGATE WAY BETWEEN MERIDIAN AVE N AND CORLISS AVE N,171
AURORA AVE N BETWEEN N 130TH ST AND N 135TH ST,151
6TH AVE AND JAMES ST,146
ALASKAN WY VI SB BETWEEN COLUMBIA ST ON RP AND ALASKAN WY VI SB EFR OFF RP,144
