# Q2 â€” State-level accident rates per population / per VMT

Which locations or states experience the highest accident frequency per population or per vehicle miles?   
Goal: compute accidents per vehicle miles traveled (VMT).

# Imports and loads

In [1]:
import pandas as pd
import geopandas as gpd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go


DATA_PATH = "../Data/us_accidents_sample_500k_clean.csv"
df = pd.read_csv(DATA_PATH, parse_dates=["Start_Time"], low_memory=False)
df = df.dropna(subset=["State"])

url = "https://data.transportation.gov/resource/nps9-3pm2.csv?$limit=5000000"
df_vmt = pd.read_csv(url)

df_vmt.info()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29016 entries, 0 to 29015
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    29016 non-null  int64  
 1   state   29016 non-null  object 
 2   area    29016 non-null  object 
 3   fclass  29016 non-null  object 
 4   vmt     29016 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451874 entries, 0 to 451873
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   ID                 451874 non-null  object        
 1   Severity           451874 non-null  int64         
 2   Start_Time         451874 non-null  datetime64[ns]
 3   End_Time           451874 non-null  object        
 4   Lat                451874 non-null  float64       
 5   Lng                451874 non-null  float64       
 6   Street 

In [2]:
# Mapping from state abbreviation to full name
us_state_abbrev = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'District of Columbia'
}

# Convert abbreviations in crash dataset to full names
df['State_Full'] = df['State'].map(us_state_abbrev)

# Aggregate crashes by state
state_counts = df.groupby("State_Full").size().rename("crashes").reset_index()

# Aggregate VMT by state
vmt_by_state = df_vmt.groupby("state")["vmt"].sum().reset_index()

# Merge on full state names
merged = pd.merge(state_counts, vmt_by_state, left_on="State_Full", right_on="state", how="inner")

# Calculate accidents per million VMT
merged["accidents_per_million_vmt"] = merged["crashes"] / merged["vmt"] * 1_000_000

# Sort and view top states
top_states = merged.sort_values("accidents_per_million_vmt", ascending=False)
print(top_states[["State_Full", "crashes", "vmt", "accidents_per_million_vmt"]].head(10))


              State_Full  crashes           vmt  accidents_per_million_vmt
38        South Carolina    22870  1.894805e+12                   0.012070
3             California   100935  1.256105e+13                   0.008036
35                Oregon    10520  1.357431e+12                   0.007750
8                Florida    49038  6.984890e+12                   0.007021
7   District of Columbia     1013  1.533681e+11                   0.006605
42                  Utah     5836  9.768206e+11                   0.005974
44              Virginia    16988  3.097595e+12                   0.005484
31        North Carolina    20257  3.798594e+12                   0.005333
21             Minnesota    11040  2.132829e+12                   0.005176
16             Louisiana     8860  1.819561e+12                   0.004869


In [3]:
# Plot choropleth map
fig = px.choropleth(
    merged,
    locations="State_Full",       # Full state names
    locationmode="USA-states",    # Map expects state abbreviations, so we convert
    color="accidents_per_million_vmt",
    hover_name="State_Full",
    hover_data={"crashes": True, "vmt": True, "accidents_per_million_vmt": True},
    color_continuous_scale="Reds",
    scope="usa",
    labels={"accidents_per_million_vmt": "Accidents per million VMT"},
    title="US States: Accident Frequency per Million Vehicle Miles Traveled"
)

abbrev_to_name = {v: k for k, v in us_state_abbrev.items()}
merged["State_Abbrev"] = merged["State_Full"].map(abbrev_to_name)
fig.update_traces(locations=merged["State_Abbrev"])

fig.show()


In [4]:
top10 = merged.sort_values("accidents_per_million_vmt", ascending=False).head(10)

# Create bar chart
bar_fig = go.Figure(go.Bar(
    x=top10["State_Full"],
    y=top10["accidents_per_million_vmt"],
    text=top10["accidents_per_million_vmt"].round(2),
    textposition="auto",
    marker_color="indianred"
))

bar_fig.update_layout(
    title="Top 10 States by Accident Frequency per Million VMT",
    xaxis_title="State",
    yaxis_title="Accidents per Million VMT"
)

bar_fig.show()
