# Imports

In [1]:
import kagglehub
import pandas as pd
import numpy as np
import glob

# 1. Download dataset

In [2]:
path = kagglehub.dataset_download("sobhanmoosavi/us-accidents")
csv_file = glob.glob(f'{path}/*.csv')[0]

print(f"Loaded dataset: {csv_file}")

Loaded dataset: C:\Users\jacob\.cache\kagglehub\datasets\sobhanmoosavi\us-accidents\versions\13\US_Accidents_March23.csv


# 2. Load entire dataset (only once) and take reproducible 500k sample

In [3]:
np.random.seed(42)

df = pd.read_csv(csv_file, low_memory=False)

print(f"Total rows in original dataset: {len(df):,}")

sample_size = 500_000
df = df.sample(n=sample_size, random_state=42)

print(f"Sampled rows: {len(df):,}")

Total rows in original dataset: 7,728,394
Sampled rows: 500,000


# 3. Basic initial cleaning

In [4]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')

df['Weather_Condition'] = (
    df['Weather_Condition']
    .astype(str)
    .str.lower()
    .str.strip()
)

severity_map = {
    1: 'Minor',
    2: 'Moderate',
    3: 'Major',
    4: 'Severe'
}

df['Severity_Label'] = df['Severity'].map(severity_map)

df = df.dropna(subset=['Start_Time', 'Weather_Condition', 'Severity'])

print("Finished cleaning step 1 (datetime, weather, severity).")

Finished cleaning step 1 (datetime, weather, severity).


# 4. Save cleaned sample

In [5]:
output_file = 'us_accidents_sample_500k_clean.csv'
df.to_csv(output_file, index=False)

print(f"Saved cleaned sample to: {output_file}")
print(f"Final rows: {len(df):,}")

Saved cleaned sample to: us_accidents_sample_500k_clean.csv
Final rows: 451,874


# dataframe checks

In [6]:
df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Severity_Label
7133276,A-7182628,Source1,1,2020-04-17 09:29:30,2020-04-17 10:29:30,26.7069,-80.11936,26.7069,-80.11936,0.0,...,False,False,False,True,False,Day,Day,Day,Day,Minor
155993,A-156000,Source3,3,2016-08-12 16:45:00,2016-08-12 17:15:00,33.985249,-84.269348,,,0.0,...,False,False,False,False,False,Day,Day,Day,Day,Major
1861414,A-1871277,Source2,3,2019-09-20 15:22:16,2019-09-20 15:56:00,47.118706,-122.556908,,,0.0,...,False,False,False,False,False,Day,Day,Day,Day,Major
2021359,A-2031222,Source2,2,2019-06-03 16:55:43,2019-06-03 18:12:09,33.451355,-111.890343,,,0.0,...,False,False,False,False,False,Day,Day,Day,Day,Moderate
1157640,A-1167415,Source2,2,2021-02-04 12:48:21,2021-02-04 16:51:15,42.44891,-93.721138,,,7.77,...,False,False,False,False,False,Day,Day,Day,Day,Moderate


In [8]:
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 451874 entries, 7133276 to 5228853
Data columns (total 47 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   ID                     451874 non-null  object        
 1   Source                 451874 non-null  object        
 2   Severity               451874 non-null  int64         
 3   Start_Time             451874 non-null  datetime64[ns]
 4   End_Time               451874 non-null  object        
 5   Start_Lat              451874 non-null  float64       
 6   Start_Lng              451874 non-null  float64       
 7   End_Lat                231198 non-null  float64       
 8   End_Lng                231198 non-null  float64       
 9   Distance(mi)           451874 non-null  float64       
 10  Description            451873 non-null  object        
 11  Street                 451298 non-null  object        
 12  City                   451860 non-null  ob

ID                            0
Source                        0
Severity                      0
Start_Time                    0
End_Time                      0
Start_Lat                     0
Start_Lng                     0
End_Lat                  220676
End_Lng                  220676
Distance(mi)                  0
Description                   1
Street                      576
City                         14
County                        0
State                         0
Zipcode                      88
Country                       0
Timezone                    416
Airport_Code               1220
Weather_Timestamp          6833
Temperature(F)             9404
Wind_Chill(F)            127866
Humidity(%)               10016
Pressure(in)               8073
Visibility(mi)            10185
Wind_Direction             9948
Wind_Speed(mph)           35683
Precipitation(in)        140796
Weather_Condition             0
Amenity                       0
Bump                          0
Crossing

In [9]:
df.nunique()

ID                       451874
Source                        3
Severity                      4
Start_Time               439744
End_Time                 445918
Start_Lat                334418
Start_Lng                335192
End_Lat                  186138
End_Lng                  187177
Distance(mi)               9959
Description              373587
Street                    82147
City                       9229
County                     1590
State                        49
Zipcode                  117312
Country                       1
Timezone                      4
Airport_Code               1870
Weather_Timestamp        242113
Temperature(F)              717
Wind_Chill(F)               805
Humidity(%)                 100
Pressure(in)                985
Visibility(mi)               63
Wind_Direction               24
Wind_Speed(mph)             103
Precipitation(in)           170
Weather_Condition           105
Amenity                       2
Bump                          2
Crossing

In [10]:
for thing in df:
    unique = df[thing].unique()
    print(unique)
    print()

['A-7182628' 'A-156000' 'A-1871277' ... 'A-7027703' 'A-4141403'
 'A-5268520']

['Source1' 'Source3' 'Source2']

[1 3 2 4]

<DatetimeArray>
['2020-04-17 09:29:30', '2016-08-12 16:45:00', '2019-09-20 15:22:16',
 '2019-06-03 16:55:43', '2021-02-04 12:48:21', '2022-06-23 10:57:30',
 '2020-09-25 16:48:29', '2022-02-04 19:57:43', '2020-01-14 00:41:00',
 '2018-03-23 07:43:01',
 ...
 '2017-09-20 19:22:27', '2017-06-06 07:18:25', '2020-02-09 03:40:00',
 '2022-09-30 16:10:16', '2020-06-10 08:07:29', '2022-03-09 16:07:01',
 '2022-08-28 09:49:52', '2020-05-13 17:53:29', '2022-05-20 15:41:35',
 '2022-04-18 16:41:08']
Length: 439744, dtype: datetime64[ns]

['2020-04-17 10:29:30' '2016-08-12 17:15:00' '2019-09-20 15:56:00' ...
 '2022-08-28 10:34:38' '2020-05-13 18:08:29' '2022-05-20 17:48:23']

[26.7069   33.985249 47.118706 ... 38.317217 40.646566 29.614077]

[ -80.11936   -84.269348 -122.556908 ...  -77.462997  -74.228173
  -82.340822]

[26.7069         nan 38.884636 ... 44.0335   38.321137 29.6162