In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cluster import KMeans, DBSCAN

# Path to the accident CSV; being relative, it is resolved against the current working directory.
file_path = "dataset/cleaned_us_accident_data.csv"
# Read the entire file into a single in-memory DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=file_path)

In [10]:
# Report the frame's dimensions, then show a plain-text preview of its first rows.
row_count, column_count = df.shape
print(f"Dataset Loaded: {row_count} rows, {column_count} columns")
print(df.head())

Dataset Loaded: 7472284 rows, 48 columns
    id   source  severity           start_time             end_time  \
0  A-1  source2         3  2016-02-08 05:46:00  2016-02-08 11:00:00   
1  A-2  source2         2  2016-02-08 06:07:59  2016-02-08 06:37:59   
2  A-3  source2         2  2016-02-08 06:49:27  2016-02-08 07:19:27   
3  A-4  source2         3  2016-02-08 07:23:34  2016-02-08 07:53:34   
4  A-5  source2         2  2016-02-08 07:39:07  2016-02-08 08:09:07   

   start_lat  start_lng  distance(mi)  \
0  39.865147 -84.058723          0.01   
1  39.928059 -82.831184          0.01   
2  39.063148 -84.032608          0.01   
3  39.747753 -84.205582          0.01   
4  39.627781 -84.188354          0.01   

                                         description  \
0  Right lane blocked due to accident on I-70 Eas...   
1  Accident on Brice Rd at Tussing Rd. Expect del...   
2  Accident on OH-32 State Route 32 Westbound at ...   
3  Accident on I-75 Southbound at Exits 52 52B US...   
4  Ac

In [11]:
# Print a heading followed by the dtype pandas assigned to each column.
print("Data Types of Each Column:", df.dtypes, sep="\n")

Data Types of Each Column:
id                        object
source                    object
severity                   int64
start_time                object
end_time                  object
start_lat                float64
start_lng                float64
distance(mi)             float64
description               object
street                    object
city                      object
county                    object
state                     object
zipcode                   object
timezone                  object
airport_code              object
weather_timestamp         object
temperature(f)           float64
wind_chill(f)            float64
humidity(%)              float64
pressure(in)             float64
visibility(mi)           float64
wind_direction            object
wind_speed(mph)          float64
precipitation(in)        float64
weather_condition         object
amenity                    int64
bump                       int64
crossing                   int64
give_way        

In [12]:
# Count null entries per column, then display only the columns that contain at least one.
missing_values = df.isna().sum()
print("Missing Values Before Cleaning:")
print(missing_values.loc[missing_values.gt(0)])

Missing Values Before Cleaning:
start_time               714742
end_time                 714742
description                   4
street                    10390
city                        243
weather_condition         15453
sunrise_sunset            19934
civil_twilight            19934
nautical_twilight         19934
astronomical_twilight     19934
start_hour               714742
start_weekday            714742
start_month              714742
start_year               714742
duration(min)            714742
dtype: int64


In [13]:
# Survey the categorical variables: how many distinct values does each one hold?
# A column qualifies when its dtype is neither int64 nor float64 AND it has more
# than one distinct value. The cheap dtype test runs first, so the expensive
# distinct count is computed at most once per column and never for numeric columns.
numeric_dtypes = ['int64', 'float64']
unique_value_counts = {}
for col in df.columns:
    if df[col].dtype in numeric_dtypes:
        continue
    n_unique = df[col].nunique()  # nunique() skips NaN by default
    if n_unique > 1:
        unique_value_counts[col] = n_unique

# Same columns, in the same (frame) order, as the keys of the counts mapping.
categorical_columns = list(unique_value_counts)

print("Unique Value Counts for Categorical Variables:")
for col, count in unique_value_counts.items():
    print(f"{col}: {count} unique values")

Unique Value Counts for Categorical Variables:
id: 7472284 unique values
source: 3 unique values
start_time: 5382863 unique values
end_time: 5859595 unique values
description: 3652997 unique values
street: 329506 unique values
city: 13185 unique values
county: 1857 unique values
state: 49 unique values
zipcode: 805783 unique values
timezone: 4 unique values
airport_code: 2014 unique values
weather_timestamp: 930711 unique values
wind_direction: 23 unique values
weather_condition: 142 unique values
sunrise_sunset: 2 unique values
civil_twilight: 2 unique values
nautical_twilight: 2 unique values
astronomical_twilight: 2 unique values


In [14]:
# For each listed categorical column, print the mean severity and the number of
# records per category level. This is a per-group summary (mean + count), not a
# correlation coefficient. Rows whose grouping key is null are left out by groupby.
grouping_columns = (
    'source',
    'timezone',
    'sunrise_sunset',
    'civil_twilight',
    'nautical_twilight',
    'astronomical_twilight',
)
for col in grouping_columns:
    severity_by_level = df.groupby(col)['severity']
    severity_stats = severity_by_level.agg(['mean', 'count']).reset_index()
    print(severity_stats.to_string(index=False))

 source     mean   count
source1 2.117767 4161516
source2 2.331339 3216401
source3 2.275541   94367
   timezone     mean   count
 us/central 2.235551 1607849
 us/eastern 2.225291 3457823
us/mountain 2.200587  411677
 us/pacific 2.171176 1994935
sunrise_sunset     mean   count
           day 2.208334 5167624
         night 2.219192 2284726
civil_twilight     mean   count
           day 2.208587 5517464
         night 2.220434 1934886
nautical_twilight     mean   count
              day 2.209026 5885950
            night 2.221570 1566400
astronomical_twilight     mean   count
                  day 2.209996 6177286
                night 2.219739 1275064


In [None]:
# Classify weather_condition into broader categories.
# Each category maps to a list of lowercase keywords. A row is flagged 1 for a
# category when any of its keywords occurs as a substring of the row's
# weather_condition text, otherwise 0. Categories are not mutually exclusive:
# for example a "funnel cloud" report matches both 'Cloudy' and 'Extreme'.
weather_categories = {
    'Clear': ['clear', 'fair'],
    'Cloudy': ['cloud', 'overcast'],
    'Foggy': ['fog', 'mist', 'haze'],
    'Windy': ['windy'],
    'Dusty': ['dust', 'sand', 'volcanic ash'],
    'Smoky': ['smoke'],
    'Drizzle': ['drizzle'],
    'Rainy': ['rain'],
    'Snowy': ['snow'],
    'Sleet/Ice': ['sleet', 'freezing', 'ice pellets', 'wintry mix'],
    'Thunderstorms': ['thunder', 'storm'],
    'Extreme': ['tornado', 'funnel cloud', 'squalls']
}

# Normalise the text once: missing conditions become the string 'unknown' (which
# contains none of the keywords above) and everything is lowercased so that it
# can be compared against the lowercase keywords.
df['weather_condition'] = df['weather_condition'].fillna('Unknown').astype(str).str.lower()

# Evaluate the substring test once per DISTINCT condition string instead of once
# per row, then spread the result over the rows with a dictionary lookup. This
# yields the same 0/1 columns as a row-wise apply but does far less Python work
# when the column has few distinct values relative to its length.
distinct_conditions = df['weather_condition'].unique()

for category, keywords in weather_categories.items():
    flag_by_condition = {
        condition: int(any(kw in condition for kw in keywords))
        for condition in distinct_conditions
    }
    # Every value in the column is a key of the lookup, so map() produces no NaN;
    # the explicit cast keeps the indicator dtype integer even for an empty frame.
    df[category] = df['weather_condition'].map(flag_by_condition).astype('int64')

In [None]:
# Model inputs. The order matters: it fixes the column order of the feature matrix
# built from df[features] later on.
features = [
    # location and extent
    'start_lat', 'start_lng', 'distance(mi)',
    # weather measurements
    'temperature(f)', 'wind_chill(f)', 'humidity(%)', 'pressure(in)',
    'visibility(mi)', 'wind_speed(mph)', 'precipitation(in)',
    # road / point-of-interest indicator columns
    'amenity', 'bump', 'crossing', 'give_way', 'junction', 'no_exit', 'railway',
    'roundabout', 'station', 'stop', 'traffic_calming', 'traffic_signal',
    'turning_loop',
    # one indicator column per weather category
    *weather_categories,
]
target = 'severity'

# Keep only rows that have a value for every feature and for the target.
df = df.dropna(subset=[*features, target])

# Identifier, free-text, timestamp, place-name and derived-time columns that are
# not used as model inputs.
unnecessary_columns = [
    'id', 'source', 'start_time', 'end_time', 'description',
    'street', 'city', 'county', 'state', 'zipcode', 'timezone', 'airport_code',
    'weather_timestamp', 'wind_direction', 'weather_condition',
    'sunrise_sunset', 'civil_twilight', 'nautical_twilight', 'astronomical_twilight',
    'start_hour', 'start_weekday', 'start_month', 'start_year', 'duration(min)',
]

# errors='ignore' skips any listed column that is not present, so re-running this
# cell after the columns have already been dropped does not raise.
df = df.drop(columns=unnecessary_columns, errors='ignore')

print(f"Cleaned Dataset: {df.shape[0]} rows, {df.shape[1]} columns")
print("\nColumns Remaining After Feature Selection:", df.columns, sep="\n")



Cleaned Dataset: 7472284 rows, 36 columns
   severity  start_lat  start_lng  distance(mi)  temperature(f)  \
0         3  39.865147 -84.058723          0.01            36.9   
1         2  39.928059 -82.831184          0.01            37.9   
2         2  39.063148 -84.032608          0.01            36.0   
3         3  39.747753 -84.205582          0.01            35.1   
4         2  39.627781 -84.188354          0.01            36.0   

   wind_chill(f)  humidity(%)  pressure(in)  visibility(mi)  wind_speed(mph)  \
0           62.0         91.0         29.68            10.0              7.0   
1           62.0        100.0         29.65            10.0              7.0   
2           33.3        100.0         29.67            10.0              3.5   
3           31.0         96.0         29.64            10.0              4.6   
4           33.3         89.0         29.65            10.0              3.5   

   ...  Foggy  Windy  Dusty  Smoky  Drizzle  Rainy  Snowy  Sleet/Ice  \
0

In [17]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)

print("Training Set Size:", X_train.shape, "Testing Set Size:", X_test.shape)

# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both partitions so no test-set information enters the fit.
# Both X_train and X_test are replaced by the scaler's transformed output.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Training Set Size: (5977827, 35) Testing Set Size: (1494457, 35)


In [18]:
# Train a 100-tree random forest on the scaled training data. The seed is fixed
# for repeatability and n_jobs=-1 requests all available processors.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the held-out rows: overall accuracy first, then per-class precision,
# recall and F1 for each severity level.
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.8587754615890588
              precision    recall  f1-score   support

           1       0.71      0.20      0.32     13056
           2       0.88      0.95      0.92   1190676
           3       0.74      0.55      0.63    252043
           4       0.47      0.21      0.29     38682

    accuracy                           0.86   1494457
   macro avg       0.70      0.48      0.54   1494457
weighted avg       0.85      0.86      0.85   1494457

