Can we predict the severity of an accident based on time, weather, and location data?
    Using Random Forest, Logistic Regression, and SVM (multiclass classification)

In [2]:
#imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

### Random Forest

In [3]:
# Random Forest: load the data and build the features every model below relies on

# Load the cleaned accident sample (path is relative to this notebook)
df = pd.read_csv('../Data/us_accidents_sample_500k_clean.csv')

# Target column is `Severity`
# (df.Severity.value_counts() was used to confirm it is already integer-coded)

# Parse the timestamp once, then derive the calendar features from it
start_time = pd.to_datetime(df['Start_Time'])
df['Start_Time'] = start_time
df['Hour'] = start_time.dt.hour
df['Day_of_Week'] = start_time.dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['Month'] = start_time.dt.month

# Collapse infrequent categories into a single bucket to limit one-hot width.
# States seen fewer than 500 times become 'Other_State'.
state_counts = df['State'].value_counts()
rare_states = state_counts[state_counts.lt(500)].index
df['State'] = df['State'].mask(df['State'].isin(rare_states), 'Other_State')

# Weather labels seen fewer than 1000 times become 'Other_Weather'.
weather_counts = df['Weather_Simple'].value_counts()
rare_weather = weather_counts[weather_counts.lt(1000)].index
df['Weather_Simple'] = df['Weather_Simple'].mask(
    df['Weather_Simple'].isin(rare_weather), 'Other_Weather'
)

# Optional: 20% subsample, stratified on Severity, for quick experiments
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(
    n_splits=1,
    train_size=0.2,
    random_state=42,
)
train_idx, _ = next(iter(sss.split(X=df, y=df['Severity'])))
df_sample = df.iloc[train_idx].copy()
print('Subsample size for experiments:', len(df_sample))

# Display the available columns (including the three derived time features)
df.columns

Subsample size for experiments: 90374


Index(['ID', 'Severity', 'Start_Time', 'End_Time', 'Lat', 'Lng', 'Street',
       'City', 'County', 'State', 'Wind_Chill(F)', 'Humidity(%)',
       'Pressure(in)', 'Sunrise_Sunset', 'Severity_Label', 'Is_Day',
       'Temperature(F)', 'Visibility(mi)', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Weather_Simple', 'Hour',
       'Day_of_Week', 'Month'],
      dtype='object')

In [4]:
# Build a 20% subsample of `df`, stratified on Severity so each class keeps its share.
# NOTE(review): this repeats the subsample the previous cell already created (same
# splitter settings and seed), and no later cell visible in this notebook reads
# `df_sample` -- the models below are all fit on splits of the full `df`.
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(
    n_splits=1,
    train_size=0.2,
    random_state=42,
)
# n_splits=1, so the generator yields exactly one (kept, discarded) index pair
sample_idx, _ = next(iter(sss.split(X=df, y=df['Severity'])))
df_sample = df.iloc[sample_idx].copy()
print('Stratified subsample created: df_sample with', len(df_sample), 'rows')


Stratified subsample created: df_sample with 90374 rows


In [5]:
# Hold out 20% of the rows as a test partition.
# `stratify` keeps the proportion of each Severity class the same in both partitions;
# the fixed seed makes the split reproducible.
# NOTE(review): the following cell performs this identical split again after
# defining `features`, so this cell is redundant.
train, test = train_test_split(df, test_size=0.20, random_state=42, stratify=df['Severity'])

In [6]:
# Label to predict and the columns used as model inputs
target = 'Severity'
features = [
    'Weather_Simple', 'Visibility(mi)', 'Sunrise_Sunset', 'State',
    'Hour', 'Day_of_Week', 'Month',
    'Precipitation(in)', 'Humidity(%)', 'Wind_Speed(mph)',
]

# Report how many values are missing in each selected feature (counted on the full frame)
print("Null values per feature:")
for col, n_missing in df[features].isnull().sum().items():
    print(f"{col}: {n_missing}")

# Stratified 80/20 split: class proportions of Severity are preserved in both partitions
train, test = train_test_split(df, test_size=0.20, random_state=42, stratify=df[target])

# Remove incomplete rows from each partition independently, after the split
required_cols = features + [target]
train = train.dropna(subset=required_cols)
test = test.dropna(subset=required_cols)
print(f"\nTraining set size after dropping NAs: {len(train)}")
print(f"Test set size after dropping NAs: {len(test)}")

Null values per feature:
Weather_Simple: 0
Visibility(mi): 10185
Sunrise_Sunset: 1166
State: 0
Hour: 0
Day_of_Week: 0
Month: 0
Precipitation(in): 0
Humidity(%): 10016
Wind_Speed(mph): 0

Training set size after dropping NAs: 350888
Test set size after dropping NAs: 87662


In [7]:
# Separate model inputs (X) from labels (y) for the training and test partitions
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

In [8]:
# Preprocessing: one-hot encode the categorical columns, impute + scale the numeric ones

cat_features = ['Weather_Simple', 'Sunrise_Sunset', 'State']
num_features = [
    'Visibility(mi)', 'Precipitation(in)', 'Humidity(%)', 'Wind_Speed(mph)',
    'Hour', 'Day_of_Week', 'Month',
]

# Numeric branch: fill gaps with the column median, then standardise.
# Rows with missing feature values are dropped right after the split, so the
# imputer only acts as a safeguard for data that still contains gaps.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# handle_unknown='ignore': a category not seen during fit is encoded as all zeros
# instead of raising; remainder='drop' discards every column not listed above.
preprocess = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
        ('num', numeric_transformer, num_features),
    ],
    remainder='drop',
)

In [11]:
# Random Forest pipeline: shared preprocessing followed by the classifier
rand_forest = Pipeline([
    ('preprocess', preprocess),
    (
        'model',
        RandomForestClassifier(
            n_estimators=100,
            max_depth=12,
            max_features='sqrt',
            n_jobs=-1,        # train trees on all available CPU cores
            random_state=42,  # fixed seed for reproducible forests
        ),
    ),
])

In [10]:
# 3-fold cross-validation on the training partition only; the test set is not touched here
scores = cross_val_score(
    estimator=rand_forest,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
print("Cross-Validation Score:", scores)
print("Mean Cross-Validation accuracy:", scores.mean())


Cross-Validation Score: [0.77791267 0.77768183 0.77787658]
Mean Cross-Validation accuracy: 0.777823693180964


### Logistic Regression

In [66]:
# Logistic regression (multiclass): rebuild the train/test split from the full frame
X, y = df[features], df[target]

# 80/20 split, stratified on the target (Severity), with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 361499
Test set size: 90375


In [None]:

# Drop rows with a missing feature or label, separately for each partition.
# Features and labels are joined first so both stay aligned after filtering.
complete_cols = features + [target]

train_data = pd.concat([X_train, y_train], axis=1).dropna(subset=complete_cols)
X_train, y_train = train_data[features], train_data[target]

test_data = pd.concat([X_test, y_test], axis=1).dropna(subset=complete_cols)
X_test, y_test = test_data[features], test_data[target]

In [68]:
# Preprocessing for logistic regression.
# The column lists are declared in this cell and used directly below, so the
# transformer does not depend on the Random Forest section's `cat_features`.
# `numeric_transformer` (median imputation + standard scaling) is still reused
# from the earlier preprocessing cell.
cat_cols = ['Weather_Simple', 'Sunrise_Sunset', 'State']
num_cols = ['Visibility(mi)', 'Precipitation(in)', 'Humidity(%)', 'Wind_Speed(mph)', 'Hour', 'Day_of_Week', 'Month']

# One-hot encode categoricals (unseen categories encode as all zeros),
# impute + scale numerics, and drop every other column.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ('num', numeric_transformer, num_cols)
    ],
    remainder='drop'
)

In [69]:
# Logistic regression pipeline: preprocessing followed by the classifier.
# max_iter is raised above scikit-learn's default of 100 to give the solver
# more room to converge.
log_model = Pipeline([
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=1000)),
])

In [70]:
# 3-fold cross-validated accuracy of the logistic regression pipeline (training partition only)
scores = cross_val_score(
    estimator=log_model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

print("Cross-Validation scores:", scores)
print("Mean Cross-Validation accuracy:", scores.mean())

Cross-Validation scores: [0.77740824 0.77718595 0.77742344]
Mean Cross-Validation accuracy: 0.7773392081924388


### SVM

In [None]:
# SVM (linear)
# LinearSVC replaces SVC(kernel='linear', probability=True):
#   * SVC's fit time grows at least quadratically with the number of samples, which
#     is impractical for a training partition of several hundred thousand rows
#     cross-validated three times.
#   * probability=True adds an internal 5-fold cross-validation per fit, yet only
#     accuracy is scored below, so the probabilities were never used.
# Note: LinearSVC handles multiclass one-vs-rest and exposes no predict_proba.
from sklearn.svm import LinearSVC

# Use the same split as logistic regression. Dropping incomplete rows again is a
# no-op when the earlier drop cell has run, but keeps this cell safe if it has not.
train_data_svm = pd.concat([X_train, y_train], axis=1).dropna(subset=features + [target])
X_train_svm = train_data_svm[features]
y_train_svm = train_data_svm[target]

test_data_svm = pd.concat([X_test, y_test], axis=1).dropna(subset=features + [target])
X_test_svm = test_data_svm[features]
y_test_svm = test_data_svm[target]

# Pipeline for SVM.
# dual=False solves the primal problem, which scikit-learn recommends when there
# are more samples than features; random_state is kept for consistency but only
# influences the dual solver.
svm_model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LinearSVC(dual=False, random_state=42))
])

# Cross validation (fewer folds for speed, folds run in parallel)
scores_svm = cross_val_score(svm_model, X_train_svm, y_train_svm, cv=3, scoring='accuracy', n_jobs=-1)

print("SVM Cross-Validation scores:", scores_svm)
print("Mean SVM Cross-Validation accuracy:", scores_svm.mean())

### Best Model:

In [None]:
# TODO: test the best ML model
# TODO: hyperparameter tuning

# TODO: refit the final model

# TODO: final evaluation