Can we predict the severity of an accident based on time, weather, and location data?
    Using Random Forest, Logistic Regression, and SVM (multiclass classification)

In [None]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


### Random Forest

In [12]:
# Random Forest

# Load the cleaned accidents sample from its CSV file
accidents_csv = '../Data/us_accidents_sample_500k_clean.csv'
df = pd.read_csv(accidents_csv)

# Target column: Severity
# (author's note: already stored as int; inspect with df.Severity.value_counts())



In [29]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# Stratifying on Severity keeps the class proportions the same in the
# train and test subsets; it does not rebalance the classes.
train, test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Severity']
)

In [30]:
# Feature columns and target column used by the models
features = ['Weather_Simple', 'Visibility(mi)', 'Sunrise_Sunset', 'State']
target = 'Severity'

# Null counts noted by the author on the full dataset:
#   Visibility(mi)  -> 10185 nulls (rows dropped)
#   Sunrise_Sunset  -> 1166 nulls (rows dropped)
#   Weather_Simple  -> no nulls (labeled as "other" in the data cleaning stage)
#   State           -> no nulls (labeled as "other" in the data cleaning stage)

# Drop incomplete rows from each subset separately, after the split.
# The test subset must be filtered from `test` itself: deriving it from
# `train` would silently turn the held-out set into a copy of the training
# data and make any later evaluation on X_test/y_test meaningless.
required_cols = features + [target]
train = train.dropna(subset=required_cols)
test = test.dropna(subset=required_cols)

In [31]:
# Separate the feature matrix and the label vector for each subset
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

In [33]:
# Preprocessing: one-hot encode the categorical columns and pass the
# numeric column through unchanged
num_features = ['Visibility(mi)']
cat_features = ['Weather_Simple', 'Sunrise_Sunset', 'State']

# handle_unknown='ignore': categories not seen during fit do not raise at transform time
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
numeric_step = ('num', 'passthrough', num_features)

preprocess = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [34]:
# Pipeline: preprocessing followed by a random forest classifier (fixed seed)
forest_steps = [
    ('preprocess', preprocess),
    ('model', RandomForestClassifier(random_state=42)),
]
rand_forest = Pipeline(steps=forest_steps)

In [35]:
# 5-fold cross-validation accuracy, computed on the training subset only
scores = cross_val_score(
    estimator=rand_forest,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print("Cross-Validation Score:", scores)
print("Mean Cross-Validation accuracy:", scores.mean())

Cross-Validation Score: [0.77442777 0.77503439 0.77365876 0.77502021 0.77389985]
Mean Cross-Validation accuracy: 0.7744081958567266


In [39]:
# Logistic regression (multiclass)
# Reuses `features` and `target` as defined in the feature-selection cell:
#   features = ['Weather_Simple', 'Visibility(mi)', 'Sunrise_Sunset', 'State']
#   target = 'Severity'

# Keep only the modelling columns and drop every row with a missing value
data = df[features + [target]].dropna()
X = data[features]  # features
y = data[target]    # target


In [40]:
# 80/20 train/test split with a fixed seed, stratified by the target (Severity)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [45]:
# Preprocessing for the logistic regression model
cat_cols = ['Weather_Simple', 'Sunrise_Sunset', 'State']
num_cols = ['Visibility(mi)']

# Column transformer: one-hot encode the categorical columns, leave the
# numeric column as-is
preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', 'passthrough', num_cols),
])

In [46]:
# Pipeline: preprocessing followed by logistic regression
# (max_iter raised to 1000)
log_model = Pipeline([
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=1000)),
])

In [48]:
# 5-fold cross-validation accuracy on the training split
scores = cross_val_score(log_model, X_train, y_train, scoring='accuracy', cv=5)

print("Cross-Validation scores:", scores)
print("Mean Cross-Validation accuracy:", scores.mean())

Cross-Validation scores: [0.77725616 0.77732707 0.77692995 0.77685904 0.77739483]
Mean Cross-Validation accuracy: 0.7771534100013662


In [None]:
#SVM

In [None]:
# TODO: testing the best ML model
# TODO: hyperparameter tuning

# TODO: refit final model

# TODO: final evaluation