In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt



import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the cleaned accident data
df = pd.read_csv('D:/accident_severity_analytics/data/processed/Cleaning_Crashes_Data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142276 entries, 60 to 2076250
Data columns (total 21 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   crash date                     142276 non-null  object 
 1   crash time                     142276 non-null  object 
 2   borough                        142276 non-null  object 
 3   latitude                       142276 non-null  float64
 4   longitude                      142276 non-null  float64
 5   on street name                 142276 non-null  object 
 6   cross street name              142276 non-null  object 
 7   number of persons injured      142276 non-null  int64  
 8   number of persons killed       142276 non-null  int64  
 9   number of pedestrians injured  142276 non-null  int64  
 10  number of pedestrians killed   142276 non-null  int64  
 11  number of cyclist injured      142276 non-null  int64  
 12  number of cyclist killed       14

In [3]:
df.describe()

Unnamed: 0,latitude,longitude,number of persons injured,number of persons killed,number of pedestrians injured,number of pedestrians killed,number of cyclist injured,number of cyclist killed,number of motorist injured,number of motorist killed,collision_id,Severity_score
count,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0
mean,40.658501,-73.792514,0.328088,0.000696,0.002622,9.8e-05,0.045995,0.000141,0.273588,0.000436,2912222.0,0.331567
std,1.741243,3.158349,0.76439,0.02767,0.065373,0.009919,0.212143,0.011856,0.744308,0.022492,1561182.0,0.781029
min,0.0,-74.252876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0
25%,40.683777,-73.979404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2811218.0,0.0
50%,40.739706,-73.944547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3447761.0,0.0
75%,40.771393,-73.883623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4022989.0,0.0
max,40.912468,0.0,24.0,3.0,9.0,1.0,3.0,1.0,24.0,3.0,4712957.0,24.0


In [4]:
zero_percentage = (df == 0).sum() / len(df) * 100
print(zero_percentage)

crash date                        0.000000
crash time                        0.000000
borough                           0.000000
latitude                          0.182743
longitude                         0.182743
on street name                    0.000000
cross street name                 0.000000
number of persons injured        77.922489
number of persons killed         99.933228
number of pedestrians injured    99.784222
number of pedestrians killed     99.990160
number of cyclist injured        95.456015
number of cyclist killed         99.985943
number of motorist injured       83.117321
number of motorist killed        99.959234
contributing factor vehicle 1     0.000000
contributing factor vehicle 2     0.000000
collision_id                      0.000000
vehicle type code 1               0.000000
vehicle type code 2               0.000000
Severity_score                   77.886643
dtype: float64


In [6]:
# Drop columns with more than 99% zeros
drop_cols = ['number of persons killed', 'number of pedestrians killed', 
             'number of cyclist killed', 'number of motorist killed']
df.drop(columns=drop_cols, inplace=True)

In [7]:
# Convert "number of persons injured" into categorical labels
def categorize_injury(x):
    if x == 0:
        return 'No Injury'
    elif x <= 2:
        return 'Minor Injury'
    else:
        return 'Severe Injury'

df['injury_category'] = df['number of persons injured'].apply(categorize_injury)


In [8]:
# Drop the original column after conversion
df.drop(columns=['number of persons injured'], inplace=True)

In [9]:
# Encode categorical variables using Label Encoding
categorical_cols = ['borough', 'on street name', 'cross street name', 
                    'contributing factor vehicle 1', 'contributing factor vehicle 2', 
                    'vehicle type code 1', 'vehicle type code 2', 'injury_category']

for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

In [10]:
# Define features (X) and target variable (y)
X = df.drop(columns=['injury_category'])  # Features
y = df['injury_category']  # Target variable

In [11]:
# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [14]:
# Convert 'crash time' to datetime format first
df['crash time'] = pd.to_datetime(df['crash time'])

# Extract only the hour (0-23)
df['crash hour'] = df['crash time'].dt.hour

# Drop the original 'crash time' column
df.drop(columns=['crash time'], inplace=True)


In [15]:
# Balance the dataset using SMOTE
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

ValueError: could not convert string to float: '2018-06-23'

In [5]:
# Convert categorical features to numerical using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

In [6]:
# Define the target variable
y = df['fatal_accident']

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [9]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1')
grid_search.fit(X_resampled, y_resampled)

In [None]:
# Best model after hyperparameter tuning
best_model = grid_search.best_estimator_

In [None]:
# Train the best model
best_model.fit(X_resampled, y_resampled)

In [None]:
# Predict on the test set
y_pred = best_model.predict(X_test)

In [None]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28438
           1       1.00      0.89      0.94        18

    accuracy                           1.00     28456
   macro avg       1.00      0.94      0.97     28456
weighted avg       1.00      1.00      1.00     28456

Confusion Matrix:
[[28438     0]
 [    2    16]]


In [None]:
# Calculate ROC AUC score
y_pred_prob = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)
print("ROC AUC Score:", roc_auc)

Accuracy: 99.99%


In [None]:
# Plotting ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()


In [9]:
df.describe()


Unnamed: 0,latitude,longitude,number of persons injured,number of persons killed,number of pedestrians injured,number of pedestrians killed,number of cyclist injured,number of cyclist killed,number of motorist injured,number of motorist killed,collision_id,Severity_score,fatal_accident
count,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0,142276.0
mean,40.658501,-73.792514,0.328088,0.000696,0.002622,9.8e-05,0.045995,0.000141,0.273588,0.000436,2912222.0,0.331567,0.000668
std,1.741243,3.158349,0.76439,0.02767,0.065373,0.009919,0.212143,0.011856,0.744308,0.022492,1561182.0,0.781029,0.025832
min,0.0,-74.252876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0
25%,40.683777,-73.979404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2811218.0,0.0,0.0
50%,40.739706,-73.944547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3447761.0,0.0,0.0
75%,40.771393,-73.883623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4022989.0,0.0,0.0
max,40.912468,0.0,24.0,3.0,9.0,1.0,3.0,1.0,24.0,3.0,4712957.0,24.0,1.0


crash date                        0.000000
crash time                        0.000000
borough                           0.000000
latitude                          0.182743
longitude                         0.182743
on street name                    0.000000
cross street name                 0.000000
number of persons injured        77.922489
number of persons killed         99.933228
number of pedestrians injured    99.784222
number of pedestrians killed     99.990160
number of cyclist injured        95.456015
number of cyclist killed         99.985943
number of motorist injured       83.117321
number of motorist killed        99.959234
contributing factor vehicle 1     0.000000
contributing factor vehicle 2     0.000000
collision_id                      0.000000
vehicle type code 1               0.000000
vehicle type code 2               0.000000
Severity_score                   77.886643
fatal_accident                   99.933228
dtype: float64
