In [1]:
import requests
import zipfile
import pandas as pd
from io import BytesIO

url = 'https://github.com/DataScienceAndEngineering/machine-learning-dse-i210-final-project-nyc-car-accident-severity/blob/main/data/processed/Processed_Data_v2.zip?raw=true'

# Download the zip file.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting an error
# page reach ZipFile and fail there with a misleading BadZipFile.
response = requests.get(url, timeout=60)
response.raise_for_status()
zip_content = BytesIO(response.content)

# Unzip the file
with zipfile.ZipFile(zip_content, 'r') as zip_ref:
    # Extract all the contents into the current directory
    zip_ref.extractall()

    # Select the first CSV member explicitly; entry 0 of an archive is not
    # guaranteed to be the data file (it can be a directory or metadata entry).
    csv_members = [name for name in zip_ref.namelist() if name.lower().endswith('.csv')]
    if not csv_members:
        raise ValueError('No CSV file found in the downloaded archive')
    csv_filename = csv_members[0]

    # Read the CSV file (extracted above, so the member name is a valid relative path)
    mvc_processed = pd.read_csv(csv_filename)

In [2]:
# Combine Class 1, 2, 3 to make a binary class dataset

# Work on a copy: a plain assignment would only create a second name for
# mvc_processed, so the relabelling and column changes applied to
# mvc_processed_binary would silently overwrite the loaded frame as well
# (and the "original" distribution printed below could not be reproduced
# by re-running this cell).
mvc_processed_binary = mvc_processed.copy()

# Display original class distribution
print("Original Class Distribution:")
print(mvc_processed_binary['CLASS TYPE'].value_counts())

# Function to combine classes
def combine_classes(label):
    """Map a class label onto the binary scheme.

    'Class 0' is returned unchanged; every other value is returned as 'Class 1'.
    """
    return 'Class 0' if label == 'Class 0' else 'Class 1'

# Relabel the 'CLASS TYPE' column element-wise with combine_classes
binary_labels = mvc_processed_binary['CLASS TYPE'].apply(combine_classes)
mvc_processed_binary['CLASS TYPE'] = binary_labels

# Show the class counts after merging
print("\nNew Class Distribution:")
print(mvc_processed_binary['CLASS TYPE'].value_counts())

Original Class Distribution:
CLASS TYPE
Class 0    824193
Class 1    231289
Class 2      1030
Class 3       905
Name: count, dtype: int64

New Class Distribution:
CLASS TYPE
Class 0    824193
Class 1    233224
Name: count, dtype: int64


In [3]:
# Parse CRASH DATE into datetime64 values (format left for pandas to infer)
crash_dates = pd.to_datetime(mvc_processed_binary['CRASH DATE'])
mvc_processed_binary['CRASH DATE'] = crash_dates

# Parse CRASH TIME as HH:MM:SS and keep only the time-of-day component
crash_times = pd.to_datetime(mvc_processed_binary['CRASH TIME'], format='%H:%M:%S')
mvc_processed_binary['CRASH TIME'] = crash_times.dt.time

In [4]:
# Decompose date and time

crash_date = mvc_processed_binary['CRASH DATE']
mvc_processed_binary['year'] = crash_date.dt.year
mvc_processed_binary['month'] = crash_date.dt.month
mvc_processed_binary['day'] = crash_date.dt.day
mvc_processed_binary['dayofweek'] = crash_date.dt.dayofweek  # Monday=0, Sunday=6

# The .dt accessor does not work on datetime.time objects, so CRASH TIME is
# rendered back to strings and re-parsed. The format is passed explicitly so
# pandas does not have to guess it. The parsed values are held in a local
# Series rather than a temporary DataFrame column.
crash_time = pd.to_datetime(mvc_processed_binary['CRASH TIME'].astype(str), format='%H:%M:%S')

mvc_processed_binary['hour'] = crash_time.dt.hour
mvc_processed_binary['minute'] = crash_time.dt.minute

# Drop the raw date/time columns now that their components are separate features.
# Seconds are intentionally not kept as a feature.
mvc_processed_binary = mvc_processed_binary.drop(columns=['CRASH DATE', 'CRASH TIME'])

  mvc_processed_binary['temp_datetime'] = pd.to_datetime(mvc_processed_binary['CRASH TIME'].astype(str))


In [5]:
# Preview the first rows of the frame after the date/time decomposition
mvc_processed_binary.head()

Unnamed: 0,LATITUDE,LONGITUDE,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,CLASS TYPE,year,month,day,dayofweek,hour,minute
0,40.667202,-73.8665,Unspecified,No factor,No factor,No factor,No factor,Sedan,No vehicle,No vehicle,No vehicle,No vehicle,Class 0,2021,9,11,5,9,35
1,40.86816,-73.83148,Unspecified,Unspecified,No factor,No factor,No factor,Sedan,Sedan,No vehicle,No vehicle,No vehicle,Class 1,2021,12,14,1,8,17
2,40.75144,-73.97397,Passing Too Closely,Unspecified,No factor,No factor,No factor,Sedan,Station Wagon/Sport Utility Vehicle,No vehicle,No vehicle,No vehicle,Class 0,2021,12,14,1,14,58
3,40.675884,-73.75577,Turning Improperly,Unspecified,No factor,No factor,No factor,Sedan,Station Wagon/Sport Utility Vehicle,No vehicle,No vehicle,No vehicle,Class 0,2021,12,14,1,16,50
4,40.87262,-73.904686,Unspecified,Unspecified,No factor,No factor,No factor,Station Wagon/Sport Utility Vehicle,Sedan,No vehicle,No vehicle,No vehicle,Class 1,2021,12,11,5,19,43


In [6]:
# Encode the features and targets

from sklearn.preprocessing import LabelEncoder

# Separate the target column from the predictor columns
y = mvc_processed_binary['CLASS TYPE']
X = mvc_processed_binary.drop(columns=['CLASS TYPE'])

# Turn the string class labels into integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One-hot encode the categorical predictor columns
X_encoded = pd.get_dummies(X)

In [7]:
# Split data into training and testing sets

from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the class labels, with a fixed seed
split_parts = train_test_split(
    X_encoded,
    y_encoded,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Scale the features

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both the training and the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Establish baseline logistic regression model (binary classification)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build the baseline classifier with default settings and train it on the
# scaled training split
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

# Predict the scaled test split
y_pred = lr_model.predict(X_test_scaled)

# Compute each metric on the same (y_test, y_pred) pair
accuracy, conf_matrix, report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

Accuracy: 0.8158584100924893
Confusion Matrix:
 [[159702   5137]
 [ 33806  12839]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.89    164839
           1       0.71      0.28      0.40     46645

    accuracy                           0.82    211484
   macro avg       0.77      0.62      0.64    211484
weighted avg       0.80      0.82      0.78    211484



In [10]:
# Undersampling the majority class

from sklearn.utils import resample
import numpy as np

# Put the scaled training features and the target into one frame so rows can
# be sampled together with their labels
train_df = pd.DataFrame(X_train_scaled)
train_df['CLASS TYPE'] = y_train

# Partition the rows by class label (0 is treated as the majority class)
is_majority = train_df['CLASS TYPE'] == 0
is_minority = train_df['CLASS TYPE'] == 1
df_majority = train_df[is_majority]
df_minority = train_df[is_minority]

# Draw, without replacement, as many majority rows as there are minority rows
df_majority_undersampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=123,
)

# Stack both classes, then shuffle the rows and renumber the index
df_balanced = pd.concat([df_majority_undersampled, df_minority])
df_balanced = df_balanced.sample(frac=1, random_state=123).reset_index(drop=True)

# Split the balanced frame back into target and features
y_train_balanced = df_balanced['CLASS TYPE']
X_train_balanced = df_balanced.drop(columns='CLASS TYPE')

In [11]:
from sklearn.linear_model import LogisticRegression

# Train a default logistic regression on the undersampled training data
lr = LogisticRegression()
lr.fit(X_train_balanced, y_train_balanced)

# Predictions for the balanced training set and for the scaled test set
y_train_pred = lr.predict(X_train_balanced)
y_test_pred = lr.predict(X_test_scaled)

# Per-class metrics: training split first, then the test split
train_report = classification_report(y_train_balanced, y_train_pred)
test_report = classification_report(y_test, y_test_pred)
print("Training Classification Report:\n", train_report)
print("Test Classification Report:\n", test_report)


Training Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.72      0.69    186579
           1       0.69      0.63      0.66    186579

    accuracy                           0.68    373158
   macro avg       0.68      0.68      0.68    373158
weighted avg       0.68      0.68      0.68    373158

Test Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.72      0.79    164839
           1       0.39      0.64      0.49     46645

    accuracy                           0.70    211484
   macro avg       0.63      0.68      0.64    211484
weighted avg       0.77      0.70      0.72    211484



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Random forest with 100 trees and a fixed seed, trained on the balanced data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_balanced, y_train_balanced)

# Predictions for the balanced training set and for the scaled test set
y_train_pred = rf.predict(X_train_balanced)
y_test_pred = rf.predict(X_test_scaled)

# Per-class metrics for both splits
train_report = classification_report(y_train_balanced, y_train_pred)
test_report = classification_report(y_test, y_test_pred)
print("Training Classification Report:\n", train_report)
print("Test Classification Report:\n", test_report)

# Overall accuracy for both splits
train_accuracy = accuracy_score(y_train_balanced, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Training Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    186579
           1       1.00      1.00      1.00    186579

    accuracy                           1.00    373158
   macro avg       1.00      1.00      1.00    373158
weighted avg       1.00      1.00      1.00    373158

Test Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.76      0.82    164839
           1       0.43      0.64      0.52     46645

    accuracy                           0.74    211484
   macro avg       0.66      0.70      0.67    211484
weighted avg       0.78      0.74      0.75    211484

Training Accuracy: 0.9999839210200505
Test Accuracy: 0.7369824667587146


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint

# Define the parameter distribution
# (scipy's randint(low, high) draws integers from low up to high - 1)
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV: 100 sampled configurations, 3-fold CV each
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=100, cv=3, n_jobs=-1, verbose=2, random_state=42)

# Fit the RandomizedSearchCV on the balanced training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# RandomizedSearchCV uses refit=True by default, so after fit() it has already
# retrained a clone of `rf` (random_state=42) with best_params on the whole
# balanced training set. Reusing that estimator avoids training the same
# forest a second time.
rf_best = random_search.best_estimator_

# Predict on the training set
y_train_pred = rf_best.predict(X_train_balanced)

# Predict on the test set
y_test_pred = rf_best.predict(X_test_scaled)

# Print classification reports
print("Training Classification Report:\n", classification_report(y_train_balanced, y_train_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

# Optionally, print the accuracy
print("Training Accuracy:", accuracy_score(y_train_balanced, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))


Fitting 3 folds for each of 100 candidates, totalling 300 fits




Best Parameters: {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 287}
Training Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.89      0.83    186579
           1       0.87      0.76      0.81    186579

    accuracy                           0.82    373158
   macro avg       0.83      0.82      0.82    373158
weighted avg       0.83      0.82      0.82    373158

Test Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.79      0.83    164839
           1       0.46      0.63      0.53     46645

    accuracy                           0.76    211484
   macro avg       0.67      0.71      0.68    211484
weighted avg       0.79      0.76      0.77    211484

Training Accuracy: 0.8213544932709469
Test Accuracy: 0.7561044807172174
