In [1]:
import requests
import zipfile
import pandas as pd
from io import BytesIO

url = 'https://github.com/DataScienceAndEngineering/machine-learning-dse-i210-final-project-nyc-car-accident-severity/blob/main/data/processed/Processed_Data_v2.zip?raw=true'

# Download the zip archive into memory.
# The timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() surfaces HTTP errors (404, rate limiting, ...) right here
# instead of as a confusing BadZipFile error when the error page is unzipped.
response = requests.get(url, timeout=60)
response.raise_for_status()
zip_content = BytesIO(response.content)

# Unzip the archive and load the CSV it contains
with zipfile.ZipFile(zip_content, 'r') as zip_ref:
    # Extract all the contents into the current directory
    zip_ref.extractall()

    # Prefer the first *.csv member; fall back to the first entry so an archive
    # whose data file has no .csv suffix still loads exactly as before.
    members = zip_ref.namelist()
    csv_filename = next(
        (name for name in members if name.lower().endswith('.csv')),
        members[0],
    )

    # Read the extracted CSV file into the working DataFrame
    mvc_processed = pd.read_csv(csv_filename)

In [2]:
# Combine Class 1, 2, 3 to make a binary class dataset

# Work on an independent copy: a plain assignment would only alias mvc_processed,
# so the relabelling and column changes applied to mvc_processed_binary would
# also overwrite the loaded data, and re-running this cell would report the
# already-binarised labels as the "original" distribution.
mvc_processed_binary = mvc_processed.copy()

# Display original class distribution
print("Original Class Distribution:")
print(mvc_processed_binary['CLASS TYPE'].value_counts())

# Collapse the multi-class label into a binary one
def combine_classes(label):
    """Return 'Class 0' for the 'Class 0' label and 'Class 1' for any other value."""
    return 'Class 0' if label == 'Class 0' else 'Class 1'

# Relabel every row with its binary class and store the result back in place
binary_labels = mvc_processed_binary['CLASS TYPE'].apply(combine_classes)
mvc_processed_binary['CLASS TYPE'] = binary_labels

# Show how many rows ended up in each of the two classes
print("\nNew Class Distribution:")
print(mvc_processed_binary['CLASS TYPE'].value_counts())

Original Class Distribution:
CLASS TYPE
Class 0    824193
Class 1    231289
Class 2      1030
Class 3       905
Name: count, dtype: int64

New Class Distribution:
CLASS TYPE
Class 0    824193
Class 1    233224
Name: count, dtype: int64


In [3]:
# Parse the CRASH DATE column into pandas datetimes
crash_dates = pd.to_datetime(mvc_processed_binary['CRASH DATE'])
mvc_processed_binary['CRASH DATE'] = crash_dates

# Parse CRASH TIME as HH:MM:SS and keep only the time-of-day component
crash_times = pd.to_datetime(mvc_processed_binary['CRASH TIME'], format='%H:%M:%S')
mvc_processed_binary['CRASH TIME'] = crash_times.dt.time

In [4]:
# Decompose date and time into numeric calendar / clock features

crash_date = mvc_processed_binary['CRASH DATE']
mvc_processed_binary['year'] = crash_date.dt.year
mvc_processed_binary['month'] = crash_date.dt.month
mvc_processed_binary['day'] = crash_date.dt.day
mvc_processed_binary['dayofweek'] = crash_date.dt.dayofweek  # Monday=0, Sunday=6

# CRASH TIME holds datetime.time objects, which the .dt accessor cannot read,
# so round-trip through their 'HH:MM:SS' string form. Passing the explicit
# format avoids pandas' slow per-element format guessing and the UserWarning
# it emits when no format can be inferred.
crash_time = pd.to_datetime(
    mvc_processed_binary['CRASH TIME'].astype(str),
    format='%H:%M:%S',
)

mvc_processed_binary['hour'] = crash_time.dt.hour
mvc_processed_binary['minute'] = crash_time.dt.minute

# The raw date/time columns are now fully represented by the derived features.
# Rebinding (instead of inplace=True) leaves any other reference to the frame intact.
mvc_processed_binary = mvc_processed_binary.drop(columns=['CRASH DATE', 'CRASH TIME'])

  mvc_processed_binary['temp_datetime'] = pd.to_datetime(mvc_processed_binary['CRASH TIME'].astype(str))


In [5]:
# Preview the first five rows after the date/time feature engineering
mvc_processed_binary.head(n=5)

Unnamed: 0,LATITUDE,LONGITUDE,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,CLASS TYPE,year,month,day,dayofweek,hour,minute
0,40.667202,-73.8665,Unspecified,No factor,No factor,No factor,No factor,Sedan,No vehicle,No vehicle,No vehicle,No vehicle,Class 0,2021,9,11,5,9,35
1,40.86816,-73.83148,Unspecified,Unspecified,No factor,No factor,No factor,Sedan,Sedan,No vehicle,No vehicle,No vehicle,Class 1,2021,12,14,1,8,17
2,40.75144,-73.97397,Passing Too Closely,Unspecified,No factor,No factor,No factor,Sedan,Station Wagon/Sport Utility Vehicle,No vehicle,No vehicle,No vehicle,Class 0,2021,12,14,1,14,58
3,40.675884,-73.75577,Turning Improperly,Unspecified,No factor,No factor,No factor,Sedan,Station Wagon/Sport Utility Vehicle,No vehicle,No vehicle,No vehicle,Class 0,2021,12,14,1,16,50
4,40.87262,-73.904686,Unspecified,Unspecified,No factor,No factor,No factor,Station Wagon/Sport Utility Vehicle,Sedan,No vehicle,No vehicle,No vehicle,Class 1,2021,12,11,5,19,43


In [6]:
# Build the model inputs: an integer-encoded target and one-hot encoded features

from sklearn.preprocessing import LabelEncoder

# Target labels, and every remaining column as a candidate feature
y = mvc_processed_binary['CLASS TYPE']
X = mvc_processed_binary.drop(columns=['CLASS TYPE'])

# Map the string class labels to integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One-hot encode the non-numeric columns; numeric columns pass through unchanged
X_encoded = pd.get_dummies(X)

In [7]:
# Split data into training and testing sets (80/20), stratified on the class label

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Standardize the features using statistics learned from the training split only

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Balance the training set by randomly undersampling the majority class

from sklearn.utils import resample
import numpy as np

# Rebuild a DataFrame from the scaled training matrix and attach the encoded target
train_df = pd.DataFrame(X_train_scaled)
train_df['CLASS TYPE'] = y_train

# Split rows by encoded label: label 0 is treated as the majority class
# and label 1 as the minority class
labels = train_df['CLASS TYPE']
df_majority = train_df.loc[labels == 0]
df_minority = train_df.loc[labels == 1]

# Draw, without replacement, as many majority rows as there are minority rows
n_minority = len(df_minority)
df_majority_undersampled = resample(
    df_majority,
    replace=False,
    n_samples=n_minority,
    random_state=123,
)

# Stack both classes, shuffle the rows, and renumber the index
df_balanced = (
    pd.concat([df_majority_undersampled, df_minority])
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)

# Split the balanced frame back into target and features
y_train_balanced = df_balanced['CLASS TYPE']
X_train_balanced = df_balanced.drop(columns=['CLASS TYPE'])

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Baseline model: XGBoost classifier with default hyperparameters and a fixed seed
xgb_clf = XGBClassifier(random_state=42)

# Train on the undersampled (class-balanced) training data
xgb_clf.fit(X_train_balanced, y_train_balanced)

# Predictions for the balanced training data and for the held-out test set
y_train_pred_xgb = xgb_clf.predict(X_train_balanced)
y_test_pred_xgb = xgb_clf.predict(X_test_scaled)

# Build both reports first, then print them in the same order as before
train_report_xgb = classification_report(y_train_balanced, y_train_pred_xgb)
test_report_xgb = classification_report(y_test, y_test_pred_xgb)
print("Training Classification Report:\n", train_report_xgb)
print("Test Classification Report:\n", test_report_xgb)


Training Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.79      0.74    186579
           1       0.76      0.66      0.71    186579

    accuracy                           0.73    373158
   macro avg       0.73      0.73      0.72    373158
weighted avg       0.73      0.73      0.72    373158

Test Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.78      0.83    164839
           1       0.45      0.65      0.53     46645

    accuracy                           0.75    211484
   macro avg       0.67      0.71      0.68    211484
weighted avg       0.79      0.75      0.76    211484



In [11]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Level-0 learners: a 100-tree random forest and an XGBoost classifier, both seeded
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
xgboost = XGBClassifier(random_state=42)

# Level-1 learner that combines the base models' predictions
meta_model = LogisticRegression()

# Assemble the stacked ensemble from the named base models and the meta-model
base_estimators = [('rf', random_forest), ('xgb', xgboost)]
stacking_clf = StackingClassifier(estimators=base_estimators, final_estimator=meta_model)

# Train on the undersampled (class-balanced) training data
stacking_clf.fit(X_train_balanced, y_train_balanced)

# Predictions for the balanced training data and for the held-out test set
y_train_pred = stacking_clf.predict(X_train_balanced)
y_test_pred = stacking_clf.predict(X_test_scaled)

# Build both reports first, then print them in the same order as before
train_report = classification_report(y_train_balanced, y_train_pred)
test_report = classification_report(y_test, y_test_pred)
print("Training Classification Report:\n", train_report)
print("Test Classification Report:\n", test_report)

Training Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.86      0.82    186579
           1       0.85      0.78      0.81    186579

    accuracy                           0.82    373158
   macro avg       0.82      0.82      0.82    373158
weighted avg       0.82      0.82      0.82    373158

Test Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.77      0.82    164839
           1       0.45      0.66      0.53     46645

    accuracy                           0.74    211484
   macro avg       0.67      0.71      0.68    211484
weighted avg       0.79      0.74      0.76    211484

