In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Importing and cleaning data

In [2]:
df = pd.read_csv("./datasets/Motor_Vehicle_Crashes.csv")

years_filter = [2022, 2023]
df = df[df['Year'].isin(years_filter)]

# these are the features we want to use, drop cols where these features are None
features = ['Time', 'Lighting Conditions', 'Weather Conditions', 'Road Surface Conditions', 'Road Descriptor']
df = df.dropna(subset=features, how='any')

# take a random sample of the whole data set
df = df.sample(n=100_000, random_state=420)

# convert 'Time' to datetime and calculate the minute of the day (0 to 3600)
df['Minute of Day'] = pd.to_datetime(df['Time'], format='%H:%M').dt.hour * 60 + pd.to_datetime(df['Time'], format='%H:%M').dt.minute

df['Hour of Day'] = pd.to_datetime(df['Time'], format='%H:%M').dt.hour

# 1 if it is an injury or fatal accident, otherwise 0
df['Injury or Fatal Accident'] = df['Crash Descriptor'].apply(lambda x: 1 if x in ['Injury Accident', 'Fatal Accident', 'Property Damage & Injury Accident'] else 0)

df.head()

Unnamed: 0,Year,Crash Descriptor,Time,Date,Day of Week,Police Report,Lighting Conditions,Municipality,Collision Type Descriptor,County Name,...,Weather Conditions,Traffic Control Device,Road Surface Conditions,DOT Reference Marker Location,Pedestrian Bicyclist Action,Event Descriptor,Number of Vehicles Involved,Minute of Day,Hour of Day,Injury or Fatal Accident
1806037,2023,Property Damage Accident,16:56,10/07/2023,Saturday,Y,Daylight,LONG BEACH,OTHER,NASSAU,...,Cloudy,,Wet,,Not Applicable,"Other Motor Vehicle, Collision With",2,1016,16,0
1207349,2022,Property Damage & Injury Accident,14:03,01/23/2022,Sunday,Y,Daylight,OLD WESTBURY,Unknown,NASSAU,...,Clear,Traffic Signal,Dry,,Not Applicable,"Other Motor Vehicle, Collision With",2,843,14,1
1676884,2023,Property Damage Accident,6:01,05/25/2023,Thursday,Y,Dusk,CORINTH,OTHER,SARATOGA,...,Clear,,Wet,,Not Applicable,"Light Support/Utility Pole, Collision With Fix...",1,361,6,0
1803458,2023,Property Damage Accident,7:11,10/04/2023,Wednesday,Y,Daylight,BABYLON,OTHER,SUFFOLK,...,Clear,,Dry,,Not Applicable,"Other Motor Vehicle, Collision With",3,431,7,0
1523143,2022,Property Damage Accident,20:25,12/31/2022,Saturday,Y,Dark-Road Unlighted,JAY,OTHER,ESSEX,...,Rain,No Passing Zone,Wet,9N12031582,Not Applicable,Deer,1,1225,20,0


In [3]:
df.shape

(100000, 21)

In [4]:
features[0] = 'Minute of Day'

X = df[features]
y = df['Injury or Fatal Accident']

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [6]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (70000, 5)
y_train shape: (70000,)
X_test shape: (30000, 5)
y_test shape: (30000,)


# for outputting

In [7]:
def print_stats(y_hat, y_test, model_description=""):
    print(f"Model: {model_description}")
    acc = np.mean(y_hat == y_test)
    print(f'accuracy: {acc}')

    correct_guesses = np.sum(y_hat == y_test)
    false_negatives = np.sum((y_hat == 0) & (y_test == 1))
    false_positives = np.sum((y_hat == 1) & (y_test == 0))
    true_negatives = np.sum((y_hat == 0) & (y_test == 0))
    true_positives = np.sum((y_hat == 1) & (y_test == 1))

    # Calculate total number of samples
    total_samples = len(y_test)

    # Calculate percentages
    correct_guesses_percent = (correct_guesses / total_samples) * 100
    false_negatives_percent = (false_negatives / total_samples) * 100
    false_positives_percent = (false_positives / total_samples) * 100
    true_negatives_percent = (true_negatives / total_samples) * 100
    true_positives_percent = (true_positives / total_samples) * 100

    # Print out the results with percentages
    print(f"Correct guesses: {correct_guesses} ({correct_guesses_percent:.2f}%)")
    print(f"False negatives: {false_negatives} ({false_negatives_percent:.2f}%)")
    print(f"False positives: {false_positives} ({false_positives_percent:.2f}%)")
    print(f"True negatives: {true_negatives} ({true_negatives_percent:.2f}%)")
    print(f"True positives: {true_positives} ({true_positives_percent:.2f}%)")

# Logisitic

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

numeric_features = [features[0]]
categorical_features = features[1:]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', 'passthrough', numeric_features)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])

In [9]:
model.fit(X_train, y_train)

In [10]:
y_hat = model.predict(X_test)

In [11]:
print_stats(y_hat, y_test, "Logisitic")

Model: Logisitic
accuracy: 0.7068
Correct guesses: 21204 (70.68%)
False negatives: 8776 (29.25%)
False positives: 20 (0.07%)
True negatives: 21196 (70.65%)
True positives: 8 (0.03%)


# Random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [13]:
model.fit(X_train, y_train)

In [14]:
y_hat = model.predict(X_test)

In [15]:
print_stats(y_hat, y_test, "Random forest")

Model: Random forest
accuracy: 0.6632
Correct guesses: 19896 (66.32%)
False negatives: 7384 (24.61%)
False positives: 2720 (9.07%)
True negatives: 18496 (61.65%)
True positives: 1400 (4.67%)
