In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Importing and cleaning data

In [None]:
df = pd.read_csv("./datasets/Motor_Vehicle_Crashes.csv")

years_filter = [2022, 2023]
df = df[df['Year'].isin(years_filter)]

# these are the features we want to use, drop cols where these features are None
features = ['Time', 'Lighting Conditions', 'Weather Conditions', 'Road Surface Conditions']
df = df.dropna(subset=features, how='any')

# take a random sample of the whole data set
# df = df.sample(n=100_000, random_state=420)

# convert 'Time' to datetime and calculate the minute of the day (0 to 3600)
df['Minute of Day'] = pd.to_datetime(df['Time'], format='%H:%M').dt.hour * 60 + pd.to_datetime(df['Time'], format='%H:%M').dt.minute

# 1 if it is an injury or fatal accident, otherwise 0
df['Injury or Fatal Accident'] = df['Crash Descriptor'].apply(lambda x: 1 if x in ['Injury Accident', 'Fatal Accident', 'Property Damage & Injury Accident'] else 0)

df.head()

Unnamed: 0,Year,Crash Descriptor,Time,Date,Day of Week,Police Report,Lighting Conditions,Municipality,Collision Type Descriptor,County Name,Road Descriptor,Weather Conditions,Traffic Control Device,Road Surface Conditions,DOT Reference Marker Location,Pedestrian Bicyclist Action,Event Descriptor,Number of Vehicles Involved,Minute of Day,Injury or Fatal Accident
1146570,2022,Property Damage Accident,0:55,01/01/2022,Saturday,Y,Dark-Road Lighted,HAMBURG,OVERTAKING,ERIE,Straight and Level,Clear,,Dry,75 53011183,Not Applicable,"Other Motor Vehicle, Collision With",2,55,0
1146571,2022,Property Damage Accident,4:30,01/01/2022,Saturday,Y,Dark-Road Unlighted,AMHERST,OTHER,ERIE,Curve and Level,Cloudy,,Wet,62 53034029,Not Applicable,"Light Support/Utility Pole, Collision With Fix...",1,270,0
1146572,2022,Property Damage Accident,12:14,01/01/2022,Saturday,Y,Daylight,AMHERST,OTHER,ERIE,Straight and Level,Cloudy,,Wet,324 53021175,Not Applicable,"Other Motor Vehicle, Collision With",2,734,0
1146573,2022,Property Damage Accident,4:05,01/01/2022,Saturday,Y,Dark-Road Lighted,AMHERST,REAR END,ERIE,Straight and Level,Cloudy,,Wet,62 53034026,Not Applicable,"Other Motor Vehicle, Collision With",2,245,0
1146574,2022,Property Damage Accident,0:38,01/01/2022,Saturday,Y,Dark-Road Lighted,ORCHARD PARK,OTHER,ERIE,Straight and Level,Rain,,Wet,,Not Applicable,Deer,1,38,0


In [3]:
df.shape

(772145, 20)

In [4]:
X = df[['Minute of Day', 'Lighting Conditions', 'Weather Conditions', 'Road Surface Conditions']]
y = df['Injury or Fatal Accident']

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [6]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (540501, 4)
y_train shape: (540501,)
X_test shape: (231644, 4)
y_test shape: (231644,)


# Logisitic

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

numeric_features = ['Minute of Day']
categorical_features = ['Lighting Conditions', 'Weather Conditions', 'Road Surface Conditions']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', 'passthrough', numeric_features)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])

In [8]:
model.fit(X_train, y_train)

In [9]:
y_hat = model.predict(X_test)

In [10]:
acc = np.mean(y_hat == y_test)
acc

np.float64(0.7067612370706774)

In [11]:
print('injury or fatal')
print(f'y_hat - 0 count: {sum(y_hat == 0)}, 1 count: {sum(y_hat == 1)}')
print(f'y_test - 0 count: {sum(y_test == 0)}, 1 count: {sum(y_test == 1)}')

injury or fatal
y_hat - 0 count: 231624, 1 count: 20
y_test - 0 count: 163727, 1 count: 67917


# Random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [13]:
model.fit(X_train, y_train)

In [14]:
y_hat = model.predict(X_test)

In [15]:
acc = np.mean(y_hat == y_test)
acc

np.float64(0.6971559807290497)

In [16]:
print('injury or fatal')
print(f'y_hat - 0 count: {sum(y_hat == 0)}, 1 count: {sum(y_hat == 1)}')
print(f'y_test - 0 count: {sum(y_test == 0)}, 1 count: {sum(y_test == 1)}')

injury or fatal
y_hat - 0 count: 221163, 1 count: 10481
y_test - 0 count: 163727, 1 count: 67917


In [19]:
# Calculate confusion matrix components
correct_guesses = np.sum(y_hat == y_test)
false_negatives = np.sum((y_hat == 0) & (y_test == 1))
false_positives = np.sum((y_hat == 1) & (y_test == 0))
true_negatives = np.sum((y_hat == 0) & (y_test == 0))
true_positives = np.sum((y_hat == 1) & (y_test == 1))

# Calculate total number of samples
total_samples = len(y_test)

# Calculate percentages
correct_guesses_percent = (correct_guesses / total_samples) * 100
false_negatives_percent = (false_negatives / total_samples) * 100
false_positives_percent = (false_positives / total_samples) * 100
true_negatives_percent = (true_negatives / total_samples) * 100
true_positives_percent = (true_positives / total_samples) * 100

# Print out the results with percentages
print(f"Correct guesses: {correct_guesses} ({correct_guesses_percent:.2f}%)")
print(f"False negatives: {false_negatives} ({false_negatives_percent:.2f}%)")
print(f"False positives: {false_positives} ({false_positives_percent:.2f}%)")
print(f"True negatives: {true_negatives} ({true_negatives_percent:.2f}%)")
print(f"True positives: {true_positives} ({true_positives_percent:.2f}%)")

Correct guesses: 161492 (69.72%)
False negatives: 63794 (27.54%)
False positives: 6358 (2.74%)
True negatives: 157369 (67.94%)
True positives: 4123 (1.78%)
