## Logistic Regression by Nicole Zino

In [11]:
import pandas as pd

# Load the export as-is and preview it. No cleaning happens here; the point is
# to see how pandas parses the file before deciding how to tidy it.
raw_csv_path = "../BC Vehicle Accidents 2021.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Year,2021,2021.1
0,,,,Measures,Number of injured,Number of fatalities
1,Road Surface,Collision Sev,Weather,Speed limit,,
2,"Dry, normal",Fatal collisions,Clear and sunny,Less than 40 km per hour,2,7
3,"Dry, normal",Fatal collisions,Clear and sunny,40 km per hour,7,29
4,"Dry, normal",Fatal collisions,Clear and sunny,50 km per hour,89,209


In [14]:
import pandas as pd

# Tidy the raw export into one row per observation with consistent column names.
RAW_CSV = "../BC Vehicle Accidents 2021.csv"
COLUMN_NAMES = [
    'road_surface', 'collision_sev', 'weather', 'speed_limit',
    'number_of_injured', 'number_of_fatalities',
]
COUNT_COLS = ['number_of_injured', 'number_of_fatalities']

raw = pd.read_csv(RAW_CSV)

# The first two parsed rows are leftover header fragments, not observations.
# Slice them off by position *before* dropping empty rows so the two steps
# cannot interfere with each other, then renumber the index from 0.
df = (
    raw.iloc[2:]
    .dropna(how='all')
    .reset_index(drop=True)
)

# Assign the final names directly. Deriving them from the header row first is
# unnecessary because they would be overwritten here anyway. pandas raises a
# ValueError if the file does not have exactly len(COLUMN_NAMES) columns.
df.columns = COLUMN_NAMES

# The count columns shared their CSV columns with header text, so pandas read
# them as strings. Convert them to real numbers; to_numeric raises on any value
# that is not numeric, which surfaces bad data here instead of at model time.
df[COUNT_COLS] = df[COUNT_COLS].apply(pd.to_numeric)

# Preview the cleaned frame to confirm everything looks correct
df.head()

Unnamed: 0,road_surface,collision_sev,weather,speed_limit,number_of_injured,number_of_fatalities
0,"Dry, normal",Fatal collisions,Clear and sunny,Less than 40 km per hour,2,7
1,"Dry, normal",Fatal collisions,Clear and sunny,40 km per hour,7,29
2,"Dry, normal",Fatal collisions,Clear and sunny,50 km per hour,89,209
3,"Dry, normal",Fatal collisions,Clear and sunny,60 km per hour,53,104
4,"Dry, normal",Fatal collisions,Clear and sunny,80 km per hour,174,224


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Encode the categorical feature columns as integer codes.
# The encoding is written to a copy so `df` keeps its readable labels and this
# cell gives the same result every time it is re-run.
label_cols = ['road_surface', 'weather', 'speed_limit']
model_df = df.copy()
encoders = {}  # one fitted encoder per column, so codes can be mapped back to labels
for col in label_cols:
    encoder = LabelEncoder()
    model_df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

# NOTE(review): LabelEncoder assigns codes in sorted label order, so the model
# treats these nominal categories as if they were ordered numbers. One-hot
# encoding may be more appropriate - confirm before relying on coefficients.

# Define features (X) and target (y)
# NOTE(review): number_of_fatalities is presumably closely tied to
# collision_sev; verify it is not leaking the target into the features.
feature_cols = ['road_surface', 'weather', 'speed_limit', 'number_of_injured', 'number_of_fatalities']
X = model_df[feature_cols]
y = df['collision_sev']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the classifier (max_iter is raised above the library default)
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9166666666666666

Classification Report:
                    precision    recall  f1-score   support

 Fatal collisions       0.85      0.96      0.90        23
Injury collisions       0.97      0.89      0.93        37

         accuracy                           0.92        60
        macro avg       0.91      0.92      0.91        60
     weighted avg       0.92      0.92      0.92        60


Confusion Matrix:
 [[22  1]
 [ 4 33]]
