# K-Nearest Neighbors by Samuel Shull

In [1]:
import pandas as pd

# Load the export exactly as it is stored on disk and preview the top rows.
# The preview is what reveals the file layout: the rows at positions 0 and 1
# hold label text ("Measures", "Road Surface", ...) rather than observations.
df = pd.read_csv(filepath_or_buffer="../BC Vehicle Accidents 2021.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Year,2021,2021.1
0,,,,Measures,Number of injured,Number of fatalities
1,Road Surface,Collision Sev,Weather,Speed limit,,
2,"Dry, normal",Fatal collisions,Clear and sunny,Less than 40 km per hour,2,7
3,"Dry, normal",Fatal collisions,Clear and sunny,40 km per hour,7,29
4,"Dry, normal",Fatal collisions,Clear and sunny,50 km per hour,89,209


In [2]:
import pandas as pd

# Re-read the raw export so this cell always starts from the untouched file and
# gives the same result no matter how often (or in which order) it is re-run.
df = pd.read_csv("../BC Vehicle Accidents 2021.csv")

# Rows that are empty in every column carry no information.
df = df.dropna(how='all')

# Rows labelled 0 and 1 are label rows left over from the export ("Measures ..."
# and "Road Surface / Collision Sev / Weather / Speed limit"), not observations.
df = df.drop([0, 1])

# The label rows only name four of the six columns, so the final names are
# assigned explicitly instead of being derived from the file.
df.columns = ['road_surface', 'collision_sev', 'weather', 'speed_limit', 'number_of_injured', 'number_of_fatalities']

# The two count columns were read as text because they shared a column with the
# label row above; convert them to real numbers so later arithmetic and models
# do not depend on implicit string-to-number conversion. Raises ValueError if a
# value is not numeric, which surfaces a malformed export immediately.
countColumns = ['number_of_injured', 'number_of_fatalities']
df[countColumns] = df[countColumns].apply(pd.to_numeric)

df = df.reset_index(drop=True)

df.head()

Unnamed: 0,road_surface,collision_sev,weather,speed_limit,number_of_injured,number_of_fatalities
0,"Dry, normal",Fatal collisions,Clear and sunny,Less than 40 km per hour,2,7
1,"Dry, normal",Fatal collisions,Clear and sunny,40 km per hour,7,29
2,"Dry, normal",Fatal collisions,Clear and sunny,50 km per hour,89,209
3,"Dry, normal",Fatal collisions,Clear and sunny,60 km per hour,53,104
4,"Dry, normal",Fatal collisions,Clear and sunny,80 km per hour,174,224


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Map the categorical values into numerical values, prior to training the model.
# The codes are written to a copy so `df` stays exactly as the cleaning cell left it:
# this cell can then be re-run safely, and every encoder is always fitted on the original
# text labels (so its classes_ / inverse_transform still translate codes back to labels).
encodedDf = df.copy()
roadSurfaceMapping = LabelEncoder()
weatherMapping = LabelEncoder()
speedLimitMapping = LabelEncoder()
encodedDf['road_surface'] = roadSurfaceMapping.fit_transform(df['road_surface'])
encodedDf['weather'] = weatherMapping.fit_transform(df['weather'])
encodedDf['speed_limit'] = speedLimitMapping.fit_transform(df['speed_limit'])
# NOTE(review): LabelEncoder numbers categories in sorted label order, so the speed limit
# codes presumably follow text order rather than increasing speed - confirm this is acceptable.

# Define the features (x) and target (y)
# NOTE(review): number_of_fatalities may be closely tied to collision_sev by definition;
# verify that using it as a feature is not label leakage.
allXValues = encodedDf[['road_surface', 'weather', 'speed_limit', 'number_of_injured', 'number_of_fatalities']]
allYValues = encodedDf['collision_sev']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
xTrainData, xTestData, yTrainData, yTestData = train_test_split(allXValues, allYValues, test_size=0.2, random_state=42)

# Train the KNN model, make predictions, then evaluate its performance. After adjusting K, the best performance was with K=6.
# Based on the five features, we are able to predict the collision severity (Fatal or Injury) with 90% accuracy.
# NOTE(review): the features are not rescaled, so the raw count columns likely dominate the
# neighbor distances over the small category codes - consider scaling before fitting.
# NOTE(review): if K was chosen by comparing scores on this same test split, the reported
# accuracy is optimistic - confirm, or tune K with cross-validation on the training data.
knnModel = KNeighborsClassifier(n_neighbors=6)
knnModel.fit(xTrainData, yTrainData)
yPredictedData = knnModel.predict(xTestData)
print("Accuracy:", accuracy_score(yTestData, yPredictedData))
print("\nClassification Report:")
print(classification_report(yTestData, yPredictedData))
print("\nConfusion Matrix:")
print(confusion_matrix(yTestData, yPredictedData))

Accuracy: 0.9

Classification Report:
                   precision    recall  f1-score   support

 Fatal collisions       0.81      0.96      0.88        23
Injury collisions       0.97      0.86      0.91        37

         accuracy                           0.90        60
        macro avg       0.89      0.91      0.90        60
     weighted avg       0.91      0.90      0.90        60


Confusion Matrix:
[[22  1]
 [ 5 32]]
