Vehicular Crash Data Investigation From Maryland DoT

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import time
from math import cos, pi


# Location of the raw crash records, relative to the notebook's working directory.
FILE = "../data/crashes.csv"

# Read the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=FILE)

### Approach 1: Assign increasing numbers to reflect danger level in ordinal categorical variables

In [None]:
# Keep only the columns used in the analysis; rows missing any of them are discarded.
df1 = df[["Weather", "Surface Condition", "Vehicle Damage Extent", "Speed Limit", "Crash Date/Time"]].dropna()

# Lowercase the categorical text so it can be compared against the lowercase
# labels used below. Non-string values are passed through untouched.
for col in ["Weather", "Surface Condition", "Vehicle Damage Extent"]:
    df1[col] = df1[col].map(lambda x: x.lower() if isinstance(x, str) else x)

# Remove rows where any categorical column is "unknown" or "other".
str_to_drop = ["unknown", "other"]
mask = ~df1[["Weather", "Surface Condition", "Vehicle Damage Extent"]].isin(str_to_drop).any(axis=1)
# .copy() makes df1 an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
df1 = df1[mask].copy()

In [None]:
# Define mappings for danger levels
# Weather labels listed from least to most dangerous; a label's position in the
# list is its numeric danger level (0 = "clear" ... 14 = "sleet or hail").
# NOTE(review): "rain"/"raining", "foggy"/"fog, smog, smoke" and
# "sleet"/"sleet or hail" look like alternative spellings of the same condition
# yet receive different levels -- confirm this is intended.
weather_danger_levels = {
    condition: level
    for level, condition in enumerate(
        [
            "clear",
            "cloudy",
            "rain",
            "raining",
            "snow",
            "foggy",
            "wintry mix",
            "sleet",
            "severe winds",
            "blowing snow",
            "fog, smog, smoke",
            "blowing sand, soil, dirt",
            "freezing rain or freezing drizzle",
            "severe crosswinds",
            "sleet or hail",
        ]
    )
}
# Road-surface labels listed from least to most dangerous; a label's position in
# the list is its numeric danger level (0 = "dry" ... 10).
# NOTE(review): "ice"/"ice/frost" and the two "water ..." spellings look like the
# same condition recorded two ways but receive different levels -- confirm.
surface_danger_levels = {
    surface: level
    for level, surface in enumerate(
        [
            "dry",
            "wet",
            "snow",
            "ice",
            "slush",
            "ice/frost",
            "mud, dirt, gravel",
            "water(standing/moving)",
            "oil",
            "sand",
            "water (standing, moving)",
        ]
    )
}
# Vehicle-damage labels listed from least to most severe; a label's position in
# the list is its numeric level (0 = "vehicle not at scene" ... 5 = "destroyed").
vehicle_damage_levels = {
    extent: level
    for level, extent in enumerate(
        [
            "vehicle not at scene",
            "no damage",
            "superficial",
            "functional",
            "disabling",
            "destroyed",
        ]
    )
}

In [None]:
# Translate each categorical value into its ordinal level via the lookup tables.
# Series.map yields NaN for any label that is not a key of the table.
df1["Weather Danger Level"] = df1["Weather"].map(weather_danger_levels)
df1["Surface Danger Level"] = df1["Surface Condition"].map(surface_danger_levels)
df1["Vehicle Damage Level"] = df1["Vehicle Damage Extent"].map(vehicle_damage_levels)

# Parse the timestamp; unparseable values become NaT instead of raising.
df1["datetime"] = pd.to_datetime(df1["Crash Date/Time"], errors="coerce")
df1["Month"] = df1["datetime"].dt.month
# Whole hour only; the fractional-minute term below is left disabled.
df1["Hour"] = df1["datetime"].dt.hour
# + (df1["datetime"].dt.minute / 60)

# Drop rows that cannot be used downstream: invalid timestamps, and labels that
# had no entry in the lookup tables (their level is NaN, which the estimators
# fitted later cannot accept).
df1 = df1.dropna(
    subset=["datetime", "Weather Danger Level", "Surface Danger Level", "Vehicle Damage Level"]
)

# Normalize and prepare features
def normalize(data):
    """Min-max scale *data* so its values span the interval [0, 1].

    Args:
        data: A pandas Series (scaled as a whole) or DataFrame (each column
            scaled independently).

    Returns:
        An object of the same shape as *data* holding
        ``(data - min) / (max - min)``. When a Series/column is constant
        (``max == min``) the result is all zeros instead of the NaN that a
        division by zero would produce.
    """
    lowest = data.min()
    span = data.max() - lowest
    # Replace a zero range with 1 so that constant input maps to 0 rather than NaN.
    if np.ndim(span) == 0:
        divisor = span if span != 0 else 1
    else:
        divisor = span.where(span != 0, 1)
    return (data - lowest) / divisor

# Model inputs: the two ordinal danger levels plus speed limit, month and hour.
features = ["Weather Danger Level", "Surface Danger Level", "Speed Limit", "Month", "Hour"]

# Build the feature table in one step, scaling every column to [0, 1] and
# keeping df1's row index so rows stay aligned with the target column.
df_features = pd.DataFrame(
    {name: normalize(df1[name]) for name in features},
    index=df1.index,
)
df_features

### Feature Distributions (Swarm Plot) and Correlation Heatmap

In [None]:
# A swarm plot draws every point, so only a small random subset of rows is shown.
sample_num = 50
# Sample whole rows once (capped at the number of rows available) with a fixed
# seed so the saved figure is reproducible between runs.
df_sample = df_features.sample(
    n=min(sample_num, len(df_features)),
    random_state=42,
).reset_index(drop=True)

sns.swarmplot(data=df_sample, palette=["#3276B2"])
plt.title("Swarm plot for features")
plt.grid()
plt.xticks(rotation=90)
plt.savefig("swarm_plot.png", bbox_inches="tight")
plt.show()


In [None]:
def get_heatmap(data):
    """Plot, save and display the pairwise correlation matrix of *data*.

    The annotated heatmap is written to ``correlation_heatmap.png`` in the
    current working directory and then shown.
    """
    correlations = data.corr()
    axes = sns.heatmap(correlations, cmap="YlGnBu", annot=True, cbar=False)
    axes.set_title("Correlation Heatmap between Variables")
    plt.savefig("correlation_heatmap.png", bbox_inches="tight")
    plt.show()


get_heatmap(data=df_features)

In [None]:
def output_to_file(data, filename):
    """Serialize *data* as JSON into *filename*, overwriting any existing file.

    Args:
        data: Any object ``json.dump`` can serialize.
        filename: Path of the file to write.
    """
    # Imported here because the notebook's import cell does not bring in json.
    import json

    with open(filename, "w") as file:
        json.dump(data, file)

### Hyper-parameter Tuning

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df1["Vehicle Damage Level"],
    test_size=0.2,
    random_state=42,
)
def fit_KNN(start, k_value, X, y, cv=5):
    """Sweep k for K-Nearest Neighbours and record cross-validated accuracy.

    For every k in ``start .. k_value`` (both inclusive) a
    ``KNeighborsClassifier`` is cross-validated on ``X``/``y`` and the mean
    test accuracy over the folds is recorded.

    Args:
        start: First number of neighbours to try.
        k_value: Last number of neighbours to try (inclusive).
        X: Feature matrix.
        y: Target labels.
        cv: Number of cross-validation folds (default 5).

    Returns:
        A tuple ``(scores, algo_time)`` where ``scores`` is a list of
        ``(k, mean_accuracy)`` tuples and ``algo_time`` is the elapsed time of
        the whole sweep in seconds. ``scores`` is empty when
        ``start > k_value``.
    """
    scores = []
    # perf_counter is a monotonic clock, so the measured duration is not
    # affected by system clock adjustments.
    start_time = time.perf_counter()
    for k in range(start, k_value + 1):
        knn = KNeighborsClassifier(n_neighbors=k)
        result = cross_validate(knn, X, y, scoring="accuracy", cv=cv)
        # Store a plain Python float rather than a NumPy scalar.
        scores.append((k, float(np.mean(result["test_score"]))))
    algo_time = time.perf_counter() - start_time
    return scores, algo_time
# Approach 1 sweep: evaluate every k from 3 to 100 on the training split.
# The result is the (scores, elapsed_seconds) pair returned by fit_KNN.
scores = fit_KNN(start=3, k_value=100, X=X_train, y=y_train)
scores

0.40577298233005477, 0.40487748399025214

In [None]:
# Persist the approach 1 sweep results to disk.
output_to_file(data=scores, filename="approach_1_score")

#### visualization

In [None]:
def plot_score_map(score, plot_name):
    """Draw accuracy against k as a line chart, save it, and display it.

    ``score[0]`` must hold the ``(k, accuracy)`` rows, i.e. ``score`` is the
    ``(scores, elapsed)`` pair returned by ``fit_KNN``. The figure is saved as
    ``<plot_name>_<YYYYmmdd_HHMMSS>`` so repeated runs do not overwrite each
    other, and *plot_name* is also used as the chart title.
    """
    accuracy_by_k = pd.DataFrame(data=score[0], columns=["k_value", "Accuracy"])
    sns.lineplot(data=accuracy_by_k, x="k_value", y="Accuracy")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    plt.title(plot_name)
    plt.grid()
    plt.savefig("_".join((plot_name, stamp)))
    plt.show()
# plot_score_map(scores, "KNN Hyper-parameter Tuning")

### Approach 2: Use one-hot encoding for variables, and then reduce dimensions by PCA

In [None]:
# Work on a copy so approach 1's frame is left untouched.
df2 = df1.copy()
# Treat the numeric speed limit, month and hour as category labels.
for col in ("Speed Limit", "Month", "Hour"):
    df2[col] = df2[col].astype(str)

# One-hot encode every feature. `columns=features` is required: by default
# get_dummies only encodes object/string/category columns, which would leave the
# numeric danger-level columns as unscaled ordinals in the PCA input.
df2_dummy = pd.get_dummies(data=df2[features], columns=features, dtype=int)
df2_dummy

In [None]:
# Reduce the dimensionality of the encoded data: project the dummy-variable
# matrix onto its first 10 principal components.
# NOTE(review): no random_state is set; depending on the SVD solver scikit-learn
# selects, results may vary slightly between runs -- confirm whether that matters.
pca = PCA(n_components=10)
components = pca.fit_transform(df2_dummy.to_numpy())
# Bare expression so the notebook displays the projected array.
components

In [None]:
# Elbow chart: fraction of the total variance explained by each principal
# component, numbered from 1.
explained_ratio = pca.explained_variance_ratio_
component_numbers = np.arange(1, len(explained_ratio) + 1)

plt.plot(component_numbers, explained_ratio, marker=".")
plt.xlabel("Number of Components")
plt.ylabel("Variance Explained")
plt.title("Elbow Plot")
plt.grid()
plt.savefig("pca_elbow_plot.png")
plt.show()


In [None]:
# Same 80/20 split and seed as approach 1, applied to the PCA components.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    components,
    df1["Vehicle Damage Level"],
    test_size=0.2,
    random_state=42,
)
# Run the same k = 3..100 sweep as approach 1. The assignment must execute,
# otherwise `approach_2_score` is undefined on a fresh kernel and this cell and
# the ones that save/plot it fail with NameError.
approach_2_score = fit_KNN(3, 100, X_train_2, y_train_2)
approach_2_score

In [None]:
# Persist the approach 2 sweep results to disk.
output_to_file(data=approach_2_score, filename="approach_2_score")

#### visualization

In [None]:
# Accuracy-versus-k curve for the one-hot + PCA pipeline.
plot_score_map(score=approach_2_score, plot_name="Approach 2 KNN Hyper-parameter Tuning")