<h1>Finding the Most Incident-Prone Areas</h1>

In [3]:
import pandas as pd
from sklearn.cluster import DBSCAN
import numpy as np

data = pd.read_csv("Traffic_Collisions_2023_and_later.csv")

# Prepare coordinates: drop rows with a missing latitude/longitude (DBSCAN
# rejects NaN input) and convert degrees to radians, which the haversine
# metric requires. The original row index is preserved by dropna().
coords_rad = np.radians(data[['LAT_WGS84', 'LONG_WGS84']].dropna())

# DBSCAN with Haversine distance
kms_per_radian = 6371.0088  # mean Earth radius in km
epsilon = 0.13 / kms_per_radian  # 0.13 km (130 m) neighbourhood radius, expressed in radians

db = DBSCAN(eps=epsilon, min_samples=30, algorithm='ball_tree', metric='haversine')

# Fit only on the rows that have valid coordinates and write the labels back
# by index. Rows without coordinates keep -1 (the label DBSCAN uses for noise),
# so any later `cluster != -1` filter excludes them as well.
data['cluster'] = -1
data.loc[coords_rad.index, 'cluster'] = db.fit_predict(coords_rad)

# Keep only collisions that DBSCAN assigned to a cluster (label -1 is noise)
# and whose NEIGHBOURHOOD_158 value is not 'NSA'.
in_a_cluster = data['cluster'].ne(-1)
not_nsa = data['NEIGHBOURHOOD_158'].ne('NSA')
filtered = data.loc[in_a_cluster & not_nsa]

# Summarise each cluster: how many collisions it holds and which distinct
# neighbourhood labels appear among them.
per_cluster = filtered.groupby('cluster')
cluster_info = per_cluster.agg(
    collision_count=pd.NamedAgg(column='NEIGHBOURHOOD_158', aggfunc='size'),
    neighbourhoods=pd.NamedAgg(
        column='NEIGHBOURHOOD_158',
        aggfunc=lambda names: names.unique(),
    ),
)
cluster_info = cluster_info.reset_index()

# Rank clusters from most to fewest collisions and keep the ten largest,
# renumbering the rows 0-9 for display.
ranked = cluster_info.sort_values(by='collision_count', ascending=False)
top_10_clusters = ranked.head(10)
top_10_clusters = top_10_clusters.reset_index(drop=True)

top_10_clusters


Unnamed: 0,cluster,collision_count,neighbourhoods
0,5,3198,"[Wellington Place (164), Kensington-Chinatown ..."
1,29,2518,"[Yonge-Bay Corridor (170), Downtown Yonge East..."
2,153,2277,"[Downtown Yonge East (168), Bay-Cloverhill (16..."
3,64,1160,"[Roncesvalles (86), Little Portugal (84), Trin..."
4,154,985,[South Parkdale (85)]
5,243,909,"[Forest Hill North (102), Humewood-Cedarvale (..."
6,96,872,"[Church-Wellesley (167), Downtown Yonge East (..."
7,111,817,"[Leaside-Bennington (56), North Toronto (173),..."
8,168,804,"[Yonge-Bay Corridor (170), Kensington-Chinatow..."
9,14,729,"[Yonge-Doris (151), Avondale (153), Lansing-We..."
