In [473]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import haversine_distances
import joblib
import folium
from folium.plugins import HeatMap
from branca.element import MacroElement, Template

import folium
from folium.plugins import HeatMap

from math import sin, cos, radians, asin, sqrt




In [474]:
# Load the hotspot table (columns: cluster, severity_score, LAT_WGS84,
# LONG_WGS84, collision_count). A relative path is used, matching the later
# read of the same file, so the notebook is not tied to Colab's /content dir.
df = pd.read_csv("vulnerable_hotspots_final.csv")
df.head()


Unnamed: 0,cluster,severity_score,LAT_WGS84,LONG_WGS84,collision_count
0,0,7145,43.655219,-79.395866,2377
1,48,317,43.678612,-79.34613,93
2,60,164,43.689342,-79.299286,48
3,14,159,43.669329,-79.467077,49
4,10,156,43.675681,-79.405045,45


In [475]:
# Print the schema summary (dtypes and non-null counts), then show the column
# index as the cell's displayed value.
df.info()
df.columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cluster          99 non-null     int64  
 1   severity_score   99 non-null     int64  
 2   LAT_WGS84        99 non-null     float64
 3   LONG_WGS84       99 non-null     float64
 4   collision_count  99 non-null     int64  
dtypes: float64(2), int64(3)
memory usage: 4.0 KB


Index(['cluster', 'severity_score', 'LAT_WGS84', 'LONG_WGS84',
       'collision_count'],
      dtype='object')

In [476]:
# Split hotspots into three equal-frequency tiers of severity_score and label
# them from the lowest tier to the highest; then show how many rows fall in each.
risk_tiers = ["Low", "Medium", "High"]
df["risk_level"] = pd.qcut(df["severity_score"], q=3, labels=risk_tiers)
df["risk_level"].value_counts()


Unnamed: 0_level_0,count
risk_level,Unnamed: 1_level_1
Low,34
High,33
Medium,32


In [477]:
# Feature matrix / target for the risk classifier.
# NOTE(review): the target `risk_level` is computed directly from
# `severity_score` (pd.qcut in the labelling cell), and `severity_score` is also
# listed as a feature here. The model can therefore recover the label from that
# single column, so held-out scores will look perfect without showing that the
# model generalises. Removing it requires updating every caller that builds a
# prediction sample with a `severity_score` column at the same time.
features = ["LAT_WGS84", "LONG_WGS84", "collision_count", "severity_score"]
X = df[features]
y = df['risk_level']

# 80/20 split, stratified on the label so each tier keeps its share in both
# parts; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [478]:
# Random-forest settings: 300 trees, fixed seed, and the "High" class weighted
# twice as heavily as the other two.
rf_settings = {
    "n_estimators": 300,
    "class_weight": {'High': 2, 'Medium': 1, 'Low': 1},
    "random_state": 42,
}
model = RandomForestClassifier(**rf_settings)

# fit() returns the estimator, which is shown as the cell output.
model.fit(X_train, y_train)


In [479]:
# Score the held-out split: per-class report first, confusion matrix second.
y_pred = model.predict(X_test)

for summary in (
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
):
    print(summary)


              precision    recall  f1-score   support

        High       1.00      1.00      1.00         7
         Low       1.00      1.00      1.00         7
      Medium       1.00      1.00      1.00         6

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

[[7 0 0]
 [0 7 0]
 [0 0 6]]


In [480]:
# Persist the fitted classifier to the working directory.
model_path = "pedestrian_risk_model.pkl"
joblib.dump(model, model_path)
print(f"Model saved as {model_path}")


Model saved as pedestrian_risk_model.pkl


In [481]:
# Reload the classifier from the same relative path it was dumped to, rather
# than a hard-coded /content/ path that only matches when the working directory
# happens to be /content.
# joblib.load unpickles the file: only load model files you produced yourself.
model_loaded = joblib.load("pedestrian_risk_model.pkl")
model_loaded


In [482]:
# Show the column names stored on the loaded model; prediction inputs are
# built with these same four columns.
model_loaded.feature_names_in_


array(['LAT_WGS84', 'LONG_WGS84', 'collision_count', 'severity_score'],
      dtype=object)

In [483]:
# Read the hotspot table under the name used by the prediction helpers and
# preview it.
vul_df = pd.read_csv("vulnerable_hotspots_final.csv")
print(f"Dataset loaded. Shape: {vul_df.shape}")
display(vul_df.head())


Dataset loaded. Shape: (99, 5)


Unnamed: 0,cluster,severity_score,LAT_WGS84,LONG_WGS84,collision_count
0,0,7145,43.655219,-79.395866,2377
1,48,317,43.678612,-79.34613,93
2,60,164,43.689342,-79.299286,48
3,14,159,43.669329,-79.467077,49
4,10,156,43.675681,-79.405045,45


In [484]:
# Load the saved classifier and report which input columns it expects.
model_loaded = joblib.load("pedestrian_risk_model.pkl")
print("Model loaded successfully!")
print(f"Model expects features: {model_loaded.feature_names_in_}")


Model loaded successfully!
Model expects features: ['LAT_WGS84' 'LONG_WGS84' 'collision_count' 'severity_score']


In [485]:
# Cell 4: Helper functions + prediction function

from math import sin, cos, radians, asin, sqrt

def haversine_distance_km(lat1, lon1, lat2, lon2):
    """
    Great-circle (haversine) distance in km between two lat/lon points.

    Parameters
    ----------
    lat1, lon1 : float
        First point, in decimal degrees.
    lat2, lon2 : float
        Second point, in decimal degrees.

    Returns
    -------
    float
        Distance in kilometres on a sphere of radius 6371 km.
    """
    R = 6371  # Earth radius in km
    lat1_r, lon1_r, lat2_r, lon2_r = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2_r - lat1_r
    dlon = lon2_r - lon1_r
    a = sin(dlat/2)**2 + cos(lat1_r) * cos(lat2_r) * sin(dlon/2)**2
    # Floating-point rounding can leave `a` a few ulps above 1 for (near-)
    # antipodal points, which would make asin() raise a domain error.
    # `a` is the first argument so that a NaN input still propagates as NaN.
    a = min(a, 1.0)
    c = 2 * asin(sqrt(a))
    return R * c


def get_nearest_hotspot(lat, lon, df):
    """
    Find the hotspot in ``df`` that is closest to (lat, lon).

    Closeness is measured by haversine distance. Comparing raw differences in
    degrees would be wrong away from the equator, where one degree of
    longitude spans fewer kilometres than one degree of latitude.

    Parameters
    ----------
    lat, lon : float
        Query point, in decimal degrees.
    df : pandas.DataFrame
        Hotspot table with ``LAT_WGS84`` and ``LONG_WGS84`` columns (degrees).

    Returns
    -------
    tuple
        ``(row, distance_km)``: the nearest row of ``df`` as a Series and its
        distance from the query point in kilometres (float).

    Raises
    ------
    ValueError
        If ``df`` has no rows (or every coordinate is NaN).
    """
    if df.empty:
        raise ValueError("df contains no hotspots")

    hotspot_lat = np.radians(df["LAT_WGS84"].to_numpy(dtype=float))
    hotspot_lon = np.radians(df["LONG_WGS84"].to_numpy(dtype=float))
    lat_r = np.radians(lat)
    lon_r = np.radians(lon)

    # Vectorised haversine (Earth radius 6371 km); clip guards arcsin against
    # rounding slightly outside [0, 1].
    a = (
        np.sin((hotspot_lat - lat_r) / 2) ** 2
        + np.cos(lat_r) * np.cos(hotspot_lat) * np.sin((hotspot_lon - lon_r) / 2) ** 2
    )
    distances_km = 2 * 6371 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

    # Positional lookup, so a non-unique index on df cannot return several rows;
    # nanargmin skips rows whose coordinates are missing.
    nearest_pos = int(np.nanargmin(distances_km))
    row = df.iloc[nearest_pos]

    return row, float(distances_km[nearest_pos])


def predict_risk(lat, lon):
    """
    Predict the pedestrian risk label for a coordinate.

    The hotspot in the module-level ``vul_df`` nearest to (lat, lon) by
    haversine distance supplies ``collision_count`` and ``severity_score``.
    Together with the query coordinate they form the one-row input passed to
    ``model_loaded``.

    Parameters
    ----------
    lat, lon : float
        Query point, in decimal degrees.

    Returns
    -------
    tuple
        ``(risk, info)``: ``risk`` is the predicted label. ``info`` is a dict
        with keys ``risk_label``, ``input_lat``, ``input_lon``,
        ``nearest_hotspot_lat``, ``nearest_hotspot_lon``, ``severity_score``,
        ``collision_count`` and ``distance_km``.

    Raises
    ------
    ValueError
        If ``vul_df`` has no rows.
    """
    if vul_df.empty:
        raise ValueError("vul_df contains no hotspots")

    # Distance to every hotspot, kept in a local array. Nothing is written
    # back to vul_df, so a call does not alter the shared DataFrame.
    distances_km = np.array(
        [
            haversine_distance_km(lat, lon, hotspot_lat, hotspot_lon)
            for hotspot_lat, hotspot_lon in zip(vul_df["LAT_WGS84"], vul_df["LONG_WGS84"])
        ],
        dtype=float,
    )

    # Closest point (rows with missing coordinates are ignored)
    nearest_pos = int(np.nanargmin(distances_km))
    nearest = vul_df.iloc[nearest_pos]

    # Build input sample for model, with columns in the order shown by
    # model_loaded.feature_names_in_
    sample = pd.DataFrame([{
        "LAT_WGS84": lat,
        "LONG_WGS84": lon,
        "collision_count": nearest["collision_count"],
        "severity_score": nearest["severity_score"]
    }])

    risk = model_loaded.predict(sample)[0]

    # Return standardized keys
    info = {
        "risk_label": risk,
        "input_lat": lat,
        "input_lon": lon,
        "nearest_hotspot_lat": float(nearest["LAT_WGS84"]),
        "nearest_hotspot_lon": float(nearest["LONG_WGS84"]),
        "severity_score": int(nearest["severity_score"]),
        "collision_count": int(nearest["collision_count"]),
        "distance_km": float(distances_km[nearest_pos])
    }

    return risk, info




In [486]:
# Demo: choose a query coordinate, run the prediction, and print a summary.

# Toronto rough coordinate range:
#   Latitude:  43.58  to  43.85
#   Longitude: -79.65 to -79.10

test_lat = 43.64  # e.g. downtown Toronto
test_lon = -79.4000 # e.g. near City Hall

risk, info = predict_risk(test_lat, test_lon)

print("🔍 Model Prediction Summary")
print(f"Input Coordinates: ({info['input_lat']}, {info['input_lon']})")
print(f"Predicted Risk Level: {info['risk_label']}")
print(f"Nearest Hotspot: ({info['nearest_hotspot_lat']}, {info['nearest_hotspot_lon']})")
print(f"Severity Score: {info['severity_score']}")
print(f"Collision Count: {info['collision_count']}")
print(f"Distance to Hotspot: {info['distance_km']:.2f} km")


🔍 Model Prediction Summary
Input Coordinates: (43.64, -79.4)
Predicted Risk Level: High
Nearest Hotspot: (43.63491981869565, -79.40896883869564)
Severity Score: 83
Collision Count: 23
Distance to Hotspot: 0.92 km


In [487]:
# Base map centered on Toronto
m = folium.Map(location=[43.70, -79.40], zoom_start=12)

# Add base heatmap from dataset, weighted by severity_score.
# NOTE(review): the weights are raw severity scores (thousands for the largest
# cluster, tens for others); confirm the heat layer scales them as intended or
# normalise them first.
HeatMap(
    list(zip(vul_df["LAT_WGS84"], vul_df["LONG_WGS84"], vul_df["severity_score"])),
    radius=8,
    blur=12,
    min_opacity=0.25
).add_to(m)

# Risk color coding
risk_colors = {
    "Low": "green",
    "Medium": "orange",
    "High": "red"
}

# Add predicted point, colored by its predicted risk label
folium.CircleMarker(
    location=[test_lat, test_lon],
    radius=10,
    color=risk_colors[risk],
    fill=True,
    fill_color=risk_colors[risk],
    fill_opacity=0.95
).add_to(m)

# Fixed-position summary panel shown on top of the map
summary_html = f"""
<div style="
position: fixed; top: 10px; left: 50px; z-index: 9999;
background-color: white; padding: 10px 14px; border: 2px solid #444;
border-radius: 6px; box-shadow: 2px 2px 6px rgba(0,0,0,0.3);
font-family: Arial; font-size: 13px;">
<h4>Pedestrian Risk Prediction</h4>

<p><b>Input coord:</b> ({info['input_lat']:.5f}, {info['input_lon']:.5f})</p>
<p><b>Predicted Risk:</b> {info['risk_label']}</p>
<p><b>Nearest Hotspot:</b> ({info['nearest_hotspot_lat']:.5f}, {info['nearest_hotspot_lon']:.5f})</p>
<p><b>Severity Score:</b> {info['severity_score']}</p>
<p><b>Collision Count:</b> {info['collision_count']}</p>
<p><b>Distance:</b> {info['distance_km']:.2f} km</p>

</div>
"""

# MacroElement only emits Jinja macros named header/html/script. A template
# with no macro renders nothing, so the panel markup is wrapped in an `html`
# macro. The wrapper is concatenated (not placed in the f-string) to keep the
# Jinja braces out of f-string formatting.
macro = MacroElement()
macro._template = Template(
    "{% macro html(this, kwargs) %}" + summary_html + "{% endmacro %}"
)
m.get_root().add_child(macro)

m.save("toronto_risk_visualization.html")
print("Saved ✔")


Saved ✔


In [488]:
# Bare expression: the folium map is displayed as this cell's output.
m