In [4]:
# -------- Accident Hotspot & Severity Prediction (Fixed) --------

import os, pandas as pd, numpy as np
from math import radians, sin, cos, asin, sqrt
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib

DATA_PATH = '/content/drive/MyDrive/AccidentsBig.csv'
OUTPUT_DIR = '/mnt/data'
SEED = 42
HOTSPOT_RADIUS_M = 1000

# ---------------- Haversine ----------------
def haversine_m(lat1, lon1, lat2, lon2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon, dlat = lon2-lon1, lat2-lat1
    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return 6371*1000*c

# ---------------- Load & Clean ----------------
def load_and_clean(path):
    df = pd.read_csv(path, low_memory=False)
    lat_col = next((c for c in df.columns if 'lat' in c.lower()), None)
    lon_col = next((c for c in df.columns if 'lon' in c.lower() or 'long' in c.lower()), None)
    if lat_col is None or lon_col is None:
        raise ValueError("Cannot detect lat/lon columns")
    df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
    df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
    df = df.dropna(subset=[lat_col, lon_col])
    df = df.rename(columns={lat_col:'latitude', lon_col:'longitude'})
    # Severity
    sev_col = next((c for c in df.columns if 'severity' in c.lower() or 'injury' in c.lower() or 'fatal' in c.lower()), None)
    if sev_col: df = df.rename(columns={sev_col:'accident_severity'})
    return df

# ---------------- Hotspot Detection ----------------
def detect_hotspots(df):
    coords = df[['latitude','longitude']].to_numpy()
    coords_rad = np.radians(coords)
    db = DBSCAN(eps=1/6371, min_samples=30, metric='haversine')
    df['cluster'] = db.fit_predict(coords_rad)
    clusters = []
    for lab in sorted(set(df['cluster'])):
        if lab==-1: continue
        mask = df['cluster']==lab
        subset = df.loc[mask,['latitude','longitude']]
        clusters.append({'cluster':int(lab),
                         'centroid_lat':subset['latitude'].mean(),
                         'centroid_lon':subset['longitude'].mean(),
                         'size':int(mask.sum())})
    return df, pd.DataFrame(clusters).sort_values('size', ascending=False)

def annotate_hotspot(df, centroids):
    if centroids.empty:
        df['within_5km_hotspot'] = 0
        return df
    def check(r):
        for _, row in centroids.iterrows():
            if haversine_m(r.latitude, r.longitude, row.centroid_lat, row.centroid_lon) <= HOTSPOT_RADIUS_M:
                return 1
        return 0
    df['within_1km_hotspot'] = df.apply(check, axis=1)
    return df

# ---------------- Feature Engineering ----------------
def feature_engineering(df):
    df['hour'] = 0
    df['day_of_week'] = -1
    for col in df.columns:
        if 'time' in col.lower():
            try: df['hour'] = pd.to_datetime(df[col], format='%H:%M').dt.hour
            except: df['hour'] = df[col].astype(str).str[:2].astype(int)
        if 'date' in col.lower():
            try: df['day_of_week'] = pd.to_datetime(df[col]).dt.dayofweek
            except: df['day_of_week']=-1
    candidate_features = ['Weather_Conditions','Light_Conditions','Road_Surface_Conditions','Road_Type',
                          'Urban_or_Rural_Area','Carriageway_Hazards','Special_Conditions_at_Site',
                          'Speed_limit','Number_of_Vehicles','Number_of_Casualties',
                          'latitude','longitude','hour','day_of_week','within_5km_hotspot']
    features = [f for f in candidate_features if f in df.columns]

    # Create risk based on severity or casualties if available
    if 'accident_severity' in df.columns:
        df['sev_class'] = df['accident_severity'].astype('category').cat.codes
        # Use severity as proxy for risk (higher severity = higher risk)
        df['risk'] = df['sev_class']
    elif 'Number_of_Casualties' in df.columns:
        # Use casualties as proxy for risk
        df['risk'] = (df['Number_of_Casualties'] > df['Number_of_Casualties'].median()).astype(int)
    else:
        df['risk'] = 1

    return df, features

# ---------------- Build Models ----------------
def build_models(df, features):
    X = df[features].copy()
    numeric_cols = X.select_dtypes(include=['int','float']).columns.tolist()
    categorical_cols = [c for c in features if c not in numeric_cols]
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])
    risk_pipe = Pipeline([('pre', preprocessor), ('clf', RandomForestClassifier(n_estimators=200, random_state=SEED))])
    severity_pipe = None
    if 'sev_class' in df.columns:
        severity_pipe = Pipeline([('pre', preprocessor), ('clf', RandomForestClassifier(n_estimators=200, random_state=SEED))])
    return risk_pipe, severity_pipe, numeric_cols, categorical_cols

# ---------------- Nearest Hotspot ----------------
def nearest_hotspot(lat, lon, centroids):
    if centroids.empty: return {'nearest_cluster': None,'distance_m': None,'within_5km': False,'cluster_size':0}
    centroids['dist_m'] = centroids.apply(lambda r: haversine_m(lat, lon, r.centroid_lat, r.centroid_lon), axis=1)
    nearest = centroids.sort_values('dist_m').iloc[0]
    return {'nearest_cluster': int(nearest.cluster),
            'distance_m': float(nearest.dist_m),
            'within_5km': bool(nearest.dist_m<=HOTSPOT_RADIUS_M),
            'cluster_size': int(nearest.size)}





In [5]:


# ---------------- Prediction ----------------
def predict_vehicle(lat, lon, features, numeric_cols, categorical_cols, risk_pipe=None, severity_pipe=None, centroids=None, **kwargs):
    entry = {}
    for f in features:
        entry[f] = kwargs.get(f, None)
    entry['latitude'] = lat
    entry['longitude'] = lon
    info = nearest_hotspot(lat, lon, centroids)
    entry['within_5km_hotspot'] = 5 if info['within_5km'] else 0
    df = pd.DataFrame([entry])

    for c in df.columns:
        if c in numeric_cols:
            df[c] = pd.to_numeric(df[c], errors='coerce').fillna(-999)
        elif c in categorical_cols:
            df[c] = df[c].astype(str).replace('None', 'Unknown').fillna('Unknown')

    out = {'nearest_hotspot': info}
    if risk_pipe: out.update({'risk_label': int(risk_pipe.predict(df[features])[0])})
    if severity_pipe: out.update({'severity_label': int(severity_pipe.predict(df[features])[0])})
    return out

def predict_vehicle_entry(lat, lon, features, numeric_cols, categorical_cols, risk_pipe=None,
                         severity_pipe=None, centroids_df=None, weather=None, light=None,
                         surface=None, road=None, urban=None, hazards=None, special=None,
                         speed=None, vehicles=None, casualties=None, hour=None, day=None):
    kwargs = {
        'Weather_Conditions': weather,
        'Light_Conditions': light,
        'Road_Surface_Conditions': surface,
        'Road_Type': road,
        'Urban_or_Rural_Area': urban,
        'Carriageway_Hazards': hazards,
        'Special_Conditions_at_Site': special,
        'Speed_limit': speed,
        'Number_of_Vehicles': vehicles,
        'Number_of_Casualties': casualties,
        'hour': hour,
        'day_of_week': day
    }
    return predict_vehicle(lat, lon, features, numeric_cols, categorical_cols, risk_pipe, severity_pipe, centroids_df, **kwargs)

# ---------------- New Test Data ----------------
test_entries = [
    {'latitude': 28.6139, 'longitude': 77.2090, 'weather': 'Raining', 'light': 'Daylight', 'surface': 'Wet', 'road': 'Dual carriageway', 'urban': 'Urban', 'hazards': 'None', 'special': 'None', 'speed': 50, 'vehicles': 2, 'casualties': 0, 'hour': 9, 'day': 2},
    {'latitude': 28.7041, 'longitude': 77.1025, 'weather': 'Clear', 'light': 'Night', 'surface': 'Dry', 'road': 'Single carriageway', 'urban': 'Urban', 'hazards': 'Oil spill', 'special': 'Roadworks', 'speed': 60, 'vehicles': 1, 'casualties': 0, 'hour': 22, 'day': 5},
    {'latitude': 28.5355, 'longitude': 77.3910, 'weather': 'Fog', 'light': 'Darkness - lights lit', 'surface': 'Wet', 'road': 'Roundabout', 'urban': 'Urban', 'hazards': 'Pedestrian', 'special': 'Roadworks', 'speed': 40, 'vehicles': 3, 'casualties': 0, 'hour': 6, 'day': 1},
    {'latitude': 19.0760, 'longitude': 72.8777, 'weather': 'Heavy rain', 'light': 'Daylight', 'surface': 'Wet', 'road': 'Motorway', 'urban': 'Urban', 'hazards': 'Standing water', 'special': 'None', 'speed': 80, 'vehicles': 4, 'casualties': 1, 'hour': 17, 'day': 4},
    {'latitude': 12.9716, 'longitude': 77.5946, 'weather': 'Clear', 'light': 'Darkness - no lighting', 'surface': 'Dry', 'road': 'One way street', 'urban': 'Urban', 'hazards': 'Animal', 'special': 'None', 'speed': 50, 'vehicles': 2, 'casualties': 0, 'hour': 2, 'day': 0},
    {'latitude': 13.0827, 'longitude': 80.2707, 'weather': 'Windy', 'light': 'Daylight', 'surface': 'Dry', 'road': 'Dual carriageway', 'urban': 'Urban', 'hazards': 'Debris', 'special': 'None', 'speed': 70, 'vehicles': 3, 'casualties': 2, 'hour': 14, 'day': 3},
    {'latitude': 22.5726, 'longitude': 88.3639, 'weather': 'Raining', 'light': 'Darkness - lights lit', 'surface': 'Wet', 'road': 'Single carriageway', 'urban': 'Urban', 'hazards': 'Pothole', 'special': 'Roadworks', 'speed': 40, 'vehicles': 2, 'casualties': 1, 'hour': 20, 'day': 6},
    {'latitude': 23.0225, 'longitude': 72.5714, 'weather': 'Clear', 'light': 'Daylight', 'surface': 'Dry', 'road': 'Roundabout', 'urban': 'Urban', 'hazards': 'None', 'special': 'None', 'speed': 30, 'vehicles': 2, 'casualties': 0, 'hour': 11, 'day': 2},
    {'latitude': 26.9124, 'longitude': 75.7873, 'weather': 'Dust storm', 'light': 'Daylight', 'surface': 'Dry', 'road': 'Dual carriageway', 'urban': 'Urban', 'hazards': 'Poor visibility', 'special': 'None', 'speed': 60, 'vehicles': 3, 'casualties': 2, 'hour': 15, 'day': 1},
    {'latitude': 17.3850, 'longitude': 78.4867, 'weather': 'Clear', 'light': 'Darkness - lights lit', 'surface': 'Dry', 'road': 'Slip road', 'urban': 'Urban', 'hazards': 'None', 'special': 'None', 'speed': 50, 'vehicles': 1, 'casualties': 0, 'hour': 21, 'day': 5},
]

def run_predictions_on_new_data(features, numeric_cols, categorical_cols, risk_pipe, severity_pipe, centroids):
    print("\n" + "="*80)
    print("PREDICTIONS ON NEW DATA")
    print("="*80 + "\n")

    for i, entry in enumerate(test_entries, start=1):
        pred = predict_vehicle_entry(
            lat=entry['latitude'], lon=entry['longitude'], weather=entry['weather'],
            light=entry['light'], surface=entry['surface'], road=entry['road'],
            urban=entry['urban'], hazards=entry['hazards'], special=entry['special'],
            speed=entry['speed'], vehicles=entry['vehicles'], casualties=entry['casualties'],
            hour=entry['hour'], day=entry['day'], features=features,
            numeric_cols=numeric_cols, categorical_cols=categorical_cols,
            risk_pipe=risk_pipe, severity_pipe=severity_pipe, centroids_df=centroids
        )

        print(f"Entry {i}: ({entry['latitude']:.4f}, {entry['longitude']:.4f})")
        print(f"  {entry['weather']}, {entry['light']}, {entry['road']}")
        print(f"  Distance to hotspot: {pred['nearest_hotspot']['distance_m']:.0f}m, Within 5km: {pred['nearest_hotspot']['within_5km']}")
        if 'risk_label' in pred: print(f"  Risk: {pred['risk_label']}", end="")
        if 'severity_label' in pred: print(f", Severity: {pred['severity_label']}")
        else: print()
        print()

# ---------------- Main ----------------
def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = load_and_clean(DATA_PATH)
    df, centroids = detect_hotspots(df)
    df = annotate_hotspot(df, centroids)
    df, features = feature_engineering(df)
    risk_pipe, severity_pipe, numeric_cols, categorical_cols = build_models(df, features)

    X_data = df[features].copy()
    for c in X_data.columns:
        if c in numeric_cols:
            X_data[c] = pd.to_numeric(X_data[c], errors='coerce').fillna(-999)
        elif c in categorical_cols:
            X_data[c] = X_data[c].astype(str).fillna('Unknown')

    if df['risk'].nunique() > 1:
        X_train, X_test, y_train, y_test = train_test_split(X_data, df['risk'], test_size=0.2, random_state=SEED)
        risk_pipe.fit(X_train, y_train)
        print(f"Risk model trained. Accuracy: {risk_pipe.score(X_test, y_test):.3f}")
    else:
        print("Skipping risk model")
        risk_pipe = None

    if 'sev_class' in df.columns and df['sev_class'].nunique() > 1:
        X_train, X_test, y_train, y_test = train_test_split(X_data, df['sev_class'], test_size=0.2, random_state=SEED)
        severity_pipe.fit(X_train, y_train)
        print(f"Severity model trained. Accuracy: {severity_pipe.score(X_test, y_test):.3f}")
    else:
        print("Skipping severity model")
        severity_pipe = None

    if risk_pipe: joblib.dump(risk_pipe, os.path.join(OUTPUT_DIR,'risk_model.pkl'))
    if severity_pipe: joblib.dump(severity_pipe, os.path.join(OUTPUT_DIR,'severity_model.pkl'))
    joblib.dump({'features': features, 'numeric_cols': numeric_cols, 'categorical_cols': categorical_cols, 'centroids': centroids},
                os.path.join(OUTPUT_DIR,'metadata.pkl'))

    # RUN PREDICTIONS ON NEW DATA
    run_predictions_on_new_data(features, numeric_cols, categorical_cols, risk_pipe, severity_pipe, centroids)

if __name__=="__main__":
    main()

Risk model trained. Accuracy: 0.859
Severity model trained. Accuracy: 0.859

PREDICTIONS ON NEW DATA

Entry 1: (28.6139, 77.2090)
  Raining, Daylight, Dual carriageway
  Distance to hotspot: 1922m, Within 5km: False
  Risk: 2, Severity: 2

Entry 2: (28.7041, 77.1025)
  Clear, Night, Single carriageway
  Distance to hotspot: 13741m, Within 5km: False
  Risk: 1, Severity: 1

Entry 3: (28.5355, 77.3910)
  Fog, Darkness - lights lit, Roundabout
  Distance to hotspot: 20122m, Within 5km: False
  Risk: 2, Severity: 2

Entry 4: (19.0760, 72.8777)
  Heavy rain, Daylight, Motorway
  Distance to hotspot: 6269m, Within 5km: False
  Risk: 2, Severity: 2

Entry 5: (12.9716, 77.5946)
  Clear, Darkness - no lighting, One way street
  Distance to hotspot: 3930m, Within 5km: False
  Risk: 2, Severity: 2

Entry 6: (13.0827, 80.2707)
  Windy, Daylight, Dual carriageway
  Distance to hotspot: 250798m, Within 5km: False
  Risk: 2, Severity: 2

Entry 7: (22.5726, 88.3639)
  Raining, Darkness - lights lit, S