# Q2 — Duration & Longest-Block Hotspots (Exact Recompute)
**Source:** `violations_routes_filtered.csv`
**Scope:** M101, M60+, M15+; `is_exempt == True`

In [None]:
import pandas as pd, numpy as np
from pathlib import Path

# Location of the filtered violations extract and the only columns this
# analysis reads from it.
DATA = "/mnt/data/violations_routes_filtered.csv"
usecols = [
    "Bus Route ID",
    "Datetime",
    "First Occurrence",
    "Last Occurrence",
    "is_exempt",
    "Violation Status",
    "Violation Type",
    "Stop Name",
    "Violation Latitude",
    "Violation Longitude",
    "Vehicle ID",
]
# Load just those columns, then trim any surrounding whitespace from the
# header names so later lookups by column name are exact.
df = pd.read_csv(DATA, usecols=usecols, low_memory=True)
df.columns = df.columns.map(str.strip)

def route_tag(r):
    """Collapse a raw route identifier into a route-family tag.

    The identifier is upper-cased and stripped before matching.

    Returns:
        None for a missing value (per ``pd.isna``); ``"M101"`` for anything
        starting with M101; ``"M60+"``/``"M60"`` and ``"M15+"``/``"M15"``
        for the M60 and M15 families; otherwise the normalised identifier
        itself.
    """
    if pd.isna(r):
        return None

    label = str(r).upper().strip()

    if label.startswith("M101"):
        return "M101"

    if label.startswith("M60"):
        # NOTE(review): any hyphen marks an M60 id as the "+" variant, whereas
        # the M15 family below requires "SBS" or "+" — confirm this asymmetry
        # is intended.
        is_plus = any(marker in label for marker in ("SBS", "+", "-"))
        return "M60+" if is_plus else "M60"

    if label.startswith("M15"):
        is_plus = "SBS" in label or "+" in label
        return "M15+" if is_plus else "M15"

    return label

# Tag every row with its route family and keep only the three routes in scope.
df["route_tag"] = df["Bus Route ID"].apply(route_tag)
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (SettingWithCopyWarning).
df = df[df["route_tag"].isin(["M101","M60+","M15+"])].copy()
# Normalise the exemption flag to a real boolean. Values are compared as
# trimmed, lower-cased text, so missing values (rendered as "nan") and any
# unrecognised spelling count as False.
df["is_exempt"] = (df["is_exempt"].astype(str).str.strip().str.lower()
                   .isin(["true","1","t","yes","y"]))

# Exempt events only; work on a copy so the derived columns stay off `df`.
ex = df[df["is_exempt"]].copy()
for col in ["Datetime","First Occurrence","Last Occurrence"]:
    # Unparseable timestamps become NaT. `infer_datetime_format` is omitted:
    # it is deprecated since pandas 2.0, where format inference is the default.
    ex[col] = pd.to_datetime(ex[col], errors="coerce")
# Event duration in minutes, from first to last occurrence.
ex["duration_min"] = (ex["Last Occurrence"] - ex["First Occurrence"]).dt.total_seconds()/60.0
# A last occurrence earlier than the first gives a negative duration; blank it
# out so it is excluded from the count/mean/median statistics.
ex.loc[ex["duration_min"]<0, "duration_min"] = np.nan

def _p90(s):
    """Return the 90th percentile of the non-missing values, or NaN if none."""
    return np.nanpercentile(s.dropna(), 90) if s.notna().any() else np.nan


def _duration_stats(frame, by):
    """Summarise `duration_min` per group of `by`, rounded to 2 decimals.

    Produces one row per group with the count of non-missing durations, the
    mean, median, 90th percentile and maximum.
    """
    per_group = frame.groupby(by)["duration_min"]
    table = per_group.agg(
        n_events="count",
        avg_min="mean",
        median_min="median",
        p90_min=_p90,
        max_min="max",
    )
    return table.reset_index().round(2)


# Duration statistics per route.
dur_route = _duration_stats(ex, "route_tag")
display(dur_route)

# Duration statistics per route and violation status; show, for each route,
# the ten statuses with the highest average duration.
dur_status = _duration_stats(ex, ["route_tag", "Violation Status"])
ranked_status = dur_status.sort_values(["route_tag", "avg_min"], ascending=[True, False])
display(ranked_status.groupby("route_tag").head(10))

# Columns shown for each of the longest individual events.
cols = [
    "First Occurrence",
    "Last Occurrence",
    "duration_min",
    "Violation Status",
    "Violation Type",
    "Stop Name",
    "Violation Latitude",
    "Violation Longitude",
    "Vehicle ID",
]
# For every route in scope, list its ten longest exempt events.
for route in ("M101", "M60+", "M15+"):
    route_events = ex[ex["route_tag"] == route]
    longest = route_events.sort_values("duration_min", ascending=False)[cols].head(10)
    print(f"\n=== Longest single events — {route} ===")
    display(longest)

def duration_hotspots(d, decimals=4, min_events=10, topn=10):
    """Rank rounded-coordinate cells by average event duration.

    Args:
        d: frame with "Violation Latitude", "Violation Longitude",
            "duration_min" and "Stop Name" columns. It is not modified.
        decimals: number of decimals the coordinates are rounded to when
            forming cells.
        min_events: minimum count of non-missing durations a cell needs to
            be reported.
        topn: maximum number of cells returned.

    Returns:
        A frame with columns lat_r, lon_r, n, avg_min, median_min and
        "Stop Name" (the most frequent stop name in the cell, or None when
        the cell has no stop name), ordered by avg_min then n, descending.
    """
    cell_keys = ["lat_r", "lon_r"]

    def _most_common(names):
        # mode() ignores missing values, so it is empty when no name exists.
        modes = names.mode()
        return None if modes.empty else modes.iloc[0]

    binned = d.assign(
        lat_r=d["Violation Latitude"].round(decimals),
        lon_r=d["Violation Longitude"].round(decimals),
    )
    per_cell = binned.groupby(cell_keys)

    stats = per_cell.agg(
        n=("duration_min", "count"),
        avg_min=("duration_min", "mean"),
        median_min=("duration_min", "median"),
    ).reset_index()
    busy = stats[stats["n"] >= min_events]
    ranked = busy.sort_values(["avg_min", "n"], ascending=[False, False]).head(topn)

    # Attach a representative stop name to each surviving cell.
    stop_names = per_cell["Stop Name"].agg(_most_common).reset_index()
    return ranked.merge(stop_names, on=cell_keys, how="left")

# Show the duration hotspots (default thresholds) for each route in scope.
for route in ("M101", "M60+", "M15+"):
    hotspots = duration_hotspots(ex[ex["route_tag"] == route])
    print(f"\n=== Duration hotspots — {route} ===")
    display(hotspots)

# Write both summary tables as CSV files, creating the output folder if needed.
outdir = Path("/mnt/data/q2_notebooks/outputs")
outdir.mkdir(parents=True, exist_ok=True)
for table, filename in (
    (dur_route, "duration_by_route.csv"),
    (dur_status, "duration_by_status.csv"),
):
    table.to_csv(outdir / filename, index=False)
print("Saved outputs to", outdir)