<a href="https://colab.research.google.com/github/DDDS18-GTFS/ddds.18.capstone/blob/dev.Andrew/GTFS_ABQ_no_capture_cleaned_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Load the Libraries

In [None]:
#Required Libraries
import pandas as pd
import numpy as np

from datetime import datetime
import zipfile

from shapely.geometry import Point
from geopy.distance import geodesic
from shapely.geometry import LineString

# from folium import Map, FeatureGroup, CircleMarker, PolyLine, Marker, Icon, LayerControl
from matplotlib import colors as mcolors
import matplotlib.pyplot as plt
import folium


In [None]:
# Thresholds used by the jump / disappearance labelling in the "Plot in Folium"
# section. Those cells reference these names directly, so they have to be
# defined (not just sketched in comments) before that section runs.
# ---------------------
# Threshold parameters
# ---------------------

# Distance (in degrees) that counts as a GPS "jump" (this is greater than the average stop distance)
JUMP_DISTANCE_THRESHOLD = 0.005  # ≈ 500 meters

# Time gap (in seconds) that counts as a disappearance
DISAPPEARANCE_TIME_THRESHOLD = 300  # 5 minutes

# Only show vehicles with at least this many jumps
MIN_JUMP_COUNT_PER_VEHICLE = 1

# Number of vehicles to sample for map clarity
NUM_VEHICLES_TO_SAMPLE = 200


#Load the RT snapshot

In [None]:
#✅ Step 1a: Load and Inspect the New Snapshot
# Read the realtime snapshot CSV; later cells refer to the result as df_new.
new_snapshot_path = "/content/cabq_gtfs_snapshots_20250722_1415.csv"
df_new = pd.read_csv(filepath_or_buffer=new_snapshot_path)
# Uncomment for a quick look at dtypes and the first rows:
# df_new.info()
# df_new.head(3)

# #Also print the columns:
# print(df_new.columns.tolist())


In [None]:
#✅ Step 2a: Trip ID Validity
# Strings that mark a row as having no usable trip assignment. "nan" and
# "None" are included because trip_id is cast to str just below.
INVALID_TRIP_ID_VALUES = ["0", "Undetermined", "nan", "", "None"]

df_new["trip_id"] = df_new["trip_id"].astype(str)

# Keep the count under its own name: the filtering cell and classify_row use
# `invalid_trip_ids` for the *list* of sentinel values, so binding an integer
# to that name here breaks them whenever this cell is re-run afterwards.
invalid_trip_count = int(df_new["trip_id"].isin(INVALID_TRIP_ID_VALUES).sum())
total_rows = len(df_new)

print(f"Total rows: {total_rows}")
print(f"Invalid trip_ids: {invalid_trip_count}")
# An empty snapshot reports 0% instead of raising ZeroDivisionError.
percent_valid = 100 * (total_rows - invalid_trip_count) / total_rows if total_rows else 0.0
print(f"Percent valid: {percent_valid:.2f}%")


In [None]:
# I believe this is fully deprecated and can be removed
# #✅ Step 3a: Route ID Normalization (Preview)
# # We'll also check route formatting now that we know floats were a problem previously:

# df_new["route_id"] = df_new["route_id"].astype(str).str.replace(r"\.0$", "", regex=True)
# print(df_new["route_id"].dropna().unique()[:10])


#Load the Static data

In [None]:
#✅ Step 1b: Reload Static GTFS and Normalize It

# Adjust if needed — make sure this is the static feed aligned with 2025-07-22
gtfs_zip_path = "/content/google_transit.zip"

# Unpack the whole archive so later cells can read individual tables from disk.
with zipfile.ZipFile(gtfs_zip_path, mode="r") as static_feed:
    static_feed.extractall(path="/content/gtfs_static")

# Every column is read as str (no numeric type inference).
trips, routes = (
    pd.read_csv(table_path, dtype=str)
    for table_path in (
        "/content/gtfs_static/trips.txt",
        "/content/gtfs_static/routes.txt",
    )
)


In [None]:
#✅ Step 2b: Filter invalid trip_ids
# trip_id values that do not identify a real trip.
invalid_trip_ids = ["0", "Undetermined", "nan", "", "None"]
# Keep only rows whose trip_id is not one of those values; copy so that later
# column assignments on df_clean do not write back into df_new.
has_usable_trip = ~df_new["trip_id"].isin(invalid_trip_ids)
df_clean = df_new.loc[has_usable_trip].copy()


In [None]:
#✅ Step 3b: Merge with trips.txt to Get route_id
# Left join: every realtime row is kept, even when its trip_id is absent from trips.txt.
df_with_trips = pd.merge(df_clean, trips, how="left", on="trip_id")


In [None]:
#✅ Step 4b: Merge with routes.txt to Get Descriptive Info
# Normalise the join key to str but leave missing values missing. A plain
# astype(str) turns NaN into the literal "nan", which hides rows whose trip_id
# had no match in trips.txt from the pd.isna(route_id) check in classify_row.
route_id_missing = df_with_trips["route_id"].isna()
df_with_trips["route_id"] = df_with_trips["route_id"].astype(str).mask(route_id_missing)

# Ensure consistent types
trips["shape_id"] = trips["shape_id"].astype(str)
routes["route_id"] = routes["route_id"].astype(str)

# Left join: rows without a matching route keep NaN in the routes.txt columns.
df_full = df_with_trips.merge(routes, on="route_id", how="left")


In [None]:
#✅ Step 5b: Load shapes.txt
# shape_id is read as str; the other columns use pandas' inferred dtypes.
shapes = pd.read_csv("/content/gtfs_static/shapes.txt", dtype={"shape_id": str})

# Build LineStrings for each shape_id
# Points are ordered by shape_pt_sequence and passed as (lon, lat) pairs.
shape_lines = {}
for shape_id, group in shapes.groupby("shape_id"):
    ordered = group.sort_values("shape_pt_sequence")
    shape_lines[shape_id] = LineString(
        list(zip(ordered["shape_pt_lon"], ordered["shape_pt_lat"]))
    )

# Merge trips and routes to link shape_id to route_short_name
# Rows without a route_short_name are dropped, then the first row per shape_id is kept.
trips_with_routes = trips.merge(routes, on="route_id", how="left")
named_trips = trips_with_routes.dropna(subset=["route_short_name"])
first_per_shape = named_trips.drop_duplicates(subset=["shape_id"])
shape_route_map = first_per_shape.set_index("shape_id")[["route_id", "route_short_name"]]

In [None]:
#✅ Step 6b: Create a data_quality Flag
def classify_row(row):
    """Return the data-quality label for one merged realtime row.

    Checks run in order and the first one that matches wins: trip_id listed
    in the module-level ``invalid_trip_ids``, then a missing ``route_id``,
    then a missing ``route_long_name``. A row matching none of them is
    labelled "valid".
    """
    if row["trip_id"] in invalid_trip_ids:
        return "invalid_trip_id"
    if pd.isna(row["route_id"]):
        return "missing_route_id"
    if pd.isna(row["route_long_name"]):
        return "missing_route_metadata"
    return "valid"


# Label every row, then print how many rows fall under each label.
df_full["data_quality"] = df_full.apply(classify_row, axis=1)
quality_counts = df_full["data_quality"].value_counts()
print(quality_counts)


In [None]:
#Load the trips, routes, shapes from Static data
# Read the three tables straight from the archive with pandas' default dtype
# inference (no dtype=str here).
with zipfile.ZipFile(gtfs_zip_path, mode="r") as z:
    trips_df, routes_df, shapes_df = (
        pd.read_csv(z.open(member))
        for member in ("trips.txt", "routes.txt", "shapes.txt")
    )

#Clean the RT data

In [None]:
#🔹 1.1 Filter for Valid Rows
is_valid_row = df_full["data_quality"] == "valid"
df_valid = df_full.loc[is_valid_row].copy()

#🔹 1.2 Parse Timestamps
# timestamp_collected is parsed into timezone-aware UTC datetimes in a new "timestamp" column.
df_valid["timestamp"] = pd.to_datetime(df_valid["timestamp_collected"], utc=True)

#🔹 1.3 Sort by Vehicle and Timestamp
df_valid = df_valid.sort_values(by=["vehicle_id", "timestamp"])

#🔹 1.4 Organize by Vehicle
#Dictionary keyed by vehicle ID; each value holds that vehicle's rows of the sorted frame:
vehicle_groups = {
    vehicle_key: vehicle_rows
    for vehicle_key, vehicle_rows in df_valid.groupby("vehicle_id")
}

#You can confirm how many distinct vehicles you’re tracking:
print("Vehicle count:", len(vehicle_groups))


#Anomaly Detection

🔹 1. Detect Jumps and Gaps

Already implemented, but here’s the modular form:

In [None]:
#1. Detect Jumps and Gaps
def detect_jumps_and_gaps(df, distance_threshold=250, time_threshold=90):
    """Compare every row of *df* with the row before it and report jumps/gaps.

    A consecutive pair is reported when the elapsed time is greater than
    ``time_threshold`` (seconds) or the geodesic distance between the two
    positions is greater than ``distance_threshold`` (metres); either
    condition alone is enough.

    Rows are compared in the order given, so the caller is expected to pass
    one vehicle's rows already sorted by ``timestamp``.

    Returns:
        A list of dicts, one per reported pair, holding the vehicle id, both
        timestamps, the measured time and distance, and the ``is_gap`` /
        ``is_jump`` booleans.
    """
    flagged = []
    for pos in range(len(df) - 1):
        before = df.iloc[pos]
        after = df.iloc[pos + 1]

        elapsed_s = (after["timestamp"] - before["timestamp"]).total_seconds()
        moved_m = geodesic(
            (before["latitude"], before["longitude"]),
            (after["latitude"], after["longitude"]),
        ).meters

        is_gap = elapsed_s > time_threshold
        is_jump = moved_m > distance_threshold
        if not (is_gap or is_jump):
            continue

        flagged.append({
            "vehicle_id": after["vehicle_id"],
            "timestamp_prev": before["timestamp"],
            "timestamp_curr": after["timestamp"],
            "time_diff_sec": elapsed_s,
            "distance_m": moved_m,
            "is_gap": is_gap,
            "is_jump": is_jump,
            "anomaly_type": "jump_or_gap",
        })
    return flagged


In [None]:
#🔹 2. Detect Stuck Vehicles
def detect_stuck_vehicle(df, speed_thresh=1.0, window=4):
    """Return the rows of *df* where the vehicle appears to be stationary.

    A row is flagged when, over the trailing ``window`` rows, the mean of
    ``speed_mph`` is below ``speed_thresh`` and the mean absolute row-to-row
    change of both ``latitude`` and ``longitude`` is below 0.0001. Rows whose
    window is incomplete are not flagged.

    Returns:
        The flagged rows with an added ``anomaly_type`` column set to
        "stuck_vehicle".
    """
    def trailing_mean(series):
        return series.rolling(window).mean()

    slow = trailing_mean(df["speed_mph"]) < speed_thresh
    lat_still = trailing_mean(df["latitude"].diff().abs()) < 0.0001
    lon_still = trailing_mean(df["longitude"].diff().abs()) < 0.0001

    stuck_flags = slow & lat_still & lon_still
    return df.loc[stuck_flags.fillna(False)].assign(anomaly_type="stuck_vehicle")


In [None]:
#🔹 3. Detect Impossible Speeds
def detect_impossible_speeds(df, speed_limit_kph=120):
    """Report consecutive rows whose implied speed exceeds ``speed_limit_kph``.

    The speed is the geodesic distance between two consecutive positions
    divided by the time between their timestamps. Pairs with identical
    timestamps are skipped.

    Returns:
        A DataFrame with one row per reported pair (vehicle id, timestamp of
        the later row, computed speed in km/h, distance in metres); an empty
        DataFrame when nothing is reported.
    """
    too_fast = []
    for pos in range(len(df) - 1):
        before, after = df.iloc[pos], df.iloc[pos + 1]

        elapsed_s = (after["timestamp"] - before["timestamp"]).total_seconds()
        if elapsed_s == 0:
            # No time elapsed: the speed would be a division by zero.
            continue

        moved_m = geodesic(
            (before["latitude"], before["longitude"]),
            (after["latitude"], after["longitude"]),
        ).meters
        # metres per second -> kilometres per hour
        speed_kph = (moved_m / elapsed_s) * 3.6

        if speed_kph > speed_limit_kph:
            too_fast.append({
                "vehicle_id": after["vehicle_id"],
                "timestamp_curr": after["timestamp"],
                "computed_speed_kph": speed_kph,
                "distance_m": moved_m,
                "anomaly_type": "impossible_speed",
            })
    return pd.DataFrame(too_fast)


In [None]:
#🔹 4. Detect Backtracking (Heading Reversal)
def detect_backtracking(df, reversal_thresh=160):
    """Return the rows of *df* whose heading reversed relative to the previous row.

    The change in ``heading`` between consecutive rows is folded onto the
    circle, so 350° -> 10° counts as a 20° turn rather than 340°. A row is
    flagged when that turn is at least ``reversal_thresh`` degrees.

    With the default threshold this flags the same rows as the former fixed
    160..200 window for headings in [0, 360). The upper bound now follows the
    threshold instead of being hard-coded for 160.

    Args:
        df: Rows of one vehicle in time order, with a numeric ``heading``
            column in degrees.
        reversal_thresh: Minimum turn, in degrees (0..180), that counts as a
            reversal.

    Returns:
        The flagged rows with an added ``anomaly_type`` column set to
        "backtracking".
    """
    raw_change = df["heading"].diff().abs() % 360
    # Fold onto [0, 180]: a raw change of 200° is the same turn as 160°.
    turn = raw_change.where(raw_change <= 180, 360 - raw_change)
    # Comparisons against NaN (first row, missing headings) evaluate to False.
    backtrack_flags = turn >= reversal_thresh
    return df[backtrack_flags.fillna(False)].assign(anomaly_type="backtracking")


In [None]:
#🔹 5. Detect Repeated Points
def detect_repeated_points(df):
    """Return the rows of *df* whose position repeats the previous row's.

    A row is flagged when both ``latitude`` and ``longitude`` differ from the
    preceding row by less than 1e-5 in absolute value. The first row has no
    predecessor and is never flagged.

    Returns:
        The flagged rows with an added ``anomaly_type`` column set to
        "repeated_points".
    """
    tolerance = 1e-5
    lat_same, lon_same = (
        df[axis].diff().abs() < tolerance for axis in ("latitude", "longitude")
    )
    repeated = lat_same & lon_same
    return df.loc[repeated.fillna(False)].assign(anomaly_type="repeated_points")


In [None]:
#🔹 6. Detect Disappearance Without Return
def detect_disappeared(df, snapshot_end_time, min_gap_minutes=10):
    """Report a vehicle whose last sighting is well before the snapshot ends.

    The vehicle is reported when more than ``min_gap_minutes`` minutes lie
    between the latest ``timestamp`` in *df* and ``snapshot_end_time``.

    Returns:
        A one-row DataFrame (vehicle id taken from the first row, ``last_seen``,
        ``anomaly_type`` "disappearance"), or an empty DataFrame otherwise.
    """
    last_seen = df["timestamp"].max()
    allowed_silence_s = min_gap_minutes * 60
    went_silent = (snapshot_end_time - last_seen).total_seconds() > allowed_silence_s
    if not went_silent:
        return pd.DataFrame()

    record = {
        "vehicle_id": df["vehicle_id"].iloc[0],
        "last_seen": last_seen,
        "anomaly_type": "disappearance",
    }
    return pd.DataFrame([record])


In [None]:
#🔹 7. Detect Early Appearance
def detect_early_appearance(df, snapshot_start_time, margin_seconds=30):
    """Report a vehicle first seen within ``margin_seconds`` of the snapshot start.

    The vehicle is reported when fewer than ``margin_seconds`` seconds lie
    between ``snapshot_start_time`` and the earliest ``timestamp`` in *df*.

    Returns:
        A one-row DataFrame (vehicle id taken from the first row,
        ``first_seen``, ``anomaly_type`` "early_appearance"), or an empty
        DataFrame otherwise.
    """
    first_seen = df["timestamp"].min()
    delay_s = (first_seen - snapshot_start_time).total_seconds()
    appeared_early = delay_s < margin_seconds
    if not appeared_early:
        return pd.DataFrame()

    record = {
        "vehicle_id": df["vehicle_id"].iloc[0],
        "first_seen": first_seen,
        "anomaly_type": "early_appearance",
    }
    return pd.DataFrame([record])


In [None]:
#🔹 8. Detect Off-Route Movement
#1. Preprocess Route Shapes

# Load and group shape points
# shape_id is cast to str so it can be matched against the str(shape_id) keys
# that detect_off_route looks up.
shapes = pd.read_csv("/content/gtfs_static/shapes.txt").astype({"shape_id": str})

# Build LineStrings for each shape_id
# Points are ordered by shape_pt_sequence and passed as (lon, lat) pairs.
shape_lines = {}
for shape_id, group in shapes.groupby("shape_id"):
    ordered_points = group.sort_values("shape_pt_sequence")
    lon_lat_pairs = [
        (lon, lat)
        for lon, lat in zip(ordered_points["shape_pt_lon"], ordered_points["shape_pt_lat"])
    ]
    shape_lines[shape_id] = LineString(lon_lat_pairs)


#2. Detect Off-Route for Each Vehicle Point
#A GPS point is reported when it lies more than buffer_m metres from its assigned shape line.

def detect_off_route(df_vehicle, shape_lines, buffer_m=50):
    """Report the rows of *df_vehicle* that lie too far from their route shape.

    For each row the shape is looked up in ``shape_lines`` by ``str(shape_id)``;
    rows whose shape is not in the mapping are skipped. The closest point on
    the shape is found with ``project``/``interpolate`` in lon/lat coordinates,
    and the geodesic distance to it is compared with ``buffer_m`` (metres).

    Returns:
        A DataFrame with one row per off-route point (vehicle id, timestamp,
        route short name, distance from the route, position, shape id); an
        empty DataFrame when nothing is reported.
    """
    off_route_rows = []
    for _, obs in df_vehicle.iterrows():
        shape_id = str(obs.get("shape_id"))
        if shape_id not in shape_lines:
            # No geometry known for this shape: nothing to measure against.
            continue

        lat, lon = obs["latitude"], obs["longitude"]
        route_line = shape_lines[shape_id]
        gps_point = Point(lon, lat)

        # Find closest point on route and compute geodesic distance
        offset_along_line = route_line.project(gps_point)
        nearest = route_line.interpolate(offset_along_line)
        dist_m = geodesic((lat, lon), (nearest.y, nearest.x)).meters

        if dist_m > buffer_m:
            off_route_rows.append({
                "vehicle_id": obs["vehicle_id"],
                "timestamp": obs["timestamp"],
                "route_short_name": obs.get("route_short_name"),
                "distance_from_route_m": dist_m,
                "latitude": lat,
                "longitude": lon,
                "shape_id": shape_id,
                "anomaly_type": "off_route",
            })
    return pd.DataFrame(off_route_rows)



In [None]:
#Bring together all Anomalies

# Step 1: Normalize timestamp and rebuild vehicle groups
df_valid["timestamp"] = pd.to_datetime(df_valid["timestamp_collected"], utc=True)
vehicle_groups = {
    vehicle_key: vehicle_rows
    for vehicle_key, vehicle_rows in df_valid.groupby("vehicle_id")
}

# Step 2: Detect each anomaly type
# One accumulator per detector. jumpgap_records collects dicts; the others
# collect the non-empty DataFrames returned by their detector.
jumpgap_records = []
stuck_records = []
speed_records = []
backtrack_records = []
repeated_records = []
disappear_records = []
early_records = []
offroute_records = []

# First and last timestamp in the whole snapshot; used as the reference window
# by the disappearance / early-appearance detectors.
snapshot_start = df_valid["timestamp"].min()
snapshot_end = df_valid["timestamp"].max()

for vehicle_id, df_vehicle in vehicle_groups.items():
    df_vehicle = df_vehicle.sort_values("timestamp").reset_index(drop=True)

    # 1. Jumps and Gaps
    jumpgap_records.extend(detect_jumps_and_gaps(df_vehicle))

    # 2-8. Detectors that return a DataFrame, run in the listed order; each
    # result is paired with the accumulator it belongs to.
    detector_results = (
        (detect_stuck_vehicle(df_vehicle), stuck_records),                      # 2. Stuck Vehicles
        (detect_impossible_speeds(df_vehicle), speed_records),                  # 3. Impossible Speeds
        (detect_backtracking(df_vehicle), backtrack_records),                   # 4. Backtracking
        (detect_repeated_points(df_vehicle), repeated_records),                 # 5. Repeated Points
        (detect_disappeared(df_vehicle, snapshot_end), disappear_records),      # 6. Disappearance
        (detect_early_appearance(df_vehicle, snapshot_start), early_records),   # 7. Early Appearance
        (detect_off_route(df_vehicle, shape_lines), offroute_records),          # 8. Off-Route
    )
    for found, accumulator in detector_results:
        if not found.empty:
            accumulator.append(found)


def _stack_frames(frames):
    """Concatenate *frames* with a fresh index, or return an empty DataFrame."""
    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()


# Step 3: Combine to DataFrames
df_anomalies_jumpgap   = pd.DataFrame(jumpgap_records)
df_anomalies_stuck     = _stack_frames(stuck_records)
df_anomalies_speed     = _stack_frames(speed_records)
df_anomalies_backtrack = _stack_frames(backtrack_records)
df_anomalies_repeated  = _stack_frames(repeated_records)
df_anomalies_disappear = _stack_frames(disappear_records)
df_anomalies_early     = _stack_frames(early_records)
df_anomalies_offroute  = _stack_frames(offroute_records)

# Step 4: Combine all anomalies into a single DataFrame
# Only non-empty frames that carry an anomaly_type column take part.
anomaly_frames = [
    frame
    for frame in (
        df_anomalies_jumpgap,
        df_anomalies_stuck,
        df_anomalies_speed,
        df_anomalies_backtrack,
        df_anomalies_repeated,
        df_anomalies_disappear,
        df_anomalies_early,
        df_anomalies_offroute,
    )
    if 'anomaly_type' in frame.columns and not frame.empty
]
df_anomalies_full = pd.concat(anomaly_frames, ignore_index=True)
print("Unified anomaly count:", len(df_anomalies_full))
df_anomalies_full["anomaly_type"].value_counts()


In [None]:

# Ensure shape_id is string type
df_anomalies_offroute["shape_id"] = df_anomalies_offroute["shape_id"].astype(str)

# Join to enrich anomalies with route info
# Columns that exist on both sides keep their name on the left and get the
# "_from_map" suffix on the right.
df_anomalies_offroute = pd.merge(
    df_anomalies_offroute,
    shape_route_map,
    how="left",
    on="shape_id",
    suffixes=("", "_from_map"),
)

# If route_short_name was missing, replace it
names_from_map = df_anomalies_offroute["route_short_name_from_map"]
df_anomalies_offroute["route_short_name"] = (
    df_anomalies_offroute["route_short_name"].fillna(names_from_map)
)

# Drop helper column
df_anomalies_offroute = df_anomalies_offroute.drop(columns=["route_short_name_from_map"])


In [None]:
#Sanity Check
# Count the off-route anomalies that still have no route_short_name after the patch.
missing_routes = df_anomalies_offroute["route_short_name"].isna().sum()
print(f"Remaining anomalies with missing route_short_name: {missing_routes}")
#A printed 0 means every entry now has a route_short_name.

In [None]:
# #Not sure if this is needed
# #Assuming your vehicle groups are stored like:
# vehicle_groups = dict(tuple(df_valid.groupby("vehicle_id")))

# #And you’re aggregating anomalies like:
# all_anomalies = []

# #Add off-route detection per vehicle:
# for vehicle_id, df_vehicle in vehicle_groups.items():
#     df_vehicle = df_vehicle.sort_values("timestamp").reset_index(drop=True)

#     # Call anomaly modules
#     off_route_df = detect_off_route(df_vehicle, shape_lines)

#     # Append results
#     if not off_route_df.empty:
#         all_anomalies.append(off_route_df)

# # Final result
# df_anomalies_offroute = pd.concat(all_anomalies, ignore_index=True)

# #Optional Check:
# print("Off-route anomalies detected:", len(df_anomalies_offroute))
# df_anomalies_offroute.sort_values("distance_from_route_m", ascending=False).head()

In [None]:
#🔹 1.1 Filter for Valid Rows
# Rebuilds df_valid from df_full, discarding any columns added to df_valid since.
df_valid = df_full[df_full["data_quality"].eq("valid")].copy()

#🔹 1.2 Parse Timestamps
parsed_collection_time = pd.to_datetime(df_valid["timestamp_collected"], utc=True)
df_valid["timestamp"] = parsed_collection_time

#🔹 1.3 Sort by Vehicle and Timestamp
df_valid = df_valid.sort_values(by=["vehicle_id", "timestamp"])

#🔹 1.4 Organize by Vehicle
#Dictionary keyed by vehicle ID; each value holds that vehicle's rows of the sorted frame:
vehicle_groups = dict(df_valid.groupby("vehicle_id").__iter__())

#You can confirm how many distinct vehicles you’re tracking:
print("Vehicle count:", len(vehicle_groups))


#Summary Stats by Route/Anomaly

In [None]:
#Patch route_short_name into df_valid (ensures downstream consistency)
# Build trip-to-route lookup from your GTFS static data
trip_to_route_lookup = trips_df[["trip_id", "route_id"]].merge(
    routes_df[["route_id", "route_short_name"]], on="route_id", how="left"
)

#the data types of the trip_id column are mismatched between df_valid and trip_to_route_lookup
df_valid["trip_id"] = df_valid["trip_id"].astype(str)
trip_to_route_lookup["trip_id"] = trip_to_route_lookup["trip_id"].astype(str)

# Patch into df_valid
# df_valid is built from df_full, which already carries route_id and
# route_short_name from the routes.txt merge. A DataFrame.merge here would
# rename those overlapping columns to *_x / *_y, leaving no plain
# "route_short_name" column for the cells below (and adding more suffixed
# columns on every re-run). Instead, look the name up by trip_id and only
# fill the gaps, which keeps the column set stable.
route_name_by_trip = (
    trip_to_route_lookup
    .dropna(subset=["route_short_name"])
    .drop_duplicates(subset=["trip_id"])
    .set_index("trip_id")["route_short_name"]
    # routes_df was read with inferred dtypes, so numeric-looking names may be
    # int/float here; render them as text without a trailing ".0".
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
)
looked_up_names = df_valid["trip_id"].map(route_name_by_trip)
if "route_short_name" in df_valid.columns:
    df_valid["route_short_name"] = df_valid["route_short_name"].fillna(looked_up_names)
else:
    df_valid["route_short_name"] = looked_up_names


In [None]:
#Step 1: Build a lookup table
# Columns needed to look up route_short_name by vehicle and time.
lookup_columns = ["vehicle_id", "timestamp", "route_short_name"]
route_lookup = df_valid.loc[:, lookup_columns].copy()
route_lookup["timestamp"] = pd.to_datetime(route_lookup["timestamp"], utc=True)


In [None]:
#Step 2: Also convert timestamp in anomalies to datetime
# Parsed as timezone-aware UTC, the same way as route_lookup["timestamp"].
anomaly_timestamps = pd.to_datetime(df_anomalies_full["timestamp"], utc=True)
df_anomalies_full["timestamp"] = anomaly_timestamps


In [None]:
#Step 3: Join by vehicle and timestamp (merge nearest match within a short tolerance)
# Sort before merge_asof
vehicle_time_keys = ["vehicle_id", "timestamp"]
route_lookup = route_lookup.sort_values(vehicle_time_keys)
df_anomalies_full = df_anomalies_full.sort_values(vehicle_time_keys)

#Filter out the rows with null timestamps before merging
# Check how many nulls you have
# NOTE(review): the jump/gap, impossible-speed, disappearance and
# early-appearance records carry timestamp_curr / last_seen / first_seen
# rather than "timestamp", so they are null here and removed by this filter.
timestamp_is_null = df_anomalies_full["timestamp"].isna()
print("Null timestamps in anomalies:", timestamp_is_null.sum())

# Drop them before merge_asof (can't match anything anyway)
df_anomalies_full = df_anomalies_full[~timestamp_is_null]

# Enforce datetime dtype and full sort
route_lookup["timestamp"] = pd.to_datetime(
    route_lookup["timestamp"], errors="coerce", utc=True
)
df_anomalies_full["timestamp"] = pd.to_datetime(
    df_anomalies_full["timestamp"], errors="coerce", utc=True
)

#Re-sort before the merge
# merge_asof needs both frames ordered by the "on" column.
route_lookup = route_lookup.sort_values("timestamp")
df_anomalies_full = df_anomalies_full.sort_values("timestamp")


# Merge with tolerance: 60 seconds
# Per vehicle, take the lookup row nearest in time; columns present on both
# sides keep their name on the left and get "_from_lookup" on the right.
asof_options = dict(
    by="vehicle_id",
    on="timestamp",
    direction="nearest",
    tolerance=pd.Timedelta("60s"),
    suffixes=("", "_from_lookup"),
)
df_anomalies_full = pd.merge_asof(df_anomalies_full, route_lookup, **asof_options)

# Final fallback
names_from_lookup = df_anomalies_full["route_short_name_from_lookup"]
df_anomalies_full["route_short_name"] = (
    df_anomalies_full["route_short_name"].fillna(names_from_lookup)
)

still_missing = df_anomalies_full["route_short_name"].isna().sum()
print("Remaining missing route names:", still_missing)

In [None]:
# Treat 'Unknown' as null for fallback
names_without_unknown = df_anomalies_full["route_short_name"].replace("Unknown", pd.NA)

# Re-apply fallback
# Wherever the name is now null, take the value found by the timestamp lookup.
df_anomalies_full["route_short_name"] = names_without_unknown.combine_first(
    df_anomalies_full["route_short_name_from_lookup"]
)

# Confirm
remaining_missing = df_anomalies_full["route_short_name"].isna().sum()
print("Remaining missing route names:", remaining_missing)
print(df_anomalies_full["route_short_name"].value_counts(dropna=False).head())


In [None]:
# Count anomalies per (route, anomaly type): one row per route_short_name,
# one column per anomaly_type, 0 where a combination never occurs.
anomaly_by_route_type = pd.pivot_table(
    df_anomalies_full,
    index="route_short_name",
    columns="anomaly_type",
    aggfunc="size",
    fill_value=0,
)
anomaly_by_route_type = anomaly_by_route_type.sort_index()

anomaly_by_route_type


#Plot in Folium

In [None]:
#Step 1: Ensure latitude and longitude are floats
# Values that cannot be parsed as numbers become NaN.
for coordinate in ("latitude", "longitude"):
    df_valid[coordinate] = pd.to_numeric(df_valid[coordinate], errors="coerce")

#Step 2: Ensure timestamp_collected is datetime
# Unparseable values become NaT; the result is timezone-aware UTC.
df_valid["timestamp_collected"] = pd.to_datetime(
    df_valid["timestamp_collected"], utc=True, errors="coerce"
)


In [None]:
# Use the full cleaned dataframe
# NOTE: JUMP_DISTANCE_THRESHOLD, DISAPPEARANCE_TIME_THRESHOLD and
# MIN_JUMP_COUNT_PER_VEHICLE must already be defined when this cell runs.
df_full = df_valid.copy()

# Compute diffs for position and time
# Differences are taken row-to-row within each vehicle, so the first row of
# every vehicle gets NaN. jump_dist is a straight-line distance in coordinate
# degrees (the lat/lon differences are not converted to metres).
df_full["lat_diff"] = df_full.groupby("vehicle_id")["latitude"].diff()
df_full["lon_diff"] = df_full.groupby("vehicle_id")["longitude"].diff()
df_full["jump_dist"] = (df_full["lat_diff"]**2 + df_full["lon_diff"]**2)**0.5

df_full["time_diff"] = df_full.groupby("vehicle_id")["timestamp_collected"].diff().dt.total_seconds()

# Label events
# Both flags sit on the LATER row of a pair: time_diff / jump_dist measure
# back to the vehicle's previous row.
df_full["is_jump"] = df_full["jump_dist"] > JUMP_DISTANCE_THRESHOLD
df_full["is_disappearance"] = df_full["time_diff"] > DISAPPEARANCE_TIME_THRESHOLD

# Enrich jump rows with previous position
# The shifted Series are aligned to jumps_df by index label on assignment.
jumps_df = df_full[df_full["is_jump"]].copy()
jumps_df["lat_prev"] = df_full.groupby("vehicle_id")["latitude"].shift()
jumps_df["lon_prev"] = df_full.groupby("vehicle_id")["longitude"].shift()
jumps_df["timestamp_prev"] = df_full.groupby("vehicle_id")["timestamp_collected"].shift()
jumps_df["timestamp_curr"] = jumps_df["timestamp_collected"]

# Filter for vehicles with sufficient jumps
jump_counts = jumps_df["vehicle_id"].value_counts()
keep_jumpers = jump_counts[jump_counts >= MIN_JUMP_COUNT_PER_VEHICLE].index
jumps_df = jumps_df[jumps_df["vehicle_id"].isin(keep_jumpers)]

# Recalculate filtered set for map display
df_jumpers_only = df_full[df_full["vehicle_id"].isin(keep_jumpers)].copy()

# Identify disappearance and reappearance points
# is_disappearance is True on the first row AFTER a long gap, i.e. where the
# vehicle shows up again, so those rows are the reappearance points.
reappear_df = df_jumpers_only[df_jumpers_only["is_disappearance"]].copy()
# The disappearance point is the last row BEFORE the gap: the row whose next
# row *for the same vehicle* is flagged. Shifting within each vehicle keeps a
# vehicle's last row from being paired with another vehicle's first row.
is_last_before_gap = (
    df_jumpers_only.groupby("vehicle_id")["is_disappearance"]
    .shift(-1, fill_value=False)
    .fillna(False)
    .astype(bool)
)
disappear_df = df_jumpers_only[is_last_before_gap].copy()


In [None]:
#Patch route_short_name into shapes_df so each shape can be associated with its route for filtering in Folium.
# NOTE(review): this rebinds shape_route_map and adds columns to shapes_df, so
# re-running the cell merges into an already-patched frame.
# Step 1: Extract shape_id to route_id mapping from trips_df
shape_to_route = trips_df.loc[:, ["shape_id", "route_id"]].drop_duplicates()

# Step 2: Map route_id to route_short_name from routes_df
route_id_to_name = routes_df.loc[:, ["route_id", "route_short_name"]]

# Step 3: Merge to associate shape_id with route_short_name
shape_route_map = pd.merge(shape_to_route, route_id_to_name, how="left", on="route_id")

# Step 4: Merge back into shapes_df to add route_short_name column
shapes_df = pd.merge(shapes_df, shape_route_map, how="left", on="shape_id")

# Step 5: Confirm result
print("Sample of shapes_df with route_short_name:")
shape_route_preview = shapes_df[["shape_id", "route_id", "route_short_name"]].drop_duplicates()
print(shape_route_preview.head())

#Each row of shapes_df now carries the route_id / route_short_name matched to its shape_id.
#Route shapes can then be grouped per route like this:
#for route_name, group in shapes_df.groupby("route_short_name"): ...



In [None]:
# The map classes are used unqualified below. Only `import folium` is active
# at the top of the notebook (the from-import there is commented out), so
# bring the names into scope here.
from folium import Map, FeatureGroup, CircleMarker, PolyLine, Marker, Icon, LayerControl


def _has_location(row, lat_key="latitude", lon_key="longitude"):
    """Return True when both coordinates of *row* are present (not missing/NaN)."""
    return not (pd.isna(row.get(lat_key)) or pd.isna(row.get(lon_key)))


# --- Create base map ---
mymap = Map(location=[35.0844, -106.6504], zoom_start=12)

# --- Define color map for route-based layers ---
# One colour per route name that occurs in the anomalies, drawn from "tab20".
route_names = sorted(df_anomalies_full['route_short_name'].dropna().unique())
cmap = plt.get_cmap("tab20", len(route_names))
color_map = {route: mcolors.to_hex(cmap(i)) for i, route in enumerate(route_names)}

# --- Add anomaly points by (route, anomaly_type) ---
# Each (route, anomaly type) pair becomes its own toggleable layer, hidden by default.
for (route, anomaly), subset in df_anomalies_full.groupby(["route_short_name", "anomaly_type"]):
    group = FeatureGroup(name=f"{route} – {anomaly}", show=False)
    for _, row in subset.iterrows():
        if not _has_location(row):
            # A marker cannot be placed without both coordinates.
            continue
        popup_text = (
            f"Anomaly: {anomaly}<br>"
            f"Route: {route}<br>"
            f"Vehicle: {row.get('vehicle_id', 'N/A')}<br>"
            f"Timestamp: {row.get('timestamp', 'N/A')}"
        )
        CircleMarker(
            location=[row["latitude"], row["longitude"]],
            radius=4,
            color=color_map.get(route, "black"),
            fill=True,
            fill_opacity=0.9,
            popup=popup_text
        ).add_to(group)
    group.add_to(mymap)

# --- Add jump markers and lines by route ---
# Each jump is drawn as a line from the previous position (blue) to the
# current one (purple).
if 'jumps_df' in globals() and not jumps_df.empty:
    for route, group_df in jumps_df.groupby("route_short_name"):
        jump_line_group = FeatureGroup(name=f"{route} – Jump Lines", show=False)
        jump_point_group = FeatureGroup(name=f"{route} – Jump Start/End", show=False)

        for _, row in group_df.iterrows():
            if not (_has_location(row) and _has_location(row, "lat_prev", "lon_prev")):
                continue
            start = [row["lat_prev"], row["lon_prev"]]
            end = [row["latitude"], row["longitude"]]
            vehicle = row.get("vehicle_id", "N/A")
            t_prev = row.get("timestamp_prev", "N/A")
            t_curr = row.get("timestamp_curr", "N/A")

            PolyLine(
                locations=[start, end],
                color="orange", weight=2,
                tooltip=f"Vehicle {vehicle} jump"
            ).add_to(jump_line_group)

            CircleMarker(
                location=start, radius=4, color="blue", fill=True,
                fill_opacity=0.9,
                popup=f"Vehicle {vehicle} START<br>{t_prev}"
            ).add_to(jump_point_group)

            CircleMarker(
                location=end, radius=4, color="purple", fill=True,
                fill_opacity=0.9,
                popup=f"Vehicle {vehicle} END<br>{t_curr}"
            ).add_to(jump_point_group)

        jump_line_group.add_to(mymap)
        jump_point_group.add_to(mymap)

# --- Add disappearances and reappearances by route ---
if 'disappear_df' in globals() and not disappear_df.empty:
    for route, group_df in disappear_df.groupby("route_short_name"):
        disappear_group = FeatureGroup(name=f"{route} – Disappearances", show=False)
        for _, row in group_df.iterrows():
            if not _has_location(row):
                continue
            Marker(
                location=[row["latitude"], row["longitude"]],
                icon=Icon(color="red", icon="times-circle", prefix="fa"),
                tooltip=f"Vehicle {row.get('vehicle_id')} disappeared<br>{row.get('timestamp')}"
            ).add_to(disappear_group)
        disappear_group.add_to(mymap)

if 'reappear_df' in globals() and not reappear_df.empty:
    for route, group_df in reappear_df.groupby("route_short_name"):
        reappear_group = FeatureGroup(name=f"{route} – Reappearances", show=False)
        for _, row in group_df.iterrows():
            if not _has_location(row):
                continue
            Marker(
                location=[row["latitude"], row["longitude"]],
                icon=Icon(color="green", icon="check-circle", prefix="fa"),
                tooltip=f"Vehicle {row.get('vehicle_id')} reappeared<br>{row.get('timestamp')}"
            ).add_to(reappear_group)
        reappear_group.add_to(mymap)

# --- Add route shapes as toggleable layers by route ---
# Every shape_id of a route is drawn as its own polyline inside that route's layer.
if 'shapes_df' in globals() and not shapes_df.empty:
    for route, shape_group in shapes_df.groupby("route_short_name"):
        route_group = FeatureGroup(name=f"{route} – Route Shape", show=False)
        for shape_id, shape_data in shape_group.groupby("shape_id"):
            shape_data = shape_data.sort_values("shape_pt_sequence")
            # folium expects (lat, lon) order.
            latlons = list(zip(shape_data["shape_pt_lat"], shape_data["shape_pt_lon"]))
            PolyLine(
                locations=latlons,
                color=color_map.get(route, "gray"),
                weight=2,
                opacity=0.6,
                popup=f"Route {route} | Shape {shape_id}"
            ).add_to(route_group)
        route_group.add_to(mymap)

# --- Add layer controls ---
LayerControl(collapsed=False).add_to(mymap)

# --- Display map ---
mymap
