<a href="https://colab.research.google.com/github/DDDS18-GTFS/ddds.18.capstone/blob/dev.Andrew/ABQ_trial_RT_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

dev.Andrew text cell

# Capture the ABQ Data

In [None]:
#This Cell will capture the data to use for later analysis

import requests
import pandas as pd
import json
import time
import os
from datetime import datetime

# Albuquerque realtime feed endpoint; the fetch function below downloads this
# URL and parses the body as JSON.
GTFS_URL = "https://data.cabq.gov/transit/realtime/route/allroutes.json"

# Run configuration for the collection loop further down in this cell.
NUM_SNAPSHOTS = 10           # Number of polls; 10 polls spaced 30 s apart span roughly 5 minutes
SLEEP_SECONDS = 30           # Pause between consecutive polls, in seconds
# Tag derived from the local wall-clock time at which this cell starts; it is
# embedded in the output directory name and in the output file names.
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M")
OUTPUT_DIR = f"/mnt/data/cabq_snapshots_{RUN_TAG}"
# Create the output directory up front; exist_ok makes re-running within the
# same minute harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"🚍 Starting Albuquerque data collection ({NUM_SNAPSHOTS} snapshots)...")

# 📸 Define the fetch function
def fetch_cabq_snapshot_fixed(snapshot_id):
    """Fetch one snapshot of the Albuquerque realtime feed as a DataFrame.

    Downloads ``GTFS_URL``, parses the JSON body and produces one row per
    entry of its ``"allroutes"`` list. Every row is tagged with
    ``snapshot_id`` and with the UTC time at which the response was parsed.

    Args:
        snapshot_id: Identifier stored in the ``snapshot_id`` column of every
            returned row.

    Returns:
        pandas.DataFrame with the columns ``snapshot_id``,
        ``timestamp_collected``, ``vehicle_id``, ``latitude``, ``longitude``,
        ``heading``, ``speed_mph``, ``route_short_name``, ``trip_id``,
        ``next_stop_id``, ``next_stop_name`` and ``next_stop_sched_time``.
        Fields missing from a feed entry are stored as ``None``. The frame
        has zero rows (but the same columns) when the payload has no
        ``"allroutes"`` entries.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    # Local import: the file-level import only brings in ``datetime`` itself.
    from datetime import timezone

    # Keys copied verbatim from each feed entry, in output column order.
    vehicle_fields = (
        "vehicle_id",
        "latitude",
        "longitude",
        "heading",
        "speed_mph",
        "route_short_name",
        "trip_id",
        "next_stop_id",
        "next_stop_name",
        "next_stop_sched_time",
    )

    # Without a timeout a stalled connection would block the whole
    # collection run indefinitely.
    response = requests.get(GTFS_URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()

    # The body is decoded as ISO-8859-1 before parsing, as the original did.
    data = json.loads(response.content.decode("iso-8859-1"))

    # Naive UTC timestamp (same value and format as the deprecated
    # datetime.utcnow()), shared by every row of this snapshot.
    timestamp_collected = (
        datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    )

    records = [
        {
            "snapshot_id": snapshot_id,
            "timestamp_collected": timestamp_collected,
            **{field: vehicle.get(field) for field in vehicle_fields},
        }
        for vehicle in data.get("allroutes", [])
    ]

    # Passing the columns explicitly keeps the schema stable even when the
    # feed returns no vehicles.
    return pd.DataFrame(
        records,
        columns=["snapshot_id", "timestamp_collected", *vehicle_fields],
    )

# 📦 Collect and store all snapshots
# Polls the feed NUM_SNAPSHOTS times, SLEEP_SECONDS apart, keeps every
# successful snapshot, then writes the combined table to CSV and Parquet.
all_snapshots = []

for i in range(NUM_SNAPSHOTS):
    print(f"\n📸 Snapshot {i+1}/{NUM_SNAPSHOTS} at {datetime.now().strftime('%H:%M:%S')}")
    try:
        df_snapshot = fetch_cabq_snapshot_fixed(i + 1)
    except Exception as e:
        # Best-effort collection: report the failure and keep going so one
        # bad poll does not discard the snapshots already gathered.
        print(f"❌ Error during snapshot {i+1}: {e}")
    else:
        all_snapshots.append(df_snapshot)
        print(f"✅ Collected {len(df_snapshot)} vehicles")

    # Pause after every attempt, successful or not, so a failing endpoint is
    # not hit repeatedly with no delay. No pause is needed after the last one.
    if i < NUM_SNAPSHOTS - 1:
        time.sleep(SLEEP_SECONDS)

# pd.concat rejects an empty list with an opaque message; say what happened.
if not all_snapshots:
    raise ValueError(
        f"No snapshots were collected: all {NUM_SNAPSHOTS} attempts failed"
    )

# 🧮 Combine all snapshots
df_all = pd.concat(all_snapshots, ignore_index=True)

# 💾 Save to CSV and Parquet
csv_path = os.path.join(OUTPUT_DIR, f"cabq_gtfs_snapshots_{RUN_TAG}.csv")
parquet_path = os.path.join(OUTPUT_DIR, f"cabq_gtfs_snapshots_{RUN_TAG}.parquet")

df_all.to_csv(csv_path, index=False)
df_all.to_parquet(parquet_path, index=False)

print(f"\n✅ Saved {len(df_all)} total vehicle records")
print(f"📄 CSV: {csv_path}")
print(f"📦 Parquet: {parquet_path}")


🚍 Starting Albuquerque data collection (40 snapshots)...

📸 Snapshot 1/40 at 01:26:49
✅ Collected 339 vehicles

📸 Snapshot 2/40 at 01:27:19
✅ Collected 339 vehicles

📸 Snapshot 3/40 at 01:27:49
✅ Collected 339 vehicles

📸 Snapshot 4/40 at 01:28:19
✅ Collected 339 vehicles

📸 Snapshot 5/40 at 01:28:49
✅ Collected 339 vehicles

📸 Snapshot 6/40 at 01:29:19
✅ Collected 339 vehicles

📸 Snapshot 7/40 at 01:29:50
✅ Collected 339 vehicles

📸 Snapshot 8/40 at 01:30:20
✅ Collected 339 vehicles

📸 Snapshot 9/40 at 01:30:50
✅ Collected 339 vehicles

📸 Snapshot 10/40 at 01:31:20
✅ Collected 339 vehicles

📸 Snapshot 11/40 at 01:31:50
✅ Collected 339 vehicles

📸 Snapshot 12/40 at 01:32:21
✅ Collected 339 vehicles

📸 Snapshot 13/40 at 01:32:51
✅ Collected 339 vehicles

📸 Snapshot 14/40 at 01:33:21
✅ Collected 339 vehicles

📸 Snapshot 15/40 at 01:33:51
✅ Collected 339 vehicles

📸 Snapshot 16/40 at 01:34:21
✅ Collected 339 vehicles

📸 Snapshot 17/40 at 01:34:51
✅ Collected 339 vehicles

📸 Snapshot 18/40

In [None]:
# Quick sanity checks on the collected data. A notebook cell only renders the
# value of its final expression, so the first two summaries are printed
# explicitly; otherwise their results would be silently discarded.
print(df_all.head())
print("Unique vehicles:", df_all['vehicle_id'].nunique())
# Rows per snapshot (rendered as the cell output).
df_all.groupby('snapshot_id').size()


Unnamed: 0_level_0,0
snapshot_id,Unnamed: 1_level_1
1,339
2,339
3,339
4,339
5,339
6,339
7,339
8,339
9,339
10,339


# Tuning the Event Info

Some quick calculations to try to capture relevant events
- A relevant event is defined as a greater-than-average distance and time between stops; a movement from one stop to another is used as a proxy for "relevant".
*******
The average speed of Albuquerque buses based on a sampled 5-minute GTFS snapshot is approximately:
- 6.37 km/h (kilometers per hour)
- 3.96 mph (miles per hour)
- These speeds are consistent with low-traffic stop-and-go urban transit patterns during short observation windows.
******
While the exact average distance between all bus stops in Albuquerque isn't immediately available, general guidelines and related statistics provide some insight:
- General Spacing Recommendations: Transit and bus stop design guidelines suggest an average stop spacing of 1,300 feet for local fixed routes in areas with higher population and employment density.
- Research Findings: Studies of bus stop spacing in the United States have found the overall mean spacing to be 313 meters, which is approximately 1027 feet. Another source mentions that buses often stop as frequently as every one or two blocks, with an average of 805 feet [245 m] between stops.
- ABQ Specific Context: ABQ Ride is currently undergoing a network plan review to ensure transit services reflect community priorities, including balancing ridership and coverage needs.
*******
Average time between ABQ Ride bus stops
- While there isn't a readily available precise average time between each individual bus stop on every route in Albuquerque, some information can help understand the general spacing and frequency of ABQ Ride buses:
- Frequency Varies: ABQ Ride offers several types of routes, including Rapid Ride (now effectively ARTx), regular, commuter, and BRT (ART). The frequency of buses varies significantly depending on the specific route and time of day, ranging from as frequent as every 15 minutes to as infrequent as once per hour.
- Network Plan: The City of Albuquerque's ABQ Ride Forward Network Plan indicates that route frequencies in the proposed recovery network range from 15 minutes (red routes) to 60 minutes (light blue routes).
- ART/BRT Faster: The ART (Albuquerque Rapid Transit) system, running along Central Avenue, is known for being relatively fast and reliable.
- Potential Delays: It's important to be aware that even with scheduled frequencies, buses can experience delays due to factors like traffic, rider numbers, and potential issues with the bus itself. Some users have reported significant delays, especially outside of peak hours or with routes that run less frequently.


In [None]:
# ---------------------
# Threshold parameters
# ---------------------
# Tunable constants used by the jump / disappearance analysis cells below.

# Distance (in degrees) that counts as a GPS "jump" (this is greater than the average stop distance).
# It is compared against the straight Euclidean distance between consecutive
# latitude/longitude pairs of the same vehicle, so the unit is degrees.
# NOTE(review): the metre figure is a rough conversion; a degree of longitude
# covers less ground than a degree of latitude away from the equator, so the
# effective distance depends on the direction of travel — confirm acceptable.
JUMP_DISTANCE_THRESHOLD = 0.005  # ≈ 500 meters

# Time gap (in seconds) between consecutive observations of the same vehicle
# that counts as a disappearance
DISAPPEARANCE_TIME_THRESHOLD = 300  # 5 minutes

# Only show vehicles with at least this many jumps
MIN_JUMP_COUNT_PER_VEHICLE = 1

# Number of vehicles to sample for map clarity (drawn at random, without
# replacement, from the vehicles that have enough observations)
NUM_VEHICLES_TO_SAMPLE = 200


## STEP 1: LOAD & PREPARE DATA

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

# Load the snapshot CSV written by the collection cell, parse the collection
# timestamps, and order rows per vehicle chronologically so that per-vehicle
# diffs in later cells compare consecutive observations.
RUN_TAG = "20250721_0126"
csv_path = f"/content/cabq_gtfs_snapshots_{RUN_TAG}.csv"
df_all = pd.read_csv(csv_path)
df_all["timestamp_collected"] = pd.to_datetime(df_all["timestamp_collected"])
df_all.sort_values(by=["vehicle_id", "timestamp_collected"], inplace=True)

# Filter for vehicles with at least 3 data points
vehicle_counts = df_all["vehicle_id"].value_counts()
vehicles_ok = vehicle_counts[vehicle_counts >= 3].index

# Sample a manageable number of vehicles. Sampling without replacement fails
# if more vehicles are requested than are available, so cap the sample size
# at the number of qualifying vehicles.
sample_size = min(NUM_VEHICLES_TO_SAMPLE, len(vehicles_ok))
sample_vehicles = np.random.choice(vehicles_ok, size=sample_size, replace=False)
df_sample = df_all[df_all["vehicle_id"].isin(sample_vehicles)].copy()

# 🧾 Check data
print("✅ Loaded GTFS snapshot data")
print("📊 Shape:", df_all.shape)
print("🕒 Timestamp range:", df_all['timestamp_collected'].min(), "to", df_all['timestamp_collected'].max())
df_all.head()

✅ Loaded GTFS snapshot data
📊 Shape: (13560, 12)
🕒 Timestamp range: 2025-07-21 01:26:49.211366 to 2025-07-21 01:46:26.391846


Unnamed: 0,snapshot_id,timestamp_collected,vehicle_id,latitude,longitude,heading,speed_mph,route_short_name,trip_id,next_stop_id,next_stop_name,next_stop_sched_time
17,1,2025-07-21 01:26:49.211366,351,35.089151,-106.734347,313.0,0.0,Off Duty,0,0,No Data,14:46:07
356,2,2025-07-21 01:27:19.385662,351,35.089151,-106.734347,313.0,0.0,Off Duty,0,0,No Data,14:46:07
695,3,2025-07-21 01:27:49.564446,351,35.089151,-106.734347,313.0,0.0,Off Duty,0,0,No Data,14:46:07
1034,4,2025-07-21 01:28:19.745104,351,35.089151,-106.734347,313.0,0.0,Off Duty,0,0,No Data,14:46:07
1372,5,2025-07-21 01:28:49.929879,351,35.089151,-106.734347,313.0,0.0,Off Duty,0,0,No Data,14:46:07


## STEP 2: CALCULATE JUMPS & TIME GAPS

In [None]:
# Per-vehicle change between each observation and the one before it.
# The first observation of every vehicle has no predecessor, so its deltas
# are NaN and it is never flagged below.
by_vehicle = df_sample.groupby("vehicle_id")
position_step = by_vehicle[["latitude", "longitude"]].diff()
seconds_step = by_vehicle["timestamp_collected"].diff().dt.total_seconds()

df_sample["lat_diff"] = position_step["latitude"]
df_sample["lon_diff"] = position_step["longitude"]
# Straight-line distance in degrees between consecutive positions.
df_sample["jump_dist"] = (
    df_sample["lat_diff"].pow(2) + df_sample["lon_diff"].pow(2)
).pow(0.5)

df_sample["time_diff"] = seconds_step

# Flag the rows whose step exceeds the configured thresholds.
df_sample["is_jump"] = df_sample["jump_dist"].gt(JUMP_DISTANCE_THRESHOLD)
df_sample["is_disappearance"] = df_sample["time_diff"].gt(DISAPPEARANCE_TIME_THRESHOLD)


## STEP 3: FILTER & ORGANIZE ANOMALIES

In [None]:
# Get jump rows and enrich with previous position.
# The shifted columns are computed on the full sample and assigned by index,
# so each jump row receives the position of that vehicle's preceding row.
jumps_df = df_sample[df_sample["is_jump"]].copy()
jumps_df["lat_prev"] = df_sample.groupby("vehicle_id")["latitude"].shift()
jumps_df["lon_prev"] = df_sample.groupby("vehicle_id")["longitude"].shift()

# Filter for vehicles with sufficient jumps
jump_counts = jumps_df["vehicle_id"].value_counts()
keep_jumpers = jump_counts[jump_counts >= MIN_JUMP_COUNT_PER_VEHICLE].index
jumps_df = jumps_df[jumps_df["vehicle_id"].isin(keep_jumpers)]

# Recalculate filtered set for map (only vehicles that passed the jump filter)
df_sample = df_sample[df_sample["vehicle_id"].isin(keep_jumpers)].copy()

# Disappearances and reappearances.
# "is_disappearance" is set on the row that FOLLOWS a long gap (time_diff is
# current minus previous), so the flagged row is where the vehicle came back.
# The place it disappeared from is the row just before it for the same
# vehicle. The look-ahead is done per vehicle so a vehicle's last row can
# never pick up the flag of the next vehicle's first row.
next_row_flagged = (
    df_sample.groupby("vehicle_id")["is_disappearance"]
    .shift(-1, fill_value=False)
    .astype(bool)
)
disappear_df = df_sample[next_row_flagged].copy()
reappear_df = df_sample[df_sample["is_disappearance"]].copy()


## STEP 4: PLOT WITH FOLIUM

In [None]:
import folium

# Base map on which all jump and disappearance events are drawn.
m = folium.Map(location=[35.0844, -106.6504], zoom_start=12)


def _add_dot(target_map, position, colour, label):
    """Add a small filled circle marker with a tooltip to ``target_map``."""
    folium.CircleMarker(
        location=position,
        radius=4, color=colour, fill=True, fill_opacity=0.9,
        tooltip=label,
    ).add_to(target_map)


def _add_icon_markers(target_map, events, colour, glyph, suffix):
    """Add one Font Awesome icon marker per row of ``events`` to ``target_map``."""
    for _, event in events.iterrows():
        folium.Marker(
            location=[event["latitude"], event["longitude"]],
            icon=folium.Icon(color=colour, icon=glyph, prefix="fa"),
            tooltip=f"{event['vehicle_id']} {suffix}",
        ).add_to(target_map)


# -- Jumps: orange segment from the previous position (blue dot) to the
#    current position (purple dot) --
for _, jump in jumps_df.iterrows():
    vehicle = jump["vehicle_id"]
    origin = [jump["lat_prev"], jump["lon_prev"]]
    destination = [jump["latitude"], jump["longitude"]]

    folium.PolyLine(
        locations=[origin, destination],
        color="orange", weight=2,
        tooltip=f"{vehicle} jump",
    ).add_to(m)
    _add_dot(m, origin, "blue", f"{vehicle}_start")
    _add_dot(m, destination, "purple", f"{vehicle}_end")

# -- Disappearances (red X) --
_add_icon_markers(m, disappear_df, "red", "times-circle", "disappeared")

# -- Reappearances (green check) --
_add_icon_markers(m, reappear_df, "green", "check-circle", "reappeared")

# Final expression so the notebook renders the map.
m
