In [25]:
%load_ext autoreload
%autoreload 2
import os 
os.chdir("/Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance")
from src.s3 import list_files_in_bucket, filter_files_by_pattern, read_parquet_from_s3, load_all_parquet_files
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from src.gtfs_segments import GTFS_shape_processor
from src.speeds import BusSpeedCalculator
from src.api import parse_zipped_gtfs
import geopandas as gpd
from src.api import query_feed_data, get_access_token
ACCESS_TOKEN = get_access_token()
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from collections import defaultdict
from shapely.geometry import Point, Polygon
from datetime import datetime
import pytz
import contextily as ctx
from typing import List


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preparation: Match Feed and Date

- Bronx Bus: mdb-528
- Queens Bus: mdb-520
- Brooklyn Bus: mdb-512
- NYC Bus Company: mdb-510
- Manhattan Bus: mdb-513
- New Jersey Transit (NJ Transit): mdb-508

- Williamsburg Bridge: B39 - BK?
- Lincoln Tunnel: SIM24 - NJ?
- Hugh Carey Tunnel: SIM4X - NJ?

s3://dataclinic-gtfs-rt/norm/bus-mta-vp/vehicles/

- start date: date=2024-10-04/
- consective date start: date=2024-11-15/



In [26]:
prefix = "norm/bus-mta-vp/vehicles/"
bucket = "dataclinic-gtfs-rt"

In [27]:
def fetch_all_static_feeds(mdb_id: str, ACCESS_TOKEN: str) -> pd.DataFrame:
    response = query_feed_data(mdb_id, ACCESS_TOKEN)
    if response is None:
        raise ValueError("No response for mdb_id: ", mdb_id)
    # Convert the response to a DataFrame
    feed_updates = pd.DataFrame(response)
    # Create 2 new cols: predicted_start_date, predicted_end_date
    # Use service_date_range_start if available, otherwise extract date from downloaded_at
    feed_updates["predicted_start_date"] = (
        feed_updates["service_date_range_start"].where(
            feed_updates["service_date_range_start"].notna(),
            feed_updates["downloaded_at"].str.extract(r"(\d{4}-\d{2}-\d{2})")[0]
        )
    )
    feed_updates["predicted_end_date"] = (
        feed_updates["service_date_range_end"].where(
            feed_updates["service_date_range_end"].notna(),
            feed_updates["predicted_start_date"].shift(-1).apply(
                lambda x: (pd.to_datetime(x) - pd.Timedelta(days=1)).strftime('%Y-%m-%d') if pd.notna(x) else None
            )
        )
    )
    return feed_updates   

def clean_feed_updates(feed_updates: pd.DataFrame) -> pd.DataFrame:
    # keep only: id, hosted_url, downloaded_at, service_date_range_start, service_date_range_end, predicted_start_date, predicted_end_date
    feed_updates = feed_updates[["id", "hosted_url", "downloaded_at", "service_date_range_start", "service_date_range_end", "predicted_start_date", "predicted_end_date"]]
    return feed_updates

def create_runner_command(feed_updates: pd.DataFrame, routes: str) -> str:
    """
    python runner.py \
    --start-date 2024-12-12 \
    --end-date 2025-01-04 \
    --feed-id mdb-513-202412120015 \
    --gtfs-url "https://files.mobilitydatabase.org/mdb-513/mdb-513-202412120015/mdb-513-202412120015.zip" \
    --routes M102,M50
    """
    for index, row in feed_updates.iterrows():
        command = f"""
        python runner.py \
        --start-date {row['predicted_start_date']} \
        --end-date {row['predicted_end_date']} \
        --feed-id {row['id']} \
        --gtfs-url "{row['hosted_url']}" \
        --routes {routes}
        """
        print(command)


## mdb-513

In [28]:
mdb_id = "mdb-513"
mdb_513_updates = clean_feed_updates(fetch_all_static_feeds(mdb_id, ACCESS_TOKEN=ACCESS_TOKEN))
mdb_513_updates

Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-513-202402080022,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-02-08T00:52:25.481924Z,,,2024-02-08,2024-03-31
1,mdb-513-202404010033,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-04-01T00:26:36.207481Z,,,2024-04-01,2024-06-30
2,mdb-513-202407010038,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-07-01T00:41:40.236140Z,,,2024-07-01,2024-08-28
3,mdb-513-202408290052,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-08-29T00:52:10.152009Z,,,2024-08-29,2024-09-08
4,mdb-513-202409090026,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-09-09T00:26:46.736595Z,,,2024-09-09,2024-08-30
5,mdb-513-202412120015,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-12-12T00:15:17.484870Z,2024-08-31,2025-01-04,2024-08-31,2025-01-04
6,mdb-513-202501020055,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-01-02T00:55:23.717771Z,2025-01-04,2025-03-29,2025-01-04,2025-03-29
7,mdb-513-202501230024,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-01-23T00:24:27.979522Z,2025-01-04,2025-03-29,2025-01-04,2025-03-29
8,mdb-513-202502170105,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-02-17T01:05:23.814135Z,2025-02-09,2025-03-29,2025-02-09,2025-03-29


In [32]:
# Fix date ranges based on the log data
mdb_513_updates.loc[4, 'predicted_end_date'] = "2024-12-11"

mdb_513_updates.loc[5, 'predicted_start_date'] = "2024-12-12" 
mdb_513_updates.loc[5, 'predicted_end_date'] = "2025-01-04"

mdb_513_updates.loc[6, 'predicted_start_date'] = "2025-01-05" 
mdb_513_updates.loc[6, 'predicted_end_date'] = "2025-01-23"

mdb_513_updates.loc[7, 'predicted_start_date'] = "2025-01-24"
mdb_513_updates.loc[7, 'predicted_end_date'] = "2025-02-08"



mdb_513_updates


Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-513-202402080022,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-02-08T00:52:25.481924Z,,,2024-02-08,2024-03-31
1,mdb-513-202404010033,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-04-01T00:26:36.207481Z,,,2024-04-01,2024-06-30
2,mdb-513-202407010038,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-07-01T00:41:40.236140Z,,,2024-07-01,2024-08-28
3,mdb-513-202408290052,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-08-29T00:52:10.152009Z,,,2024-08-29,2024-09-08
4,mdb-513-202409090026,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-09-09T00:26:46.736595Z,,,2024-09-09,2024-12-11
5,mdb-513-202412120015,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-12-12T00:15:17.484870Z,2024-08-31,2025-01-04,2024-12-12,2025-01-04
6,mdb-513-202501020055,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-01-02T00:55:23.717771Z,2025-01-04,2025-03-29,2025-01-05,2025-01-23
7,mdb-513-202501230024,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-01-23T00:24:27.979522Z,2025-01-04,2025-03-29,2025-01-24,2025-02-08
8,mdb-513-202502170105,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-02-17T01:05:23.814135Z,2025-02-09,2025-03-29,2025-02-09,2025-03-29


In [35]:
create_runner_command(mdb_513_updates, routes="M50,M102")


        python runner.py         --start-date 2024-02-08         --end-date 2024-03-31         --feed-id mdb-513-202402080022         --gtfs-url "https://files.mobilitydatabase.org/mdb-513/mdb-513-202402080022/mdb-513-202402080022.zip"         --routes M50,M102
        

        python runner.py         --start-date 2024-04-01         --end-date 2024-06-30         --feed-id mdb-513-202404010033         --gtfs-url "https://files.mobilitydatabase.org/mdb-513/mdb-513-202404010033/mdb-513-202404010033.zip"         --routes M50,M102
        

        python runner.py         --start-date 2024-07-01         --end-date 2024-08-28         --feed-id mdb-513-202407010038         --gtfs-url "https://files.mobilitydatabase.org/mdb-513/mdb-513-202407010038/mdb-513-202407010038.zip"         --routes M50,M102
        

        python runner.py         --start-date 2024-08-29         --end-date 2024-09-08         --feed-id mdb-513-202408290052         --gtfs-url "https://files.mobilitydatabase.org/md

## mdb-508 NJ Transit

In [34]:
mdb_id = "mdb-508" 
mdb_508_updates = clean_feed_updates(fetch_all_static_feeds(mdb_id, ACCESS_TOKEN=ACCESS_TOKEN))
mdb_508_updates


Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-508-202402080025,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-02-08T00:51:29.721648Z,,,2024-02-08,2024-02-11
1,mdb-508-202402120047,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-02-12T00:22:53.480770Z,,,2024-02-12,2024-03-17
2,mdb-508-202403180046,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-03-18T00:08:54.186011Z,,,2024-03-18,2024-03-31
3,mdb-508-202404010021,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-04-01T00:20:27.273623Z,,,2024-04-01,2024-04-07
4,mdb-508-202404080019,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-04-08T00:12:24.706146Z,,,2024-04-08,2024-04-14
5,mdb-508-202404150016,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-04-15T00:00:20.573690Z,,,2024-04-15,2024-05-05
6,mdb-508-202405060020,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-05-06T00:05:26.901876Z,,,2024-05-06,2024-05-19
7,mdb-508-202405200013,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-05-20T00:06:19.984109Z,,,2024-05-20,2024-06-09
8,mdb-508-202406100016,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-06-10T00:04:23.289997Z,,,2024-06-10,2024-07-07
9,mdb-508-202407080057,https://files.mobilitydatabase.org/mdb-508/mdb...,2024-07-08T00:14:06.622045Z,,,2024-07-08,2024-07-14


### Match

Row 15:
  id: mdb-508-202411280006
  service_date_range_start: 2024-11-24
  hosted_url: https://files.mobilitydatabase.org/mdb-508/mdb-508-202411280006/mdb-508-202411280006.zip
  2024-12-01, 2024-12-16

Row 16:
  id: mdb-508-202412190024
  service_date_range_start: 2024-12-17
  hosted_url: https://files.mobilitydatabase.org/mdb-508/mdb-508-202412190024/mdb-508-202412190024.zip
  2024-12-17, 2024-12-28

Row 17:
  id: mdb-508-202501020035
  service_date_range_start: 2024-12-29
  hosted_url: https://files.mobilitydatabase.org/mdb-508/mdb-508-202501020035/mdb-508-202501020035.zip
  2024-12-29, 2025-01-06

Row 18:
  id: mdb-508-202501130011
  service_date_range_start: 2025-01-07
  hosted_url: https://files.mobilitydatabase.org/mdb-508/mdb-508-202501130011/mdb-508-202501130011.zip
  2025-01-07, 

Row 19:
  id: mdb-508-202501200033
  service_date_range_start: 2025-01-16
  hosted_url: https://files.mobilitydatabase.org/mdb-508/mdb-508-202501200033/mdb-508-202501200033.zip

Row 20:
  id: mdb-508-202503130047
  service_date_range_start: 2025-03-11
  hosted_url: https://files.mobilitydatabase.org/mdb-508/mdb-508-202503130047/mdb-508-202503130047.zip


# Fetch Segment and GTFS dict

In [19]:
# Define url and date range
url = "https://files.mobilitydatabase.org/mdb-513/mdb-513-202501020055/mdb-513-202501020055.zip"
start = "2025-01-05"
end = "2025-01-23"
feed_id = "mdb-513-202501020055"

# Based on the correct url
segment_df = GTFS_shape_processor(url, 4326, 2263).process_shapes()
GTFS_dict = parse_zipped_gtfs(url)

# Preview segment and GTFS_dict
print("--- segment_df.columns ---")
print(segment_df.columns)
print("--- GTFS_dict.keys() ---")
print(GTFS_dict.keys())

Parsed GTFS static feed


  return lib.line_locate_point(line, other)


Parsed GTFS static feed
--- segment_df.columns ---
Index(['trip_id', 'shape_id', 'stop_sequence', 'stop_id', 'stop_name',
       'prev_stop_id', 'prev_stop_name', 'projected_position',
       'prev_projected_position', 'segment_length', 'geometry'],
      dtype='object')
--- GTFS_dict.keys() ---
dict_keys(['agency.txt', 'calendar.txt', 'calendar_dates.txt', 'routes.txt', 'shapes.txt', 'stops.txt', 'stop_times.txt', 'trips.txt'])


In [25]:
# Based on the correct url
segment_df = GTFS_shape_processor(url, 4326, 2263).process_shapes()
GTFS_dict = parse_zipped_gtfs(url)

# create folder for date range under data 
# data/raw-speed/start_to_end
if not os.path.exists(f'data/raw-speeds/{feed_id}'):
    os.makedirs(f'data/raw-speeds/{feed_id}')

# 1. save GTFS "stops.txt" df as Parquet
GTFS_dict["stops.txt"].to_parquet(f'data/raw-speeds/{feed_id}/stops.parquet')

# 2. save segment df as GEOJSON
segment_df.to_file(f'data/raw-speeds/{feed_id}/segments.geojson', driver='GeoJSON')

# 3. save speed df as PARQUET: bus_speeds_{date}.parquet
# for date in date ranges
# for routes: filter out selected routes

# Get date list from start and end date: List[str]
date_list = pd.date_range(start=start, end=end).strftime('%Y-%m-%d').tolist()
print("--- date range ---")
print(date_list)

# Define route list: List[str] 
route_list = ["M50"]
print("--- route list ---")
print(route_list)

Parsed GTFS static feed


  return lib.line_locate_point(line, other)


Parsed GTFS static feed
--- date range ---
['2025-01-05', '2025-01-06', '2025-01-07']
--- route list ---
['M50']
