In [8]:
%load_ext autoreload
%autoreload 2
import os 
os.chdir("/Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance")
from src.s3 import list_files_in_bucket, filter_files_by_pattern, read_parquet_from_s3, load_all_parquet_files
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from src.gtfs_segments import GTFS_shape_processor
from src.speeds import BusSpeedCalculator
from src.api import parse_zipped_gtfs
import geopandas as gpd
from src.api import query_feed_data, get_access_token
ACCESS_TOKEN = get_access_token()
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from collections import defaultdict
from shapely.geometry import Point, Polygon
from datetime import datetime
import pytz
import contextily as ctx
from typing import List


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preparation: Match Feed and Date

- Bronx Bus: mdb-528
- Queens Bus: mdb-520
- Brooklyn Bus: mdb-512
- NYC Bus Company: mdb-510
- Manhattan Bus: mdb-513
- New Jersey Transit (NJ Transit): mdb-508

- Williamsburg Bridge: B39 - BK?
- Lincoln Tunnel: SIM24 - NJ?
- Hugh Carey Tunnel: SIM4X - NJ?

s3://dataclinic-gtfs-rt/norm/bus-mta-vp/vehicles/

- start date: date=2024-10-04/
- consective date start: date=2024-11-15/



In [9]:
prefix = "norm/bus-mta-vp/vehicles/"
bucket = "dataclinic-gtfs-rt"

In [10]:
def fetch_all_static_feeds(mdb_id: str, ACCESS_TOKEN: str) -> pd.DataFrame:
    response = query_feed_data(mdb_id, ACCESS_TOKEN)
    if response is None:
        raise ValueError("No response for mdb_id: ", mdb_id)
    # Convert the response to a DataFrame
    feed_updates = pd.DataFrame(response)
    # Create 2 new cols: predicted_start_date, predicted_end_date
    # Use service_date_range_start if available, otherwise extract date from downloaded_at
    feed_updates["predicted_start_date"] = (
        feed_updates["service_date_range_start"].where(
            feed_updates["service_date_range_start"].notna(),
            feed_updates["downloaded_at"].str.extract(r"(\d{4}-\d{2}-\d{2})")[0]
        )
    )
    feed_updates["predicted_end_date"] = (
        feed_updates["service_date_range_end"].where(
            feed_updates["service_date_range_end"].notna(),
            feed_updates["predicted_start_date"].shift(-1).apply(
                lambda x: (pd.to_datetime(x) - pd.Timedelta(days=1)).strftime('%Y-%m-%d') if pd.notna(x) else None
            )
        )
    )
    return feed_updates   

def clean_feed_updates(feed_updates: pd.DataFrame) -> pd.DataFrame:
    # keep only: id, hosted_url, downloaded_at, service_date_range_start, service_date_range_end, predicted_start_date, predicted_end_date
    feed_updates = feed_updates[["id", "hosted_url", "downloaded_at", "service_date_range_start", "service_date_range_end", "predicted_start_date", "predicted_end_date"]]
    return feed_updates

def create_runner_command(feed_updates: pd.DataFrame, routes: str) -> str:
    """
    python runner.py \
    --start-date 2024-12-12 \
    --end-date 2025-01-04 \
    --feed-id mdb-513-202412120015 \
    --gtfs-url "https://files.mobilitydatabase.org/mdb-513/mdb-513-202412120015/mdb-513-202412120015.zip" \
    --routes M102,M50
    """
    for index, row in feed_updates.iterrows():
        command = f"""
        python runner.py \
        --start-date {row['predicted_start_date']} \
        --end-date {row['predicted_end_date']} \
        --feed-id {row['id']} \
        --gtfs-url "{row['hosted_url']}" \
        --routes {routes}
        """
        print(command)

def save_segment_for_feed(feed_updates: pd.DataFrame, selected_indices: List[int]) -> None:
    for index in selected_indices:
        row = feed_updates.iloc[index]
        url = row["hosted_url"]
        feed_id = row["id"]

        # Based on the correct url
        segment_df = GTFS_shape_processor(url, 4326, 2263).process_shapes()
        GTFS_dict = parse_zipped_gtfs(url)

        # data/raw-speed/start_to_end
        if not os.path.exists(f'data/raw-speeds/{feed_id}'):
            os.makedirs(f'data/raw-speeds/{feed_id}')

        # 1. save GTFS "stops.txt" df as Parquet
        # check existance
        if not os.path.exists(f'data/raw-speeds/{feed_id}/stops.parquet'):
            GTFS_dict["stops.txt"].to_parquet(f'data/raw-speeds/{feed_id}/stops.parquet')

        # 2. save segment df as GEOJSON
        if not os.path.exists(f'data/raw-speeds/{feed_id}/segments.geojson'):
            segment_df.to_file(f'data/raw-speeds/{feed_id}/segments.geojson', driver='GeoJSON')
    


## mdb-513: Manhattan

In [11]:
mdb_id = "mdb-513"
mdb_513_updates = clean_feed_updates(fetch_all_static_feeds(mdb_id, ACCESS_TOKEN=ACCESS_TOKEN))
mdb_513_updates

Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-513-202402080022,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-02-08T00:52:25.481924Z,,,2024-02-08,2024-03-31
1,mdb-513-202404010033,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-04-01T00:26:36.207481Z,,,2024-04-01,2024-06-30
2,mdb-513-202407010038,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-07-01T00:41:40.236140Z,,,2024-07-01,2024-08-28
3,mdb-513-202408290052,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-08-29T00:52:10.152009Z,,,2024-08-29,2024-09-08
4,mdb-513-202409090026,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-09-09T00:26:46.736595Z,,,2024-09-09,2024-08-30
5,mdb-513-202412120015,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-12-12T00:15:17.484870Z,2024-08-31,2025-01-04,2024-08-31,2025-01-04
6,mdb-513-202501020055,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-01-02T00:55:23.717771Z,2025-01-04,2025-03-29,2025-01-04,2025-03-29
7,mdb-513-202501230024,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-01-23T00:24:27.979522Z,2025-01-04,2025-03-29,2025-01-04,2025-03-29
8,mdb-513-202502170105,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-02-17T01:05:23.814135Z,2025-02-09,2025-03-29,2025-02-09,2025-03-29


In [12]:
# Fix date ranges based on the log data
mdb_513_updates.loc[4, 'predicted_end_date'] = "2024-12-11"

mdb_513_updates.loc[5, 'predicted_start_date'] = "2024-12-12" 
mdb_513_updates.loc[5, 'predicted_end_date'] = "2025-01-04"

mdb_513_updates.loc[6, 'predicted_start_date'] = "2025-01-05" 
mdb_513_updates.loc[6, 'predicted_end_date'] = "2025-01-23"

mdb_513_updates.loc[7, 'predicted_start_date'] = "2025-01-24"
mdb_513_updates.loc[7, 'predicted_end_date'] = "2025-02-08"



mdb_513_updates


Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-513-202402080022,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-02-08T00:52:25.481924Z,,,2024-02-08,2024-03-31
1,mdb-513-202404010033,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-04-01T00:26:36.207481Z,,,2024-04-01,2024-06-30
2,mdb-513-202407010038,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-07-01T00:41:40.236140Z,,,2024-07-01,2024-08-28
3,mdb-513-202408290052,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-08-29T00:52:10.152009Z,,,2024-08-29,2024-09-08
4,mdb-513-202409090026,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-09-09T00:26:46.736595Z,,,2024-09-09,2024-12-11
5,mdb-513-202412120015,https://files.mobilitydatabase.org/mdb-513/mdb...,2024-12-12T00:15:17.484870Z,2024-08-31,2025-01-04,2024-12-12,2025-01-04
6,mdb-513-202501020055,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-01-02T00:55:23.717771Z,2025-01-04,2025-03-29,2025-01-05,2025-01-23
7,mdb-513-202501230024,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-01-23T00:24:27.979522Z,2025-01-04,2025-03-29,2025-01-24,2025-02-08
8,mdb-513-202502170105,https://files.mobilitydatabase.org/mdb-513/mdb...,2025-02-17T01:05:23.814135Z,2025-02-09,2025-03-29,2025-02-09,2025-03-29


In [13]:
create_runner_command(mdb_513_updates, routes="M50,M102")
save_segment_for_feed(mdb_513_updates, selected_indices=[4, 5, 6, 7, 8])


        python runner.py         --start-date 2024-02-08         --end-date 2024-03-31         --feed-id mdb-513-202402080022         --gtfs-url "https://files.mobilitydatabase.org/mdb-513/mdb-513-202402080022/mdb-513-202402080022.zip"         --routes M50,M102
        

        python runner.py         --start-date 2024-04-01         --end-date 2024-06-30         --feed-id mdb-513-202404010033         --gtfs-url "https://files.mobilitydatabase.org/mdb-513/mdb-513-202404010033/mdb-513-202404010033.zip"         --routes M50,M102
        

        python runner.py         --start-date 2024-07-01         --end-date 2024-08-28         --feed-id mdb-513-202407010038         --gtfs-url "https://files.mobilitydatabase.org/mdb-513/mdb-513-202407010038/mdb-513-202407010038.zip"         --routes M50,M102
        

        python runner.py         --start-date 2024-08-29         --end-date 2024-09-08         --feed-id mdb-513-202408290052         --gtfs-url "https://files.mobilitydatabase.org/md

  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)


## mdb-512: BK

In [14]:
mdb_id = "mdb-512" 
mdb_512_updates = clean_feed_updates(fetch_all_static_feeds(mdb_id, ACCESS_TOKEN=ACCESS_TOKEN))
mdb_512_updates

Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-512-202402080014,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-02-08T00:52:21.166300Z,,,2024-02-08,2024-03-31
1,mdb-512-202404010042,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-04-01T00:21:46.528483Z,,,2024-04-01,2024-06-30
2,mdb-512-202407010036,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-07-01T00:18:40.331042Z,,,2024-07-01,2024-07-14
3,mdb-512-202407150005,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-07-15T00:01:12.393632Z,,,2024-07-15,2024-08-28
4,mdb-512-202408290005,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-08-29T00:05:11.530719Z,,,2024-08-29,2024-08-30
5,mdb-512-202412120015,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-12-12T00:15:43.516826Z,2024-08-31,2025-01-04,2024-08-31,2025-01-04
6,mdb-512-202501020103,https://files.mobilitydatabase.org/mdb-512/mdb...,2025-01-02T01:03:35.583486Z,2025-01-04,2025-03-29,2025-01-04,2025-03-29
7,mdb-512-202502170011,https://files.mobilitydatabase.org/mdb-512/mdb...,2025-02-17T00:11:46.201569Z,2025-02-09,2025-03-29,2025-02-09,2025-03-29


In [15]:
mdb_512_updates.loc[6, 'predicted_end_date'] = "2025-02-08"

mdb_512_updates

Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-512-202402080014,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-02-08T00:52:21.166300Z,,,2024-02-08,2024-03-31
1,mdb-512-202404010042,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-04-01T00:21:46.528483Z,,,2024-04-01,2024-06-30
2,mdb-512-202407010036,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-07-01T00:18:40.331042Z,,,2024-07-01,2024-07-14
3,mdb-512-202407150005,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-07-15T00:01:12.393632Z,,,2024-07-15,2024-08-28
4,mdb-512-202408290005,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-08-29T00:05:11.530719Z,,,2024-08-29,2024-08-30
5,mdb-512-202412120015,https://files.mobilitydatabase.org/mdb-512/mdb...,2024-12-12T00:15:43.516826Z,2024-08-31,2025-01-04,2024-08-31,2025-01-04
6,mdb-512-202501020103,https://files.mobilitydatabase.org/mdb-512/mdb...,2025-01-02T01:03:35.583486Z,2025-01-04,2025-03-29,2025-01-04,2025-02-08
7,mdb-512-202502170011,https://files.mobilitydatabase.org/mdb-512/mdb...,2025-02-17T00:11:46.201569Z,2025-02-09,2025-03-29,2025-02-09,2025-03-29


In [17]:
create_runner_command(mdb_512_updates, routes="B39")
save_segment_for_feed(mdb_512_updates, selected_indices=[4, 5, 6, 7])


        python runner.py         --start-date 2024-02-08         --end-date 2024-03-31         --feed-id mdb-512-202402080014         --gtfs-url "https://files.mobilitydatabase.org/mdb-512/mdb-512-202402080014/mdb-512-202402080014.zip"         --routes B39
        

        python runner.py         --start-date 2024-04-01         --end-date 2024-06-30         --feed-id mdb-512-202404010042         --gtfs-url "https://files.mobilitydatabase.org/mdb-512/mdb-512-202404010042/mdb-512-202404010042.zip"         --routes B39
        

        python runner.py         --start-date 2024-07-01         --end-date 2024-07-14         --feed-id mdb-512-202407010036         --gtfs-url "https://files.mobilitydatabase.org/mdb-512/mdb-512-202407010036/mdb-512-202407010036.zip"         --routes B39
        

        python runner.py         --start-date 2024-07-15         --end-date 2024-08-28         --feed-id mdb-512-202407150005         --gtfs-url "https://files.mobilitydatabase.org/mdb-512/mdb-512-2

  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)


## mdb514: Staten Island

In [18]:
mdb_id = "mdb-514" 
mdb_514_updates = clean_feed_updates(fetch_all_static_feeds(mdb_id, ACCESS_TOKEN=ACCESS_TOKEN))
mdb_514_updates

Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-514-202402080029,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-02-08T00:51:31.713951Z,,,2024-02-08,2024-03-31
1,mdb-514-202404010001,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-04-01T00:21:04.294474Z,,,2024-04-01,2024-07-07
2,mdb-514-202407080023,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-07-08T00:06:24.815121Z,,,2024-07-08,2024-08-28
3,mdb-514-202408290048,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-08-29T00:48:56.501333Z,,,2024-08-29,2024-08-31
4,mdb-514-202412120006,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-12-12T00:06:05.085862Z,2024-09-01,2025-01-04,2024-09-01,2025-01-04
5,mdb-514-202501020130,https://files.mobilitydatabase.org/mdb-514/mdb...,2025-01-02T01:30:07.125612Z,2025-01-05,2025-03-29,2025-01-05,2025-03-29
6,mdb-514-202502170029,https://files.mobilitydatabase.org/mdb-514/mdb...,2025-02-17T00:29:23.644990Z,2025-02-10,2025-03-29,2025-02-10,2025-03-29


In [19]:
mdb_514_updates.loc[5, 'predicted_end_date'] = "2025-02-09"

mdb_514_updates

Unnamed: 0,id,hosted_url,downloaded_at,service_date_range_start,service_date_range_end,predicted_start_date,predicted_end_date
0,mdb-514-202402080029,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-02-08T00:51:31.713951Z,,,2024-02-08,2024-03-31
1,mdb-514-202404010001,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-04-01T00:21:04.294474Z,,,2024-04-01,2024-07-07
2,mdb-514-202407080023,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-07-08T00:06:24.815121Z,,,2024-07-08,2024-08-28
3,mdb-514-202408290048,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-08-29T00:48:56.501333Z,,,2024-08-29,2024-08-31
4,mdb-514-202412120006,https://files.mobilitydatabase.org/mdb-514/mdb...,2024-12-12T00:06:05.085862Z,2024-09-01,2025-01-04,2024-09-01,2025-01-04
5,mdb-514-202501020130,https://files.mobilitydatabase.org/mdb-514/mdb...,2025-01-02T01:30:07.125612Z,2025-01-05,2025-03-29,2025-01-05,2025-02-09
6,mdb-514-202502170029,https://files.mobilitydatabase.org/mdb-514/mdb...,2025-02-17T00:29:23.644990Z,2025-02-10,2025-03-29,2025-02-10,2025-03-29


In [20]:
create_runner_command(mdb_514_updates, routes="SIM24,SIM4X")
save_segment_for_feed(mdb_514_updates, selected_indices=[4, 5, 6])


        python runner.py         --start-date 2024-02-08         --end-date 2024-03-31         --feed-id mdb-514-202402080029         --gtfs-url "https://files.mobilitydatabase.org/mdb-514/mdb-514-202402080029/mdb-514-202402080029.zip"         --routes SIM24,SIM4X
        

        python runner.py         --start-date 2024-04-01         --end-date 2024-07-07         --feed-id mdb-514-202404010001         --gtfs-url "https://files.mobilitydatabase.org/mdb-514/mdb-514-202404010001/mdb-514-202404010001.zip"         --routes SIM24,SIM4X
        

        python runner.py         --start-date 2024-07-08         --end-date 2024-08-28         --feed-id mdb-514-202407080023         --gtfs-url "https://files.mobilitydatabase.org/mdb-514/mdb-514-202407080023/mdb-514-202407080023.zip"         --routes SIM24,SIM4X
        

        python runner.py         --start-date 2024-08-29         --end-date 2024-08-31         --feed-id mdb-514-202408290048         --gtfs-url "https://files.mobilitydataba

  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
