In [1]:
import csv
import json
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
import requests
import yaml

from src.tfl_client import *

# Gets all lines and their valid routes for given modes, including the name and id of the originating and terminating stops for each route
# GET https://api.tfl.gov.uk/Line/Mode/bus/Route

# Get disruptions for all lines of the given modes.
# GET https://api.tfl.gov.uk/Line/Mode/bus/Disruption

# Get the list of arrival predictions for given line ids based at the given stop
# GET https://api.tfl.gov.uk/Line/{lineId}/Arrivals

def load_schema(path: "str | os.PathLike") -> dict:
    """Load a YAML schema definition from disk.

    Args:
        path: Location of the YAML file, as a string or a ``pathlib.Path``.

    Returns:
        The parsed top-level YAML mapping.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the document is empty or its top level is not a mapping.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() uses the
    # platform's locale encoding and the result varies between machines.
    with open(path, "r", encoding="utf-8") as f:
        schema = yaml.safe_load(f)

    # An empty file parses to None; fail here with a clear message instead of
    # a TypeError later when the caller indexes the schema.
    if not isinstance(schema, dict):
        raise ValueError(
            f"Schema file {path} must contain a YAML mapping, got {type(schema).__name__}"
        )
    return schema

def load_config(path: "str | os.PathLike") -> dict:
    """Load a JSON configuration file from disk.

    Args:
        path: Location of the JSON file, as a string or a ``pathlib.Path``.

    Returns:
        The parsed top-level JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
        ValueError: If the top-level JSON value is not an object.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() uses the
    # platform's locale encoding and the result varies between machines.
    with open(path, "r", encoding="utf-8") as f:
        config = json.load(f)

    # Callers index the result by key, so reject lists/scalars up front.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {path} must contain a JSON object, got {type(config).__name__}"
        )
    return config

def load_secrets():
    """Read the TfL API credentials from the environment.

    Returns:
        A ``(app_id, app_key)`` tuple taken from the ``TFL_APP_ID`` and
        ``TFL_APP_KEY`` environment variables.

    Raises:
        RuntimeError: If either variable is unset or empty.
    """
    credentials = tuple(
        os.environ.get(variable) for variable in ("TFL_APP_ID", "TFL_APP_KEY")
    )
    # Both values must be present and non-empty.
    if all(credentials):
        return credentials
    raise RuntimeError("Missing TfL credentials: set TFL_APP_ID and TFL_APP_KEY")

# Notebook-wide logging: INFO and above, prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Resolve the API credentials once; load_secrets() raises if they are missing,
# so the notebook stops here rather than at the first API call.
app_id, app_key = load_secrets()

In [2]:
# Preprocess the Line routes data: write one CSV row per route section of
# every line whose id is purely numeric, under a dt=<UTC date> partition.
# datetime.utcnow() is deprecated; an aware "now" formats to the same date.
snapshot_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
OUTPUT_DIR = Path(f"data/reference/line_routes_snapshot/dt={snapshot_date}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_FILE = OUTPUT_DIR / "line_routes.csv"

# Load config and schema (`config` is read again by later cells).
config = load_config(Path("config/routes.json"))
schema = load_schema(Path("schemas/reference/line_routes.yaml"))

# Fetch data using the client
data = get_line_routes(app_id=app_id, app_key=app_key)

rows = []
target_id = set()  # ids of every line that passed the numeric-id filter

for line in data:
    line_id = line.get("id")
    # Skip lines whose id is missing or contains anything other than digits.
    if not isinstance(line_id, str) or not line_id.isdigit():
        continue

    target_id.add(line_id)
    mode = line.get("modeName")

    # `or []` also covers a present-but-null "routeSections" value.
    for rs in line.get("routeSections") or []:
        # Values are passed through unchanged: a missing field stays None and
        # is written as an empty CSV cell instead of the literal text "None".
        rows.append({
            "line_id": line_id,
            "mode": mode,
            "direction": rs.get("direction"),
            "origination_name": rs.get("originationName"),
            "destination_name": rs.get("destinationName"),
        })

# --- schema validation ---
# Every row must carry all columns the schema marks as required.
required = set(schema["required_columns"])
for i, row in enumerate(rows):
    missing = required - row.keys()
    if missing:
        raise ValueError(f"Row {i} missing fields: {missing}")

output_columns = list(schema["output_columns"].keys())

# DictWriter raises ValueError if a row has a key that is not in
# `output_columns`, so schema drift is caught at write time.
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=output_columns)
    writer.writeheader()
    writer.writerows(rows)

logger.info("Saved %d rows to %s", len(rows), OUTPUT_FILE)

2026-01-13 04:04:20,565 - INFO - Saved 1026 rows to data/reference/line_routes_snapshot/dt=2026-01-13/line_routes.csv


In [3]:
# Preprocess the ordered stop ids of the configured lines: one row per
# (line, direction, stop), written as CSV and Parquet under dt=<UTC date>.
schema = load_schema("schemas/reference/line_stop.yaml")

# datetime.utcnow() is deprecated; an aware "now" formats to the same date.
snapshot_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
OUTPUT_DIR = Path(f"data/reference/line_stop_snapshot/dt={snapshot_date}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
csv_path = OUTPUT_DIR / "line_stop.csv"
parquet_path = OUTPUT_DIR / "line_stop.parquet"

rows = []
line_ids = config["line_ids"]
logger.info("Loading stops for lines: %s", line_ids)

for line in line_ids:
    inbound, outbound = get_stops_sequence(id=line, app_id=app_id, app_key=app_key)

    # Both directions are processed identically, inbound first.
    for direction, sequence in (("inbound", inbound), ("outbound", outbound)):
        routes = sequence.get("orderedLineRoutes", [])
        if not routes:
            continue
        # Only the first ordered route is used; stop_sequence is the
        # zero-based position of the stop within that route.
        for seq, stop_id in enumerate(routes[0].get("naptanIds", [])):
            rows.append({
                "snapshot_date": snapshot_date,
                "line_id": line,
                "direction": direction,
                "stop_id": stop_id,
                "stop_sequence": seq,
            })

# --- schema validation ---
# Every row must carry all columns the schema marks as required.
required = set(schema["required_columns"])
for i, row in enumerate(rows):
    missing = required - row.keys()
    if missing:
        raise ValueError(f"Row {i} missing fields: {missing}")

output_columns = list(schema["output_columns"].keys())
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=output_columns)
    writer.writeheader()
    writer.writerows(rows)

# Save in Parquet
# Passing `columns=` keeps the frame well-formed when `rows` is empty;
# selecting columns from a column-less frame would raise KeyError.
df = pd.DataFrame(rows, columns=output_columns)
df.to_parquet(parquet_path, engine="pyarrow")

logger.info("Saved %d rows to %s and %s", len(rows), csv_path, parquet_path)

2026-01-13 04:04:22,346 - INFO - Loading stops for lines: ['12', '34']
2026-01-13 04:04:24,835 - INFO - Saved 180 rows to data/reference/line_stop_snapshot/dt=2026-01-13/line_stop.csv and data/reference/line_stop_snapshot/dt=2026-01-13/line_stop.parquet


In [4]:
# Get the Timetable for every stop on a list of lines.
# Reads today's line_stop snapshot, fetches one timetable per (line, stop)
# pair, and writes the result as Parquet plus a 300-row CSV sample.
# datetime.utcnow() is deprecated; an aware "now" formats to the same date.
snapshot_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
line_stop_DIR = Path(f"data/reference/line_stop_snapshot/dt={snapshot_date}")
line_stops = pd.read_parquet(line_stop_DIR / "line_stop.parquet")
OUTPUT_DIR = Path(f"data/reference/stop_timetable_snapshot/dt={snapshot_date}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
parquet_path = OUTPUT_DIR / "stop_timetable.parquet"

schema = load_schema(Path("schemas/reference/stop_timetable.yaml"))

# One request per (line, stop): a stop that appears more than once for a line
# keeps the stop_sequence of its first occurrence in the snapshot.
line_stop_pairs = (
    line_stops[["line_id", "stop_id", "stop_sequence"]]
    .drop_duplicates(subset=["line_id", "stop_id"])
    .sort_values(["line_id", "stop_sequence"], kind="stable")
)

total_pairs = len(line_stop_pairs)
logger.info("Starting timetable ingestion for %d line-stop pairs", total_pairs)
processed = 0
success = 0
skipped = 0
failed = 0
LOG_EVERY = 50

timetable_rows = []

for line_id, stop_id, stop_sequence in line_stop_pairs.itertuples(index=False):
    processed += 1

    try:
        timetable_json = get_timetable(
            id=line_id,
            stop_id=stop_id,
            app_id=app_id,
            app_key=app_key
        )

        # No `continue` here: it would jump over the progress log below, so a
        # skipped pair landing on a reporting boundary (including the very
        # last pair) would never be reported.
        if timetable_json is None:
            skipped += 1
        else:
            timetable_rows.extend(
                extract_timetable_rows(
                    timetable_json=timetable_json,
                    snapshot_date=snapshot_date,
                    stop_sequence=stop_sequence
                )
            )
            success += 1

    except Exception as e:
        # Best effort: one failing pair must not abort the whole run.
        failed += 1
        logger.error("Failed line=%s, stop=%s: %s", line_id, stop_id, e)

    if processed % LOG_EVERY == 0 or processed == total_pairs:
        logger.info(
            "Progress: %d/%d processed | success=%d | skipped=%d | failed=%d",
            processed,
            total_pairs,
            success,
            skipped,
            failed
        )

df = pd.DataFrame(timetable_rows)

# --- schema validation ---
# Also fires when nothing was extracted, since the frame then has no columns.
required = set(schema["required_columns"])
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

output_columns = list(schema["output_columns"].keys())
df = df[output_columns]
df.to_parquet(parquet_path, engine="pyarrow")

# Small CSV extract for eyeballing the data without a Parquet reader.
df.head(300).to_csv(OUTPUT_DIR / "stop_timetable_sample.csv", index=False)
logger.info("Wrote %d rows to %s", len(df), parquet_path)
logger.info("Sample written to stop_timetable_sample.csv")

2026-01-13 04:04:33,565 - INFO - Starting timetable ingestion for 180 line-stop pairs
2026-01-13 04:04:53,332 - INFO - Progress: 50/180 processed | success=50 | skipped=0 | failed=0
2026-01-13 04:05:12,536 - INFO - Progress: 100/180 processed | success=98 | skipped=2 | failed=0
2026-01-13 04:05:30,801 - INFO - Progress: 150/180 processed | success=148 | skipped=2 | failed=0
2026-01-13 04:05:41,392 - INFO - Wrote 65399 rows to data/reference/stop_timetable_snapshot/dt=2026-01-13/stop_timetable.parquet
2026-01-13 04:05:41,393 - INFO - Sample written to stop_timetable_sample.csv


In [None]:
# Snapshot the current arrival predictions for the configured lines into
# Artifacts/predictions/dt=<UTC date>/hour=<HH>_minute=<MM>.csv.
# `datetime` and `Path` come from the import cell at the top of the notebook.
# datetime.utcnow() is deprecated; use an aware UTC timestamp instead.
run_ts = datetime.now(timezone.utc)
date_part = run_ts.strftime("%Y-%m-%d")
hour_part = run_ts.strftime("%H")
minute_part = run_ts.strftime("%M")
# Drop the tzinfo before formatting so the text has no "+00:00" suffix and
# matches the format of files written by earlier runs.
prediction_time = run_ts.replace(tzinfo=None).isoformat()

output_dir = Path(f"Artifacts/predictions/dt={date_part}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / f"hour={hour_part}_minute={minute_part}.csv"

# Take an example target
example_id = config["line_ids"]
logger.info("example_id: %s", example_id)
data = get_arrivals(ids=example_id, app_id=app_id, app_key=app_key)

rows = []

for vehicle in data:
    rows.append({
        # partition / snapshot metadata
        "prediction_time": prediction_time,

        # core identifiers
        "vehicle_id": vehicle.get("vehicleId"),
        "line_id": vehicle.get("lineId"),

        # stop / location
        "stop_id": vehicle.get("naptanId"),
        "stop_name": vehicle.get("stationName"),

        # movement
        "direction": vehicle.get("direction"),
        "destination_name": vehicle.get("destinationName"),

        # KPIs
        "time_to_station_sec": vehicle.get("timeToStation"),
        "expected_arrival": vehicle.get("expectedArrival"),
    })

# Columns written to the CSV, in order. "destination_name" is collected above
# but is currently left out of the export.
FIELDNAMES = [
    "prediction_time",
    "vehicle_id",
    "line_id",
    "stop_id",
    "stop_name",
    "direction",
    "time_to_station_sec",
    "expected_arrival",
]

# Passing `columns=` keeps the frame well-formed when there are no arrivals;
# sorting a column-less frame by these keys would raise KeyError. With no
# arrivals the CSV contains only the header row.
rows_df = pd.DataFrame(rows, columns=FIELDNAMES).sort_values(
    ["line_id", "vehicle_id", "expected_arrival"], kind="stable"
)
rows_df.to_csv(output_file, index=False)

logger.info("Saved %d rows to %s", len(rows), output_file)
