In [0]:
from pyspark.sql import functions as F
import requests, zipfile, io, datetime as dt, os, shutil, tempfile   # dt alias

# ---------- config ----------
# Where the feed comes from and where it lands. The destination folder embeds
# the run date, so every calendar day writes to its own snapshot directory.
GTFS_URL    = "https://metro.kingcounty.gov/gtfs/google_transit.zip"
BRONZE_BASE = "dbfs:/bronze"
TODAY       = dt.date.today().isoformat()   # ISO date string, e.g. 2024-01-31
BRONZE_PATH = "/".join([BRONZE_BASE, "gtfs_static", TODAY])

# Local scratch directory for the extracted feed files. mkdtemp() never cleans
# up after itself, so the code that uses this directory must delete it.
tmp_dir = tempfile.mkdtemp(prefix="gtfs_")
print(f"Temp dir: {tmp_dir}")

# Download the GTFS static feed and land every *.txt member of the archive as
# a Delta table at BRONZE_PATH/<table_name>. The scratch directory is removed
# whether or not the ingest succeeds.
try:
    print("Downloading GTFS zip …")
    response = requests.get(GTFS_URL, timeout=30)
    # Fail fast on 4xx/5xx: otherwise an HTML error page would be handed to
    # ZipFile and surface as a misleading BadZipFile error.
    response.raise_for_status()

    # Context manager guarantees the archive handle is closed.
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        for name in z.namelist():
            if not name.endswith(".txt"):
                continue

            # Use only the member's base name. This keeps a crafted entry such
            # as "../../x.txt" from escaping tmp_dir / BRONZE_PATH, and makes
            # feeds zipped inside a folder land at BRONZE_PATH/<table_name>.
            base_name = os.path.basename(name)
            if base_name.startswith("."):
                # Hidden entries (e.g. "__MACOSX/._stops.txt") are not tables.
                continue
            # splitext strips just the trailing extension; str.replace would
            # also remove a ".txt" occurring in the middle of the name.
            table_name = os.path.splitext(base_name)[0]

            print(f"Processing {name} …")

            local_file = os.path.join(tmp_dir, base_name)
            with z.open(name) as src, open(local_file, "wb") as dst:
                # Stream in chunks instead of loading the whole member into
                # memory at once.
                shutil.copyfileobj(src, dst)

            # NOTE(review): a file:// path is resolved on the local filesystem
            # of whichever node reads it; this presumably relies on the read
            # happening where tmp_dir lives — confirm for multi-node clusters.
            df = (spark.read
                    .option("header", True)
                    .csv(f"file://{local_file}"))

            (df.write
               .format("delta")
               .mode("overwrite")
               .save(f"{BRONZE_PATH}/{table_name}"))

    print("✓ GTFS static ingest complete")

finally:
    # Best-effort cleanup; errors while deleting are deliberately ignored.
    shutil.rmtree(tmp_dir, ignore_errors=True)


In [0]:
# Sanity check: show the first 5 rows of the 'stops' table written for today.
(
    spark.read
         .format("delta")
         .load(f"{BRONZE_PATH}/stops")
         .show(5)
)

In [0]:
# Show the first 5 rows of the 'routes', 'calendar' and 'trips' tables,
# in that order.
for preview_table in ("routes", "calendar", "trips"):
    (
        spark.read
             .format("delta")
             .load(f"{BRONZE_PATH}/{preview_table}")
             .show(5)
    )


In [0]:
# Load today's 'routes' table into df_routes, then display the first 10 rows
# of its id, short-name and type columns.
df_routes = (
    spark.read
         .format("delta")
         .load(f"{BRONZE_PATH}/routes")
)
(
    df_routes
    .select("route_id", "route_short_name", "route_type")
    .show(10)
)