In [0]:
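# One-time utility: this cell is only needed the very first time you switch the Gold
# write from overwrite to append logic. It recursively deletes the existing Gold output.
# Leave it commented out otherwise.
# dbutils.fs.rm("dbfs:/gold/gtfs_rt_enriched/", recurse=True)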

In [0]:
import datetime as dt

# Real-time ingestion date: today's date as an ISO "YYYY-MM-DD" string.
# Used to pick the Silver input rows and to label the Gold partition.
# NOTE(review): date.today() follows the driver's local timezone — confirm it matches
# the timezone used when Silver's ingestion_date was written.
INGESTION_DATE = str(dt.date.today())

# Static GTFS snapshot date: pinned on purpose, not derived from today.
STATIC_DATE = "2025-05-21"


In [0]:
from pyspark.sql import functions as F

# Silver real-time feed: root of the partitioned table (the date is filtered at read time)
RT_SILVER_PATH = "dbfs:/silver/gtfs_rt/"

# Static GTFS tables: stored under the snapshot folder named by STATIC_DATE
TRIPS_SILVER_PATH = "dbfs:/silver/gtfs_static/{}/trips".format(STATIC_DATE)
ROUTES_SILVER_PATH = "dbfs:/silver/gtfs_static/{}/routes".format(STATIC_DATE)

# Gold output: root of the partitioned enriched table
GOLD_PATH = "dbfs:/gold/gtfs_rt_enriched/"


In [0]:
# Load the Silver real-time table and keep only the rows usable downstream:
#   - ingestion_date equals INGESTION_DATE (today's slice)
#   - trip_id is present (it is the key for the trips join)
#   - latitude and longitude are both present (needed for location)
df_rt = spark.read.load(RT_SILVER_PATH, format="delta").where(
    (F.col("ingestion_date") == INGESTION_DATE)
    & F.col("trip_id").isNotNull()
    & F.col("latitude").isNotNull()
    & F.col("longitude").isNotNull()
)

# Static GTFS reference tables (trips and routes), read in full
df_trips = spark.read.load(TRIPS_SILVER_PATH, format="delta")
df_routes = spark.read.load(ROUTES_SILVER_PATH, format="delta")


In [0]:
# Drop any route_id carried by the real-time feed so the trips join is the only
# source of route_id (drop() is a no-op when the column is absent). This avoids
# ending up with two route_id columns after the join.
df_rt_clean = df_rt.drop("route_id")

# Step 1: trip_id -> route_id, direction_id (left join keeps every real-time row)
df_joined = df_rt_clean.join(
    df_trips.select(["trip_id", "route_id", "direction_id"]),
    "trip_id",
    "left",
)

# Step 2: route_id -> route_short_name, route_type, then stamp the enrichment time
df_enriched = df_joined.join(
    df_routes.select(["route_id", "route_short_name", "route_type"]),
    "route_id",
    "left",
).withColumn("joined_at", F.current_timestamp())


In [0]:
# Sanity check: number of enriched rows where a lookup came back empty,
# i.e. route_id is null or route_short_name is null.
df_enriched.where(
    F.isnull(F.col("route_id")) | F.isnull(F.col("route_short_name"))
).count()


In [0]:
# Preview: first 5 enriched rows, key columns only, with cell values shown untruncated
df_enriched.select(
    [
        "vehicle_id",
        "route_short_name",
        "direction_id",
        "latitude",
        "longitude",
        "event_ts",
        "joined_at",
    ]
).show(n=5, truncate=False)


In [0]:
# Write today's enriched rows to the partitioned Gold table, idempotently.
#
# df_rt is re-read as *all* Silver rows for INGESTION_DATE on every run, so a plain
# append would write the same day's rows again each time this notebook is re-run.
# Overwriting only the matching partition with Delta's replaceWhere leaves other
# days untouched, and a same-day re-run replaces its rows instead of duplicating them.
df_enriched = df_enriched.withColumn("ingestion_date", F.lit(INGESTION_DATE))

(
    df_enriched.write
    .format("delta")
    .mode("overwrite")
    # Restrict the overwrite to today's partition only
    .option("replaceWhere", f"ingestion_date = '{INGESTION_DATE}'")
    .partitionBy("ingestion_date")
    .save(GOLD_PATH)
)

print(f"✓ GTFS-RT Enriched data written to Gold partition ingestion_date={INGESTION_DATE}")


In [0]:
# Quick schema check: list the column names of the enriched DataFrame
print(df_enriched.schema.names)
