In [0]:
# Snapshot dates in ISO "YYYY-MM-DD" form.
STATIC_DATE = "2025-05-21"  # static GTFS inputs (df_trips, df_routes)
TODAY = "2025-05-25"        # real-time input (df_rt)


In [0]:
import datetime as dt
from pyspark.sql import functions as F


# --- Silver layer (inputs) ---------------------------------------------------
# Static GTFS tables are both read from the STATIC_DATE folder.
TRIPS_SILVER_PATH = f"dbfs:/silver/gtfs_static/{STATIC_DATE}/trips"
ROUTES_SILVER_PATH = f"dbfs:/silver/gtfs_static/{STATIC_DATE}/routes"
# The real-time feed is read from the TODAY folder.
RT_SILVER_PATH = f"dbfs:/silver/gtfs_rt/{TODAY}"

# --- Gold layer (output) -----------------------------------------------------
# The enriched real-time table is written under the same TODAY date.
GOLD_PATH = f"dbfs:/gold/gtfs_rt_enriched/{TODAY}"


In [0]:
# Read the three Silver Delta tables: real-time feed, static trips, static routes.
# The generator is unpacked in order, so each name is bound to the frame loaded
# from the path at the same position.
df_rt, df_trips, df_routes = (
    spark.read.format("delta").load(silver_path)
    for silver_path in (RT_SILVER_PATH, TRIPS_SILVER_PATH, ROUTES_SILVER_PATH)
)


In [0]:
# Enrich the real-time records with static GTFS attributes:
#   1. trips  (on trip_id)  -> route_id, direction_id
#   2. routes (on route_id) -> route_short_name, route_type
#
# The route_id coming from the static trips table takes precedence. Because the
# static snapshot (STATIC_DATE) may differ from the real-time date (TODAY), a
# trip_id can be missing from df_trips; in that case the route_id carried by the
# real-time feed itself (when df_rt has one) is used as a fallback instead of
# being thrown away, so the routes join can still match.

# Real-time frame without its own route_id (a no-op if the column is absent).
df_rt_clean = df_rt.drop("route_id")

# Keep the feed's route_id under a temporary name so it does not collide with
# the route_id brought in by the trips join. withColumnRenamed does nothing
# when df_rt has no route_id column.
_RT_ROUTE_FALLBACK = "_rt_route_id"
df_rt_for_join = df_rt.withColumnRenamed("route_id", _RT_ROUTE_FALLBACK)

df_trip_joined = df_rt_for_join.join(
    df_trips.select("trip_id", "route_id", "direction_id"),
    on="trip_id",
    how="left",
)

if _RT_ROUTE_FALLBACK in df_trip_joined.columns:
    # Static route_id wins; the feed's value only fills rows the trips join
    # left NULL. The temporary column is removed so the schema is unchanged.
    df_joined = (
        df_trip_joined
        .withColumn(
            "route_id",
            F.coalesce(F.col("route_id"), F.col(_RT_ROUTE_FALLBACK)),
        )
        .drop(_RT_ROUTE_FALLBACK)
    )
else:
    # df_rt had no route_id of its own: nothing to fall back to.
    df_joined = df_trip_joined

df_enriched = (
    df_joined
    .join(
        df_routes.select("route_id", "route_short_name", "route_type"),
        on="route_id",
        how="left",
    )
    # Timestamp of when this enrichment was computed.
    .withColumn("joined_at", F.current_timestamp())
)


In [0]:
# Quick visual check: first 5 enriched rows, key columns only, untruncated.
preview_columns = [
    "vehicle_id",
    "route_short_name",
    "direction_id",
    "latitude",
    "longitude",
    "event_ts",
    "joined_at",
]
df_enriched.select(*preview_columns).show(5, truncate=False)


In [0]:
# Persist the enriched frame as a Delta table at GOLD_PATH, replacing whatever
# was previously stored there (mode "overwrite").
gold_writer = df_enriched.write.format("delta").mode("overwrite")
gold_writer.save(GOLD_PATH)

print("✓ GTFS-RT Enriched data saved to Gold")

In [0]:
# Print the column names of the enriched frame.
enriched_columns = df_enriched.columns
print(enriched_columns)
