In [0]:
# dbutils.fs.rm("dbfs:/gold/gtfs_rt_weather_joined/", recurse=True)         #One-time cleanup

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import datetime as dt

# --- Run configuration -------------------------------------------------------
# ISO-8601 (YYYY-MM-DD) string for the driver's current local date; the cells
# below compare it against the `ingestion_date` column of each input table.
INGESTION_DATE = dt.date.today().isoformat()

# Inputs: Delta locations filtered by `ingestion_date` further down.
WEATHER_PATH = "dbfs:/silver/weather/"
RT_PATH = "dbfs:/gold/gtfs_rt_enriched/"

# Output: Delta location of the joined GTFS-RT + weather result.
JOINED_PATH = "dbfs:/gold/gtfs_rt_weather_joined/"


In [0]:
# Weather rows for the current run: keep only rows whose ingestion_date equals
# INGESTION_DATE and that carry a forecast_time (the join key used later), then
# discard the processed_at column.
df_weather = (
    spark.read.format("delta")
    .load(WEATHER_PATH)
    .where(F.col("ingestion_date") == INGESTION_DATE)
    .where(F.col("forecast_time").isNotNull())
    .drop("processed_at")
)


In [0]:
# Enriched GTFS-RT rows for the current run, restricted to rows whose
# ingestion_date equals INGESTION_DATE.
df_rt = (
    spark.read.format("delta")
    .load(RT_PATH)
    .where(F.col("ingestion_date") == F.lit(INGESTION_DATE))
)


In [0]:
# Step 1: non-equi inner join. Every real-time row is paired with each weather
# row whose forecast_time is at or after the row's event_ts. Real-time rows
# with no such forecast produce no output row (inner join).
df_cross = df_rt.alias("rt").join(
    df_weather.alias("w"),
    on=F.col("rt.event_ts") <= F.col("w.forecast_time"),
    how="inner",
)

# Step 2: within each (vehicle_id, event_ts) group, order candidate forecasts
# from earliest to latest so that row 1 is the first forecast at/after the event.
windowSpec = (
    Window
    .partitionBy(F.col("rt.vehicle_id"), F.col("rt.event_ts"))
    .orderBy(F.col("w.forecast_time"))
)

# Step 3: keep only that first forecast per group, stamp the join time, and
# replace ingestion_date. Both joined inputs were filtered on an ingestion_date
# column, so the name is dropped before a single literal column is re-added.
df_ranked = (
    df_cross
    .withColumn("rank", F.row_number().over(windowSpec))
    .where(F.col("rank") == 1)
    .drop("rank", "ingestion_date")
    .withColumn("joined_at", F.current_timestamp())
    .withColumn("ingestion_date", F.lit(INGESTION_DATE))
)



In [0]:
# Persist the joined rows to the Gold Delta location, partitioned by
# ingestion_date.
#
# A plain mode("overwrite") on a Delta table replaces the *entire* table, so
# each run would discard every ingestion_date partition written by earlier
# runs. `replaceWhere` restricts the overwrite to rows matching the current
# run's ingestion_date: re-running for the same day still replaces that day's
# data, while other days are left untouched.
(
    df_ranked.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", f"ingestion_date = '{INGESTION_DATE}'")
    .partitionBy("ingestion_date")
    .save(JOINED_PATH)
)

print("✓ GTFS-RT + Weather (nearest forecast) join saved to Gold")


In [0]:
# Sanity check: list the column names of the joined frame.
print(df_ranked.schema.names)


In [0]:
# Preview ten joined rows (untruncated) to eyeball event/forecast pairing.
(
    df_ranked
    .select("event_ts", "forecast_time", "condition", "temperature", "route_short_name")
    .show(n=10, truncate=False)
)


In [0]:
# Ten earliest event timestamps in the real-time input (untruncated).
(
    df_rt
    .select("event_ts")
    .orderBy(F.col("event_ts").asc())
    .show(n=10, truncate=False)
)


In [0]:
# Ten earliest forecast timestamps in the weather input (untruncated).
(
    df_weather
    .select("forecast_time")
    .orderBy(F.col("forecast_time").asc())
    .show(n=10, truncate=False)
)


In [0]:
# Compare the two time ranges: the five latest real-time event timestamps ...
(
    df_rt
    .select("event_ts")
    .orderBy(F.col("event_ts").desc())
    .show(n=5, truncate=False)
)
# ... against the five earliest forecast timestamps.
(
    df_weather
    .select("forecast_time")
    .orderBy(F.col("forecast_time").asc())
    .show(n=5, truncate=False)
)
