In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Run date (YYYY-MM-DD) interpolated into every path below.
# NOTE(review): despite the name, this is a hard-coded literal, not the current
# date — it must be edited by hand to process a different day.
TODAY = "2025-05-25"

# Input paths: both locations are loaded as Delta tables in later cells.
RT_PATH = f"dbfs:/gold/gtfs_rt_enriched/{TODAY}"
WEATHER_PATH = f"dbfs:/silver/weather/{TODAY}"

# Output path: where the joined result is saved (in overwrite mode) further down.
JOINED_PATH = f"dbfs:/gold/gtfs_rt_weather_joined/{TODAY}"


In [0]:
# Weather forecasts for the run date, read from the Delta table at WEATHER_PATH.
# - "processed_at" is dropped so it does not end up duplicated after the join
#   (presumably df_rt carries a column with the same name — confirm).
# - Rows with a NULL forecast_time are removed here; the join below compares
#   on forecast_time, so they could never match anyway.
df_weather = (
    spark.read
    .load(WEATHER_PATH, format="delta")
    .drop("processed_at")
    .where("forecast_time IS NOT NULL")
)


In [0]:
# Real-time vehicle events for the run date, read from the Delta table at RT_PATH.
df_rt = spark.read.load(RT_PATH, format="delta")

In [0]:
# Step 1 — pair every vehicle event with every forecast whose forecast_time is
# at or after the event timestamp. This is an inner join on a range condition
# only (no equality key), so:
#   * an event with no forecast at or after it is dropped from the result;
#   * the intermediate frame holds one row per (event, later forecast) pair.
df_cross = df_rt.alias("rt").join(
    other=df_weather.alias("w"),
    on=F.col("rt.event_ts") <= F.col("w.forecast_time"),
)

# Step 2 — within each (vehicle_id, event_ts) group, order the candidate
# forecasts from soonest to latest.
windowSpec = (
    Window
    .partitionBy("rt.vehicle_id", "rt.event_ts")
    .orderBy(F.col("w.forecast_time").asc())
)

# Step 3 — keep only the first (soonest) forecast per group, remove the helper
# column, and stamp the rows with the time the join was evaluated.
# row_number() yields exactly one row per group; if several forecasts share the
# same minimal forecast_time, this ordering does not determine which one wins.
df_ranked = (
    df_cross
    .withColumn("rank", F.row_number().over(windowSpec))
    .where("rank = 1")
    .drop("rank")
    .withColumn("joined_at", F.current_timestamp())
)


In [0]:
# Persist the joined result as a Delta table at JOINED_PATH, replacing whatever
# was previously stored there.
df_ranked.write.format("delta").mode("overwrite").save(JOINED_PATH)

# Status message. The join keeps the nearest forecast at or *after* each event
# (w.forecast_time >= rt.event_ts, ordered ascending), so the text says "later";
# the leading check mark was previously stored as mis-decoded bytes.
print("✓ GTFS-RT + Weather (nearest later forecast) join saved to Gold")


In [0]:
# Sanity check: list the column names of the joined frame.
print(df_ranked.columns)


In [0]:
# Spot-check the join: show 10 rows of the event timestamp next to the forecast
# it was matched with, without truncating cell values.
(
    df_ranked
    .select("event_ts", "forecast_time", "condition", "temperature", "route_short_name")
    .show(n=10, truncate=False)
)


In [0]:
# Show the 10 earliest event timestamps (untruncated) to compare against the
# forecast time range.
df_rt.select("event_ts").sort("event_ts").show(n=10, truncate=False)


In [0]:
# Show the 10 earliest forecast times (untruncated) to compare against the
# event timestamp range.
df_weather.select("forecast_time").sort("forecast_time").show(n=10, truncate=False)
