In [0]:
from pyspark.sql import functions as F

# Date string appended to every path below. Note that this is a fixed
# literal, not computed from the clock, despite the name.
TODAY = "2025-05-21"

# Inputs: enriched GTFS-RT data (gold) and weather data (silver) for that date
RT_PATH = "dbfs:/gold/gtfs_rt_enriched/" + TODAY
WEATHER_PATH = "dbfs:/silver/weather/" + TODAY

# Output: destination of the joined dataset for that date
JOINED_PATH = "dbfs:/gold/gtfs_rt_weather_joined/" + TODAY


In [0]:
# Read both inputs as Delta tables from the configured paths.
df_rt = spark.read.load(RT_PATH, format="delta")
df_weather = spark.read.load(WEATHER_PATH, format="delta")


In [0]:
# Quick visual check: print the first three rows of a few columns
# from each input frame.
df_rt.select(["vehicle_id", "event_ts", "route_short_name"]).show(n=3)
df_weather.select(["forecast_time", "temperature", "condition"]).show(n=3)


In [0]:
# Weather side: remove processed_at before joining (the original note calls
# it a duplicate column -- presumably df_rt carries one too; TODO confirm),
# then add forecast_hour = forecast_time truncated to the hour.
df_weather_rounded = df_weather.drop("processed_at").withColumn(
    "forecast_hour", F.date_trunc("hour", F.col("forecast_time"))
)

# Vehicle side: add event_hour = event_ts truncated to the hour.
df_rt_rounded = df_rt.withColumn(
    "event_hour", F.date_trunc("hour", F.col("event_ts"))
)

# Left join keeps every vehicle row, matching weather rows whose truncated
# hour is equal; joined_at stamps the result with the current timestamp.
# NOTE(review): if the weather data holds more than one row per hour, each
# vehicle row is repeated once per match -- verify against the weather schema.
df_joined = df_rt_rounded.join(
    df_weather_rounded,
    on=df_rt_rounded["event_hour"] == df_weather_rounded["forecast_hour"],
    how="left",
).withColumn("joined_at", F.current_timestamp())


In [0]:
# Persist the joined frame as Delta, replacing whatever is already stored
# at JOINED_PATH, then report completion.
df_joined.write.save(JOINED_PATH, format="delta", mode="overwrite")
print("✓ GTFS-RT + Weather join saved")
