## 07 - Join Transit Data with Weather (Gold Layer)

This notebook performs a time-based join between enriched real-time transit data and hourly weather forecasts to build a comprehensive dataset for analysis.

### Purpose
To associate each vehicle update with the nearest **future** weather forecast, enabling exploration of how conditions affect transit behavior.

### Workflow Summary
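- Loads today's partitions (filtered on `ingestion_date`) of the enriched GTFS-RT (Gold) and hourly weather forecast (Silver) data
- Uses a range join plus a ranking window to match each vehicle update to the earliest forecast at or after its event time; updates with no such forecast are dropped
- Appends the joined output, skipping rows that were already written, to: `dbfs:/gold/gtfs_rt_weather_joined/`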


In [0]:
# One-time cleanup (kept commented out on purpose): uncommenting the line below
# recursively deletes everything under the joined Gold output path.
# dbutils.fs.rm("dbfs:/gold/gtfs_rt_weather_joined/", recurse=True)

In [0]:
import datetime as dt

from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window

# Locations read and written by this notebook (all loaded/saved as Delta).
RT_PATH = "dbfs:/gold/gtfs_rt_enriched/"              # input: enriched GTFS real-time
WEATHER_PATH = "dbfs:/silver/weather/"                # input: cleaned hourly weather
JOINED_PATH = "dbfs:/gold/gtfs_rt_weather_joined/"    # output: joined dataset

# Ingestion date for this run: today's date rendered as an ISO "YYYY-MM-DD" string.
INGESTION_DATE = str(dt.date.today())


In [0]:
# Weather forecasts for this run: rows whose ingestion_date equals INGESTION_DATE,
# without the processed_at column, and only where forecast_time is populated.
df_weather = (
    spark.read.format("delta")
    .load(WEATHER_PATH)
    .where(F.col("ingestion_date") == INGESTION_DATE)
    .drop("processed_at")
    .where(F.col("forecast_time").isNotNull())
)


In [0]:
# Preview weather data coverage: row count plus the earliest/latest forecast_time.
# DataFrame.summary() only reports numeric and string columns, so if forecast_time
# is timestamp-typed it would be left out of the table entirely; explicit min/max
# aggregates give the range for either type.
print("Weather records:", df_weather.count())
df_weather.agg(
    F.min("forecast_time").alias("min_forecast_time"),
    F.max("forecast_time").alias("max_forecast_time"),
).show(truncate=False)


In [0]:
# Enriched GTFS real-time rows for this run: ingestion_date equals INGESTION_DATE
# and event_ts is populated (rows without an event timestamp cannot be time-joined).
df_rt = (
    spark.read.format("delta")
    .load(RT_PATH)
    .where(F.col("ingestion_date") == INGESTION_DATE)
    .where("event_ts IS NOT NULL")
)


In [0]:
# Preview transit data coverage: row count plus the earliest/latest event_ts.
# DataFrame.summary() only reports numeric and string columns, so if event_ts is
# timestamp-typed it would be left out of the table entirely; explicit min/max
# aggregates give the range for either type.
print("GTFS-RT records:", df_rt.count())
df_rt.agg(
    F.min("event_ts").alias("min_event_ts"),
    F.max("event_ts").alias("max_event_ts"),
).show(truncate=False)


In [0]:
# Step 1 - candidate pairs: every transit event is paired with every forecast whose
# forecast_time is at or after the event. No join type is given, so this is an inner
# join: events with no such forecast produce no rows.
df_cross = df_rt.alias("rt").join(
    df_weather.alias("w"),
    on=F.col("rt.event_ts") <= F.col("w.forecast_time"),
)

# Step 2 - ranking window: one group per (vehicle_id, event_ts), candidates ordered
# by forecast_time ascending, so row number 1 is the earliest qualifying forecast.
windowSpec = (
    Window
    .partitionBy("rt.vehicle_id", "rt.event_ts")
    .orderBy(F.asc("w.forecast_time"))
)

# Step 3 - keep the first-ranked candidate per group and stamp the result.
df_ranked = (
    df_cross
    .withColumn("rank", F.row_number().over(windowSpec))
    .where(F.col("rank") == 1)                                  # earliest forecast only
    .drop("rank")
    .withColumn("joined_at", F.current_timestamp())             # when the join was evaluated
    # Both inputs were filtered on ingestion_date, so the joined frame inherits that
    # column name from them; drop it by name and add back a single literal column.
    .drop("ingestion_date")
    .withColumn("ingestion_date", F.lit(INGESTION_DATE))
)



In [0]:
# Check final temporal ranges for quality assurance.
# DataFrame.summary() only reports numeric and string columns, so timestamp-typed
# event_ts / forecast_time would be left out of the table; explicit min/max
# aggregates give the ranges for either type.
df_ranked.agg(
    F.min("event_ts").alias("min_event_ts"),
    F.max("event_ts").alias("max_event_ts"),
    F.min("forecast_time").alias("min_forecast_time"),
    F.max("forecast_time").alias("max_forecast_time"),
).show(truncate=False)


In [0]:
# Append only the rows that are not already in the Gold table, keyed on
# (vehicle_id, event_ts, forecast_time).
DEDUP_KEYS = ["vehicle_id", "event_ts", "forecast_time"]

# Only the load is guarded: failing to resolve the table means there is nothing to
# de-duplicate against (e.g. the first run). Any other error is left to propagate so
# the cell never appends blindly on top of a table it could not read or join against.
# NOTE(review): assumes a missing table surfaces here as AnalysisException - confirm
# on the runtime this notebook is scheduled on.
try:
    existing_table = spark.read.format("delta").load(JOINED_PATH)
except AnalysisException as e:
    existing_table = None
    print(f"✓ No existing joined data found. Proceeding without anti-join. Error: {e}")

# df_ranked itself is deliberately NOT rebound. Overwriting it with the anti-joined
# frame makes this cell non-idempotent on re-run, and because Spark evaluates lazily,
# later previews of df_ranked would be anti-joined against the rows written below
# (NOTE(review): likely leaving them empty if the Delta read resolves the latest
# snapshot at execution time - confirm).
if existing_table is None:
    df_new_rows = df_ranked
else:
    existing_joined = existing_table.select(*DEDUP_KEYS)
    # left_anti keeps the rows of df_ranked that have no key match in the existing table.
    df_new_rows = df_ranked.join(existing_joined, on=DEDUP_KEYS, how="left_anti")

# Save joined dataset to Gold layer (append, partitioned by ingestion_date)
df_new_rows.write \
    .format("delta") \
    .mode("append") \
    .partitionBy("ingestion_date") \
    .save(JOINED_PATH)

print("✓ GTFS-RT + Weather (nearest forecast) join saved to Gold (duplicates avoided)")

In [0]:
# List every column name carried by the joined frame.
joined_columns = df_ranked.columns
print(joined_columns)


In [0]:
# Show ten untruncated sample rows combining transit and weather fields.
preview_columns = ["event_ts", "forecast_time", "condition", "temperature", "route_short_name"]
df_ranked.select(*preview_columns).show(n=10, truncate=False)


In [0]:
# Ten earliest GTFS event timestamps, untruncated.
(
    df_rt
    .select("event_ts")
    .sort(F.col("event_ts").asc())
    .show(n=10, truncate=False)
)


In [0]:
# Ten earliest weather forecast timestamps, untruncated.
(
    df_weather
    .select("forecast_time")
    .sort(F.col("forecast_time").asc())
    .show(n=10, truncate=False)
)


In [0]:
# Boundary check for the join condition (forecast_time >= event_ts): print the five
# latest GTFS event timestamps next to the five earliest forecast timestamps so the
# overlap between the two ranges can be eyeballed.
latest_events = df_rt.select("event_ts").sort(F.col("event_ts").desc())
earliest_forecasts = df_weather.select("forecast_time").sort(F.col("forecast_time").asc())

latest_events.show(n=5, truncate=False)
earliest_forecasts.show(n=5, truncate=False)
