In [0]:
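# One-time utility: only needed the very first time you switch from overwrite to
# append logic. Uncomment the line below to delete the existing Silver table
# directory, run it once, then comment it out again.
# dbutils.fs.rm("dbfs:/silver/gtfs_rt/", recurse=True)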

In [0]:
import datetime as dt
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Delta table locations. The paths carry no date component, so every run
# addresses the same Bronze and Silver tables.
BRONZE_RT_PATH = "dbfs:/bronze/gtfs_rt/"
SILVER_RT_PATH = "dbfs:/silver/gtfs_rt/"

# Run date as an ISO "YYYY-MM-DD" string, taken from the clock at execution time.
# NOTE(review): date.today() uses the driver's local time zone — confirm it
# matches the zone used to stamp ingestion_date upstream.
INGESTION_DATE = str(dt.date.today())


In [0]:
# Load the Bronze GTFS-RT Delta table and keep only the usable rows for the run date.
df_bronze = (
    spark.read.format("delta")
    .load(BRONZE_RT_PATH)
    # Restrict to the rows stamped with this run's ingestion date.
    .where(F.col("ingestion_date") == INGESTION_DATE)
    # Discard rows without a positive timestamp (null timestamps fail the
    # comparison as well) or without a complete latitude/longitude pair.
    .where(F.col("timestamp") > 0)
    .where(F.col("latitude").isNotNull())
    .where(F.col("longitude").isNotNull())
)

# Preview a few of the surviving rows.
df_bronze.show(3)


In [0]:
# Build the Silver frame: derive event time columns, bundle the coordinates,
# stamp the processing time, and de-duplicate.
df_silver = (
    df_bronze
    # With no format argument, to_timestamp follows the cast-to-timestamp rules.
    # NOTE(review): the upstream `timestamp > 0` filter suggests epoch seconds — confirm.
    .withColumn("event_ts", F.to_timestamp("timestamp"))
    .withColumn("event_date", F.to_date("event_ts"))
    # Keep latitude/longitude together as a single struct column.
    .withColumn("location", F.struct("latitude", "longitude"))
    .withColumn("processed_at", F.current_timestamp())
    # One row per (vehicle_id, timestamp); this code does not control which
    # of several duplicate rows is retained.
    .drop_duplicates(subset=["vehicle_id", "timestamp"])
)


In [0]:
# Persist the run date's Silver rows idempotently.
#
# A plain append duplicates the whole day's rows every time this notebook is
# re-run for the same ingestion_date. A Delta selective overwrite
# (`replaceWhere`) atomically replaces only the rows matching the predicate,
# so other ingestion dates are left untouched and re-runs converge to the
# same table contents.
#
# df_silver only contains rows with ingestion_date == INGESTION_DATE (filtered
# when Bronze is loaded), which satisfies Delta's requirement that the written
# data match the replaceWhere predicate.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", f"ingestion_date = '{INGESTION_DATE}'")
    .partitionBy("ingestion_date")
    .save(SILVER_RT_PATH)
)

print("✓ GTFS-RT Silver transform complete")


In [0]:
# Spot check: read the Silver table back and print a few of the run date's
# rows without truncating column values.
(
    spark.read.format("delta")
    .load(SILVER_RT_PATH)
    .where(F.col("ingestion_date") == INGESTION_DATE)
    .show(5, truncate=False)
)


In [0]:
# Data-quality check: list the distinct event_date values present in Silver,
# in ascending order. Reads via SILVER_RT_PATH so this check always targets
# the same table the pipeline writes to, instead of a duplicated path literal.
(
    spark.read.format("delta")
    .load(SILVER_RT_PATH)
    .select("event_date")
    .distinct()
    .orderBy("event_date")
    .show()
)


In [0]:
# Data-quality check: show Silver rows whose event_date is 1970-01-01 (the
# Unix epoch date), alongside the raw and derived time columns for inspection.
# Reads via SILVER_RT_PATH so this check always targets the same table the
# pipeline writes to, instead of a duplicated path literal.
(
    spark.read.format("delta")
    .load(SILVER_RT_PATH)
    .filter("event_date = '1970-01-01'")
    .select("vehicle_id", "timestamp", "event_ts", "event_date")
    .show(10, truncate=False)
)


In [0]:
# Data-quality check: count Silver rows whose raw timestamp is exactly 0.
# The Bronze load filters on `timestamp > 0`, so rows found here were not
# produced by the current filter logic.
# Reads via SILVER_RT_PATH so this check always targets the same table the
# pipeline writes to, instead of a duplicated path literal.
(
    spark.read.format("delta")
    .load(SILVER_RT_PATH)
    .filter("timestamp = 0")
    .count()
)
