## 03 - Transform GTFS Real-Time Data (Silver Layer)

Cleans and enriches real-time GTFS vehicle position data to prepare it for analysis.


### Purpose
Convert raw real-time transit signals into structured, deduplicated records with timestamps and locations for consistent downstream use.


### Steps
- Filters out invalid or null coordinates
- Converts `timestamp` to `event_ts` and derives `event_date`
- Adds `location` and processing metadata
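- Removes duplicate `vehicle_id`/`timestamp` pairs, both within the current batch and against rows already stored in Silver
- Appends the transformed data to `dbfs:/silver/gtfs_rt/`, partitioned by `ingestion_date`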


In [0]:
# One-time utility: run this only the very first time you switch from overwrite to append logic.
# It recursively deletes everything under the Silver path, so keep it commented out otherwise.
# dbutils.fs.rm("dbfs:/silver/gtfs_rt/", recurse=True)

In [0]:
import datetime as dt

from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException

# Locations of the Bronze (input) and Silver (output) Delta tables
BRONZE_RT_PATH = "dbfs:/bronze/gtfs_rt/"
SILVER_RT_PATH = "dbfs:/silver/gtfs_rt/"

# Current date as a YYYY-MM-DD string; compared against the `ingestion_date` column below
# (str() of a date is its ISO-8601 form)
INGESTION_DATE = str(dt.date.today())


In [0]:
# Read today's Bronze batch and keep only rows with a usable timestamp and GPS position.
# A row is dropped when any of the following holds:
#   - `ingestion_date` is not today's date
#   - `timestamp` is not strictly positive
#   - `latitude` or `longitude` is null
#   - the position is exactly (0, 0)
#   - `latitude` is outside [-90, 90] or `longitude` is outside [-180, 180]
df_bronze = (
    spark.read.format("delta").load(BRONZE_RT_PATH)
    .filter(F.col("ingestion_date") == INGESTION_DATE)
    .filter(F.col("timestamp") > 0)
    .filter(F.col("latitude").isNotNull() & F.col("longitude").isNotNull())
    .filter(~((F.col("latitude") == 0.0) & (F.col("longitude") == 0.0)))
    # Range checks: null and (0, 0) filtering alone lets physically impossible
    # coordinates through. Spark orders NaN above every other numeric value, so
    # NaN coordinates fail the upper bound and are removed here as well.
    .filter(F.col("latitude").between(-90.0, 90.0))
    .filter(F.col("longitude").between(-180.0, 180.0))
)

# Preview a few rows of the filtered batch
df_bronze.show(3)


In [0]:
# Build the Silver frame from the filtered Bronze rows.
# Columns added, in this order:
#   event_ts     - `timestamp` converted via to_timestamp
#   event_date   - date part of `event_ts`
#   location     - struct holding `latitude` and `longitude`
#   processed_at - current timestamp when the transform runs
# Afterwards only one row per (vehicle_id, timestamp) pair is kept.
_with_event_time = (
    df_bronze
    .withColumn("event_ts", F.to_timestamp("timestamp"))
    .withColumn("event_date", F.to_date("event_ts"))
)

_with_metadata = (
    _with_event_time
    .withColumn("location", F.struct("latitude", "longitude"))
    .withColumn("processed_at", F.current_timestamp())
)

df_silver = _with_metadata.dropDuplicates(["vehicle_id", "timestamp"])


In [0]:
# Append only the rows that are not already in Silver, keyed on (vehicle_id, timestamp).
#
# Only AnalysisException is caught when reading the existing table: that is how Spark
# reports a path that does not exist or is not a Delta table yet (e.g. the first run),
# and in that case the batch is written without the anti-join. Any other failure is
# allowed to propagate, because appending without the anti-join after an unrelated
# read error would write duplicate rows into Silver.
try:
    existing_silver = (
        spark.read.format("delta").load(SILVER_RT_PATH)
        .select("vehicle_id", "timestamp")
    )
except AnalysisException as e:
    print(f"✓ No existing Silver data found or table is empty. Proceeding without anti-join. Error: {e}")
else:
    # left_anti keeps the new rows whose key has no match in the existing table
    df_silver = df_silver.alias("new").join(
        existing_silver.alias("existing"),
        on=["vehicle_id", "timestamp"],
        how="left_anti"
    )

# Append the remaining rows to the Silver Delta table, partitioned by ingestion date
(
    df_silver.write
    .format("delta")
    .mode("append")
    .partitionBy("ingestion_date")
    .save(SILVER_RT_PATH)
)

print("✓ GTFS-RT Silver transform complete (duplicates avoided)")


In [0]:
# Sanity check: print up to five untruncated Silver rows carrying today's ingestion date
(
    spark.read.format("delta")
    .load(SILVER_RT_PATH)
    .where(F.col("ingestion_date") == INGESTION_DATE)
    .show(5, truncate=False)
)


In [0]:
# List the distinct event dates present in Silver, in ascending order.
# Note: the table is partitioned by `ingestion_date`, not `event_date`, so this shows
# which event dates the data covers rather than the partition layout.
(
    spark.read.format("delta")
    .load(SILVER_RT_PATH)  # shared constant instead of a second hard-coded copy of the path
    .select("event_date")
    .distinct()
    .orderBy("event_date")
    .show()
)


In [0]:
# Investigate whether any default (Unix epoch) timestamps slipped through:
# show up to ten Silver rows whose event date is 1970-01-01.
(
    spark.read.format("delta")
    .load(SILVER_RT_PATH)  # shared constant instead of a second hard-coded copy of the path
    .filter("event_date = '1970-01-01'")
    .select("vehicle_id", "timestamp", "event_ts", "event_date")
    .show(10, truncate=False)
)


In [0]:
# Count Silver rows that still have timestamp = 0. The Bronze read keeps only rows with
# timestamp > 0, so the expected result is 0.
(
    spark.read.format("delta")
    .load(SILVER_RT_PATH)  # shared constant instead of a second hard-coded copy of the path
    .filter("timestamp = 0")
    .count()
)
