## 02 - Ingest GTFS Real-Time Data (Bronze Layer)

Ingests real-time vehicle position data from King County Metro into the Bronze Delta Lake layer.


### Purpose
Capture daily transit activity (vehicle location and metadata) for use in downstream analysis and weather correlation.


### Steps
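- Fetches the GTFS-RT vehicle positions feed (Protobuf format)  
- Extracts fields: `vehicle_id`, `route_id`, `trip_id`, `latitude`, `longitude`, `timestamp`  
- Adds `ingestion_ts` (capture time) and `ingestion_date` (used for partitioning)  
- Drops rows whose (`vehicle_id`, `timestamp`) pair already exists in the Bronze table  
- Appends the remaining rows to the Delta table: `dbfs:/bronze/gtfs_rt/`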


In [0]:
# One-time utility: only needed the very first time you switch from overwrite to append logic.
# Uncomment and run once to clear the existing Bronze table, then comment it out again.
# dbutils.fs.rm("dbfs:/bronze/gtfs_rt/", recurse=True)

In [0]:
%pip install gtfs-realtime-bindings --quiet
# The package above supplies the `google.transit.gtfs_realtime_pb2` module that
# this notebook imports to parse the feed.
# Restart the Python process after installing (presumably so the new package is
# importable in this session — confirm against the Databricks docs).
dbutils.library.restartPython()
# ⚠️ Only needed once per cluster restart

In [0]:
import datetime as dt
from google.transit import gtfs_realtime_pb2
import requests

# Partition value for this run, as an ISO-8601 date string (YYYY-MM-DD).
# Derived from the current UTC time rather than the driver's local clock so it
# always agrees with the UTC-based `ingestion_ts` column written with each row.
INGESTION_DATE = dt.datetime.now(dt.timezone.utc).date().isoformat()

# Bronze layer path (partitioned by ingestion_date)
BRONZE_RT_PATH = "dbfs:/bronze/gtfs_rt/"


In [0]:
# King County Metro GTFS-RT vehicle positions feed (Protobuf payload).
URL = "https://s3.amazonaws.com/kcm-alerts-realtime-prod/vehiclepositions.pb"

# Bound the request so a stalled connection cannot hang the notebook run, and
# fail fast on HTTP errors instead of handing an error page to the Protobuf
# parser below.
response = requests.get(URL, timeout=30)
response.raise_for_status()

# Parse GTFS-RT feed
feed = gtfs_realtime_pb2.FeedMessage()
feed.ParseFromString(response.content)

print(f"# of entities: {len(feed.entity)}")
# Print the first entity as a quick sanity check of the payload structure.
if feed.entity:
    print(feed.entity[0])
else:
    print('No vehicle updates available.')


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, TimestampType
from pyspark.sql.functions import lit
from datetime import datetime

# Define schema for structured data.
# `timestamp` is the value reported by the feed for each vehicle (stored as a
# long); `ingestion_ts` is when this notebook captured the batch.
schema = StructType([
    StructField("vehicle_id", StringType(), True),
    StructField("trip_id", StringType(), True),
    StructField("route_id", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("timestamp", LongType(), True),
    StructField("ingestion_ts", TimestampType(), True),
])

# Capture the ingestion time once so every row of this batch carries the same
# `ingestion_ts` (calling utcnow() per row gave each row a slightly different
# value). Kept as a naive UTC datetime, matching what was written previously.
batch_ingestion_ts = datetime.utcnow()


def _entity_to_row(entity, ingestion_ts):
    """Flatten one feed entity into a tuple ordered like `schema`.

    Args:
        entity: A feed entity that carries a `vehicle` payload.
        ingestion_ts: Capture time to stamp on the row.

    Returns:
        Tuple of (vehicle_id, trip_id, route_id, latitude, longitude,
        timestamp, ingestion_ts).
    """
    vehicle = entity.vehicle
    trip = vehicle.trip
    position = vehicle.position

    # An absent `position` sub-message would otherwise read back as the
    # protobuf defaults (0.0, 0.0), which look like real coordinates. Store
    # nulls instead so downstream spatial analysis is not polluted.
    has_position = vehicle.HasField("position")

    return (
        # NOTE(review): the feed entity id is used as `vehicle_id`; confirm it
        # matches the vehicle descriptor id for this agency's feed.
        entity.id,
        trip.trip_id,
        trip.route_id,
        position.latitude if has_position else None,
        position.longitude if has_position else None,
        vehicle.timestamp,
        ingestion_ts,
    )


# Parse rows from feed. Entities without a `vehicle` payload carry no position
# data; they would have a default timestamp of 0 and be removed by the filter
# below anyway, so they are skipped up front.
rows = [
    _entity_to_row(entity, batch_ingestion_ts)
    for entity in feed.entity
    if entity.HasField("vehicle")
]

if not rows:
    print("No vehicle updates available. Creating an empty DataFrame.")

# Create Spark DataFrame (the explicit schema also covers the empty-batch case)
df_rt = spark.createDataFrame(rows, schema)

# Add ingestion_date for partitioning
df_rt = df_rt.withColumn("ingestion_date", lit(INGESTION_DATE))

# Filter out rows with invalid timestamp (0 is the default for an unset field)
df_rt = df_rt.filter("timestamp > 0")




In [0]:
# 🆕 Step: Remove duplicates using anti-join based on vehicle_id and timestamp
from pyspark.sql.utils import AnalysisException

try:
    # Only the load is guarded, and only against AnalysisException (which is
    # what Spark raises when the path is missing or is not a Delta table).
    # Any other failure now propagates instead of being swallowed, so the
    # notebook never silently appends a batch without de-duplication.
    existing_table = spark.read.format("delta").load(BRONZE_RT_PATH)
except AnalysisException as e:
    print(f"✓ No existing data found or table is empty. Proceeding without anti-join. Error: {e}")
else:
    # Only the key columns are needed to decide whether a row is already stored.
    existing_df = existing_table.select("vehicle_id", "timestamp")

    # left_anti keeps only the new rows whose (vehicle_id, timestamp) pair has
    # no match in the existing Bronze data.
    df_rt = df_rt.join(
        existing_df,
        on=["vehicle_id", "timestamp"],
        how="left_anti",
    )


In [0]:
# Append this batch to the Bronze Delta table, one partition per ingestion_date.
bronze_writer = (
    df_rt.write
    .format("delta")
    .mode("append")
    .partitionBy("ingestion_date")
)
bronze_writer.save(BRONZE_RT_PATH)

print("✓ GTFS-RT Bronze ingest appended to Delta without duplicates")


In [0]:
# Read the Bronze table back and preview a few of the rows stored under
# today's ingestion_date partition as a quick post-write check.
df_check = spark.read.format("delta").load(BRONZE_RT_PATH)
todays_rows = df_check.filter(f"ingestion_date = '{INGESTION_DATE}'")
todays_rows.show(5, truncate=False)
