## 13 – Create Platinum Dimensions (Static Data)

This notebook builds the **dimension tables** for static GTFS data (*routes* and *trips*) in a **Platinum Star Schema**.

- It uses **SCD Type 2** records from the Bronze layer as input  
- Each dimension is enriched with a **surrogate key** for BI lookups  
- Output is saved to `dbfs:/plat/` in Delta format (one table per dimension)


In [0]:
from pyspark.sql import functions as F, Window
import datetime as dt
# Root folder of the Bronze GTFS static Delta tables (SCD2 history).
# The cells below read the `routes` and `trips` sub-paths under it.
BRONZE_STATIC = "dbfs:/bronze/gtfs_static" # SCD-2 routes, trips
# Root folder for the Platinum dimension tables; each dimension is written
# to its own sub-path (`dim_route`, `dim_trip`).
PLAT = "dbfs:/plat"

In [0]:
def add_surrogate(df, cols, sk_col):
    """
    Append a hash-based surrogate-key column to ``df``.

    Each column named in ``cols`` is cast to string, the values are joined
    in the given order with a "||" separator, and the SHA-256 digest of the
    joined string is stored in ``sk_col``. The key depends only on the
    column values, so identical inputs always yield the same key.

    Args:
        df: DataFrame to extend.
        cols: Names of the columns that make up the key, in hashing order.
        sk_col: Name of the surrogate-key column to create.

    Returns:
        ``df`` with the ``sk_col`` column added.

    NOTE(review): ``concat_ws`` skips NULL inputs, so a row with a NULL in
    one of ``cols`` is hashed from the remaining values only -- confirm the
    key columns are never NULL.
    """
    key_parts = []
    for name in cols:
        key_parts.append(F.col(name).cast("string"))

    joined_key = F.concat_ws("||", *key_parts)
    digest = F.sha2(joined_key, 256)
    return df.withColumn(sk_col, digest)

In [0]:
# dim_route: Bronze SCD2 routes -> typed columns -> surrogate key `route_sk`
dim_route = add_surrogate(
    spark.read.format("delta")
    .load(f"{BRONZE_STATIC}/routes")
    # Pin the types of the columns this dimension exposes
    .withColumn("route_id", F.col("route_id").cast("string"))
    .withColumn("route_short_name", F.col("route_short_name").cast("string"))
    .withColumn("route_type", F.col("route_type").cast("int"))
    # Timestamp taken when the DataFrame is evaluated
    .withColumn("ingestion_ts", F.current_timestamp()),
    # Key is hashed from the natural key plus the SCD2 start_time column
    ["route_id", "start_time"],
    "route_sk",
)

# Persist to the Platinum layer, replacing any previous contents
(
    dim_route.write
    .format("delta")
    .mode("overwrite")
    .save(f"{PLAT}/dim_route")
)

# dim_route is not cached, so count() is a second Spark action over the same plan
print(f"✓ dim_route rows: {dim_route.count()}")


In [0]:
# Confirm surrogate-key uniqueness: the number of distinct route_sk values
# must equal the number of rows. Fail loudly instead of relying on the reader
# to compare this number with the row count printed by the previous cell.
route_rows = dim_route.count()
route_sk_distinct = dim_route.select('route_sk').distinct().count()
assert route_sk_distinct == route_rows, (
    f"dim_route has duplicate route_sk values: "
    f"{route_rows} rows vs {route_sk_distinct} distinct keys"
)
# Keep the distinct-key count as the cell output
route_sk_distinct

In [0]:
# dim_trip: Bronze SCD2 trips -> typed columns -> surrogate key `trip_sk`
dim_trip = add_surrogate(
    spark.read.format("delta")
    .load(f"{BRONZE_STATIC}/trips")
    # Pin the types of the columns this dimension exposes
    .withColumn("trip_id", F.col("trip_id").cast("string"))
    .withColumn("route_id", F.col("route_id").cast("string"))
    .withColumn("service_id", F.col("service_id").cast("string"))
    .withColumn("direction_id", F.col("direction_id").cast("int"))
    # Timestamp taken when the DataFrame is evaluated
    .withColumn("ingestion_ts", F.current_timestamp()),
    # Key is hashed from the natural key plus the SCD2 start_time column
    ["trip_id", "start_time"],
    "trip_sk",
)

# Persist to the Platinum layer, replacing any previous contents
(
    dim_trip.write
    .format("delta")
    .mode("overwrite")
    .save(f"{PLAT}/dim_trip")
)

# dim_trip is not cached, so count() is a second Spark action over the same plan
print(f"✓ dim_trip rows: {dim_trip.count()}")


In [0]:
# Confirm surrogate-key uniqueness: the number of distinct trip_sk values
# must equal the number of rows. Fail loudly instead of relying on the reader
# to compare this number with the row count printed by the previous cell.
trip_rows = dim_trip.count()
trip_sk_distinct = dim_trip.select('trip_sk').distinct().count()
assert trip_sk_distinct == trip_rows, (
    f"dim_trip has duplicate trip_sk values: "
    f"{trip_rows} rows vs {trip_sk_distinct} distinct keys"
)
# Keep the distinct-key count as the cell output
trip_sk_distinct