In [0]:
# Imports
from pyspark.sql.functions import col, to_date, date_format, year, month, dayofweek, length

In [0]:
# Storage roots of the two lakehouse layers this notebook reads from / writes to.
# NOTE(review): both roots end with "/" and the cells that use them append
# "/<table>", so the final paths contain "//" — presumably tolerated by the
# storage layer; confirm before changing either value.
GOLD_PATH = "/mnt/gold/"
SILVER_PATH = "/mnt/silver/"


In [0]:
# Load Silver tables
def _read_silver(table_name):
    """Return the Delta table stored under SILVER_PATH in the folder `table_name`."""
    return spark.read.format("delta").load(f"{SILVER_PATH}/{table_name}")


# One DataFrame per Silver table consumed by the Gold build below
df_realtime = _read_silver("realtime_trips")
df_stops = _read_silver("stops")
df_trips = _read_silver("trips")
df_routes = _read_silver("routes")
df_calendar_dates = _read_silver("calendar_dates")

In [0]:
# Dimension: DimDate
# One row per distinct (date, service_id) pair found in calendar_dates.
#
# The parsed calendar date is built as a standalone expression rather than via
# .withColumn("Date", ...). With Spark's default case-insensitive name
# resolution, adding a column named "Date" REPLACES the source column "date";
# col("date") then resolves to the parsed value and DateKey silently becomes a
# copy of Date instead of the original key.
source_date = col("date")
calendar_day = to_date(source_date.cast("string"), "yyyyMMdd")

dim_date = (
    df_calendar_dates
    .select(
        source_date.alias("DateKey"),                      # original Silver value, untouched
        calendar_day.alias("Date"),                        # same value parsed as yyyyMMdd
        year(calendar_day).alias("Year"),
        month(calendar_day).alias("Month"),
        dayofweek(calendar_day).alias("DayOfWeek"),        # Spark numbering: 1 = Sunday ... 7 = Saturday
        date_format(calendar_day, "EEEE").alias("DayName"),
        col("service_id").alias("ServiceKey"),
    )
    .distinct()
)

# overwriteSchema allows this full refresh to replace a DimDate that was
# previously written with a date-typed DateKey. Any other table that persisted
# the old DateKey type needs the same treatment the first time it is rewritten.
(
    dim_date.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f"{GOLD_PATH}/DimDate")
)

# Dimension: DimStop
# Silver column -> Gold column, listed in output order
stop_column_map = {
    "stop_id": "StopKey",
    "stop_name": "StopName",
    "latitude": "Latitude",
    "longitude": "Longitude",
    "wheelchair_boarding": "WheelchairAccessible",
}
dim_stop = df_stops.select(
    [col(source).alias(target) for source, target in stop_column_map.items()]
)

# Replace the stored dimension with the freshly projected rows
stop_writer = dim_stop.write.format("delta").mode("overwrite")
stop_writer.save(f"{GOLD_PATH}/DimStop")

# Dimension: DimRoute
# Renamed column expressions, declared up front so the select reads as a list
route_key = col("route_id").alias("RouteKey")
route_name = col("route_long_name").alias("RouteName")
route_alias = col("route_short_name").alias("RouteAlias")
route_type = col("route_type").alias("RouteType")

dim_route = df_routes.select(route_key, route_name, route_alias, route_type)

# Full overwrite of the stored dimension
(
    dim_route.write
    .format("delta")
    .mode("overwrite")
    .save(f"{GOLD_PATH}/DimRoute")
)

# Dimension: DimTrip
# (Silver column, Gold column) pairs in output order
trip_columns = [
    ("trip_id", "TripKey"),
    ("route_id", "RouteKey"),      # FK to DimRoute
    ("service_id", "ServiceKey"),  # FK to DimDate
    # FK to DimShape — NOTE(review): no DimShape is written in the cells shown here; confirm it exists
    ("shape_id", "ShapeKey"),
]
dim_trip = df_trips.select(*(col(src).alias(dst) for src, dst in trip_columns))

# Same Delta overwrite as the other dimensions, using the keyword form of save()
dim_trip.write.save(f"{GOLD_PATH}/DimTrip", format="delta", mode="overwrite")

# Dimension: DimVehicle
# One row per distinct vehicle_id present in the realtime table
vehicle_keys = df_realtime.select(col("vehicle_id").alias("VehicleKey"))
dim_vehicle = vehicle_keys.distinct()
dim_vehicle.write.mode("overwrite").format("delta").save(f"{GOLD_PATH}/DimVehicle")

# Status line emitted once every dimension write above has returned
print("\nAll Dimension tables created successfully.")


All Dimension tables created successfully.


In [0]:
# Facts: FactVehicleUpdates
# Step 1 — project the realtime rows onto the fact's column names.
# EventDate is only used as a join key below; it is not written to the fact.
base_facts = df_realtime.select(
    to_date(col("event_timestamp")).alias("EventDate"),
    col("trip_id").alias("TripKey"),
    col("stop_id").alias("StopKey"),
    col("vehicle_id").alias("VehicleKey"),
    col("stop_sequence").alias("StopSequence"),
    col("arrival_delay_seconds").alias("ArrivalDelaySeconds"),
    col("arrival_timestamp").alias("ArrivalTime"),
    col("departure_delay_seconds").alias("DepartureDelaySeconds"),
    col("departure_timestamp").alias("DepartureTime"),
    col("latitude").alias("Latitude"),
    col("longitude").alias("Longitude"),
)

# Step 2 — enrich with dimension keys.
#   * dim_trip supplies RouteKey, ServiceKey and ShapeKey for each TripKey.
#   * dim_date supplies DateKey for the (event date, service) pair.
# Both joins are LEFT joins, so every realtime row is kept.
# NOTE(review): a row whose (EventDate, ServiceKey) pair has no match in dim_date
# gets a NULL DateKey and is written to the NULL partition — confirm that is the
# intended handling for dates missing from calendar_dates.
fact_enriched = (
    base_facts
    .join(dim_trip, on="TripKey", how="left")
    .join(
        dim_date,
        (base_facts.EventDate == dim_date.Date) & (dim_trip.ServiceKey == dim_date.ServiceKey),
        how="left",
    )
    .select(
        col("DateKey"),
        col("StopKey"),
        col("RouteKey"),
        col("TripKey"),
        col("VehicleKey"),
        col("ShapeKey"),
        col("StopSequence"),
        col("ArrivalDelaySeconds"),
        col("ArrivalTime"),
        col("DepartureDelaySeconds"),
        col("DepartureTime"),
        col("Latitude"),
        col("Longitude"),
    )
)

# Step 3 — full refresh of the fact table, partitioned by DateKey.
# DateKey's type is inherited from dim_date; overwriteSchema lets the refresh
# replace the stored schema/partitioning instead of failing Delta's schema
# enforcement if that type ever differs from what was written before.
(
    fact_enriched.write.format("delta")
    .partitionBy("DateKey")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f"{GOLD_PATH}/FactVehicleUpdates")
)

print("Fact table created successfully.")

Fact table created successfully.
