# In [0]: (notebook cell marker)
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

# ==========================================
# 1. BOOKINGS (Fact Table)
# Flow: Stage -> Trans -> Silver (3 Steps)
# ==========================================

@dlt.table(name="stage_bookings")
def stage_bookings():
    """Stream the bookings Delta data from the bronze volume into a staging table.

    Returns:
        A streaming DataFrame over the Delta files at the bronze bookings path.
    """
    delta_stream_reader = spark.readStream.format("delta")
    return delta_stream_reader.load(
        "/Volumes/workspace/bronze/bronzevolume/bookings/data"
    )

@dlt.view(name="trans_bookings")
def trans_bookings():
    """Type-cast and timestamp the staged bookings stream.

    Reads ``stage_bookings`` as a stream, casts ``amount`` to double,
    converts ``booking_date`` to a date, stamps each row with the current
    timestamp in ``modifiedDate`` and removes the ``_rescued_data`` column.

    Returns:
        The transformed streaming DataFrame.
    """
    staged = dlt.read_stream("stage_bookings")

    # Normalise column types before the data reaches the silver layer.
    typed = staged.withColumn("amount", col("amount").cast(DoubleType()))
    typed = typed.withColumn("booking_date", to_date(col("booking_date")))

    # modifiedDate records when the row passed through this view.
    stamped = typed.withColumn("modifiedDate", current_timestamp())

    return stamped.drop("_rescued_data")

# Expectations applied to silver_bookings: name -> SQL constraint.
# Rows violating any constraint are dropped (see expect_all_or_drop below).
booking_rules = dict(
    valid_booking_id="booking_id IS NOT NULL",
    valid_passenger="passenger_id IS NOT NULL",
)


@dlt.table(name="silver_bookings")
@dlt.expect_all_or_drop(booking_rules)
def silver_bookings():
    """Publish the transformed bookings stream as the silver bookings table.

    Returns:
        The ``trans_bookings`` view read as a stream; the ``booking_rules``
        expectations are enforced by the decorator.
    """
    validated_source = dlt.read_stream("trans_bookings")
    return validated_source


# ==========================================
# 2. FLIGHTS (Dimension)
# Flow: Trans -> Silver (Direct Read - 2 Steps)
# ==========================================

@dlt.view(name="trans_flights")
def trans_flights():
    """Stream the bronze flights data with a typed date and a load timestamp.

    Reads the Delta files at the bronze flights path, converts
    ``flight_date`` to a date, stamps each row with the current timestamp in
    ``modifiedDate`` and removes the ``_rescued_data`` column.

    Returns:
        The transformed streaming DataFrame.
    """
    raw_flights = spark.readStream.format("delta").load(
        "/Volumes/workspace/bronze/bronzevolume/flights/data"
    )
    dated = raw_flights.withColumn("flight_date", to_date(col("flight_date")))

    # modifiedDate is later used as the sequencing column for silver_flights.
    stamped = dated.withColumn("modifiedDate", current_timestamp())

    return stamped.drop("_rescued_data")

# Declare the silver flights target, then upsert the trans_flights view into
# it as SCD type 1, keyed on flight_id.
dlt.create_streaming_table("silver_flights")

# NOTE(review): modifiedDate is a load-time current_timestamp() set in
# trans_flights, so rows sharing a flight_id within one micro-batch may tie on
# the sequence column -- confirm whether a source-side ordering column exists.
dlt.apply_changes(
    target="silver_flights",
    source="trans_flights",
    keys=["flight_id"],
    sequence_by=col("modifiedDate"),
    stored_as_scd_type=1,
)


# ==========================================
# 3. PASSENGERS (Dimension)
# Flow: Trans -> Silver (Direct Read - 2 Steps)
# ==========================================

@dlt.view(name="trans_passengers")
def trans_passengers():
    """Stream the bronze customers data with a load timestamp.

    Reads the Delta files at the bronze customers path, stamps each row with
    the current timestamp in ``modifiedDate`` and removes the
    ``_rescued_data`` column.

    Returns:
        The transformed streaming DataFrame.
    """
    raw_customers = spark.readStream.format("delta").load(
        "/Volumes/workspace/bronze/bronzevolume/customers/data"
    )

    # modifiedDate is later used as the sequencing column for silver_passengers.
    stamped = raw_customers.withColumn("modifiedDate", current_timestamp())

    return stamped.drop("_rescued_data")

# Declare the silver passengers target, then upsert the trans_passengers view
# into it as SCD type 1, keyed on passenger_id.
dlt.create_streaming_table("silver_passengers")

# NOTE(review): modifiedDate is a load-time current_timestamp() set in
# trans_passengers, so rows sharing a passenger_id within one micro-batch may
# tie on the sequence column -- confirm whether a source-side ordering column
# exists.
dlt.apply_changes(
    target="silver_passengers",
    source="trans_passengers",
    keys=["passenger_id"],
    sequence_by=col("modifiedDate"),
    stored_as_scd_type=1,
)


# ==========================================
# 4. AIRPORTS (Dimension)
# Flow: Trans -> Silver (Direct Read - 2 Steps)
# ==========================================

@dlt.view(name="trans_airports")
def trans_airports():
    """Stream the bronze airports data with a load timestamp.

    Reads the Delta files at the bronze airports path, stamps each row with
    the current timestamp in ``modifiedDate`` and removes the
    ``_rescued_data`` column.

    Returns:
        The transformed streaming DataFrame.
    """
    raw_airports = spark.readStream.format("delta").load(
        "/Volumes/workspace/bronze/bronzevolume/airports/data"
    )

    # modifiedDate is later used as the sequencing column for silver_airports.
    stamped = raw_airports.withColumn("modifiedDate", current_timestamp())

    return stamped.drop("_rescued_data")

# Declare the silver airports target, then upsert the trans_airports view into
# it as SCD type 1, keyed on airport_id.
dlt.create_streaming_table("silver_airports")

# NOTE(review): modifiedDate is a load-time current_timestamp() set in
# trans_airports, so rows sharing an airport_id within one micro-batch may tie
# on the sequence column -- confirm whether a source-side ordering column
# exists.
dlt.apply_changes(
    target="silver_airports",
    source="trans_airports",
    keys=["airport_id"],
    sequence_by=col("modifiedDate"),
    stored_as_scd_type=1,
)


# ==========================================
# 5. BUSINESS LAYER (The Big Join)
# Fix: Drops duplicate 'modifiedDate' to prevent errors
# ==========================================

@dlt.table(name="silver_business")
def silver_business():
    """Join the bookings stream with the passenger, flight and airport tables.

    ``silver_bookings`` is read as a stream and left-joined, in order, to
    ``silver_passengers``, ``silver_flights`` and ``silver_airports`` (each
    read as a batch table). ``modifiedDate`` is dropped from every dimension
    before joining, so the only ``modifiedDate`` in the result is the one
    coming from bookings.

    Returns:
        The joined DataFrame.
    """
    # (dimension table, join key) pairs, applied in this order.
    # NOTE(review): the airports join assumes 'airport_id' is present once
    # flights has been joined in -- confirm against the flights schema.
    dimension_joins = (
        ("silver_passengers", "passenger_id"),
        ("silver_flights", "flight_id"),
        ("silver_airports", "airport_id"),
    )

    enriched = dlt.read_stream("silver_bookings")
    for table_name, join_key in dimension_joins:
        # Dropping modifiedDate avoids a duplicate column name after the join.
        dimension = dlt.read(table_name).drop("modifiedDate")
        # Passing the key as a list merges the join column into a single one.
        enriched = enriched.join(dimension, [join_key], "left")

    return enriched