In [0]:
# Notebook parameter: which city's tables to process (widget default: "la").
dbutils.widgets.text("city", "la")
city = dbutils.widgets.get("city").lower().strip()
# `city` is interpolated into table names and SQL text further down, so only
# the known values are accepted; the message makes a bad widget value obvious.
assert city in {"la","nyc"}, f"Unsupported city {city!r}; expected one of: la, nyc"

from pyspark.sql import functions as F, types as T

# Names are always built as <CATALOG>.<schema>.<table>.
CATALOG = "airbnb_lab3"
BRONZE_DB = "airbnb_bronze"  # schema the bronze tables are read from
SILVER_DB = "airbnb_silver"  # schema the silver tables are written to

# Make sure the target schema exists before any silver table is written.
spark.sql(
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SILVER_DB}"
)

def fq(db, t):
    """Return the fully qualified name ``<CATALOG>.<db>.<t>`` for a table.

    Args:
        db: Schema (database) name inside ``CATALOG``.
        t: Table name inside that schema.

    Returns:
        The three parts joined with dots.
    """
    return "{}.{}.{}".format(CATALOG, db, t)

def table_exists(db_3part: str, table_name: str) -> bool:
    """Report whether ``SHOW TABLES`` lists ``table_name`` in the given schema.

    Args:
        db_3part: Schema qualifier placed after ``IN``; callers in this
            notebook pass ``"<catalog>.<schema>"``.
        table_name: Name placed inside the ``LIKE '...'`` clause.

    Returns:
        True when the query returns at least one row, otherwise False.
    """
    # NOTE(review): the LIKE clause is a pattern, not an exact-match test;
    # confirm that is acceptable if names with wildcard characters are passed.
    query = f"SHOW TABLES IN {db_3part} LIKE '{table_name}'"
    matches = spark.sql(query)
    return matches.count() > 0

def money_to_double(col):
    """Turn a currency-formatted string column into a double column.

    A single leading ``$`` is removed, then every comma, and the remaining
    text is cast to ``double``.

    Args:
        col: Column expression holding the price text.

    Returns:
        The column expression produced by the final ``cast("double")``.
    """
    without_symbol = F.regexp_replace(col, r"^\$", "")
    without_commas = F.regexp_replace(without_symbol, ",", "")
    return without_commas.cast("double")


In [0]:
# =======================================================
#                    LISTINGS  -> SILVER
# =======================================================
# Read the city's bronze listings, keep and cast the silver columns, normalise
# the text columns, apply the DQ rules, then create the silver Delta table on
# the first run or upsert into it (keyed on listing_id) on later runs.
bronze_listings_tbl = fq(BRONZE_DB, f"bronze_listings_{city}")
silver_listings_tbl = fq(SILVER_DB, f"silver_listings_{city}")
bronze_listings = spark.table(bronze_listings_tbl)

# ---- silver_listings ----
silver_listings = (
    bronze_listings.select(
        F.col("id").cast("long").alias("listing_id"),
        F.col("host_id").cast("long").alias("host_id"),
        F.col("host_name"),
        # Prefer the cleansed neighbourhood and fall back to the raw one.
        F.coalesce(F.col("neighbourhood_cleansed"), F.col("neighbourhood")).alias("neighbourhood"),
        F.col("room_type"),
        money_to_double(F.col("price")).alias("price"),
        F.col("number_of_reviews").cast("int").alias("number_of_reviews"),
        F.col("review_scores_rating").cast("double").alias("review_scores_rating"),
        F.col("latitude").cast("double").alias("latitude"),
        F.col("longitude").cast("double").alias("longitude"),
    )
    .withColumn("city", F.lit(city.upper()))
    .withColumn("room_type", F.lower(F.trim("room_type")))
    .withColumn("neighbourhood", F.lower(F.trim("neighbourhood")))
)

# Minimal DQ hardening
# The `isNull() |` arms tolerate NULLs at this stage; the null drop below
# removes such rows anyway, so they only matter if that drop is ever relaxed.
allowed_rooms = F.array([F.lit(x) for x in ["entire home/apt","private room","shared room","hotel room"]])
silver_listings = (
    silver_listings
      .filter(F.col("listing_id").isNotNull())
      .filter(F.col("price").isNull() | (F.col("price") >= F.lit(0.0)))
      .filter(F.col("room_type").isNull() | F.array_contains(allowed_rooms, F.col("room_type")))
)

# Drop ANY NULL across all columns
silver_listings = silver_listings.na.drop("any")

# Keep one row per merge key: a Delta MERGE fails when several source rows
# match the same target row. Done after the null drop so the surviving row is
# a complete one.
# NOTE(review): which duplicate survives is arbitrary - add an explicit
# ordering if bronze can hold several snapshots of the same listing.
silver_listings = silver_listings.dropDuplicates(["listing_id"])

# First time: create the table; next times: MERGE on listing_id
if not table_exists(f"{CATALOG}.{SILVER_DB}", f"silver_listings_{city}"):
    (silver_listings.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema","true")
        .saveAsTable(silver_listings_tbl))
else:
    silver_listings.createOrReplaceTempView("updates_listings")
    spark.sql(f"""
        MERGE INTO {silver_listings_tbl} AS t
        USING updates_listings AS s
        ON  t.listing_id = s.listing_id
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """)

# =======================================================
#                    CALENDAR  -> SILVER
# =======================================================
# Read the city's bronze calendar, cast the columns, derive availability /
# occupancy flags and the yyyymm partition key, apply the DQ rules, then create
# the partitioned silver Delta table on the first run or upsert into it
# (keyed on listing_id + date) on later runs.
bronze_calendar_tbl = fq(BRONZE_DB, f"bronze_calendar_{city}")
silver_calendar_tbl = fq(SILVER_DB, f"silver_calendar_{city}")
bronze_calendar = spark.table(bronze_calendar_tbl)

silver_calendar = (
    bronze_calendar.select(
        F.col("listing_id").cast("long").alias("listing_id"),
        F.to_date("date").alias("date"),
        money_to_double(F.col("price")).alias("price"),
        money_to_double(F.col("adjusted_price")).alias("adjusted_price"),
        F.col("minimum_nights").cast("int").alias("minimum_nights"),
        F.col("maximum_nights").cast("int").alias("maximum_nights"),
        # "t" / "f" become True / False; any other value becomes NULL.
        F.when(F.col("available") == F.lit("t"), F.lit(True))
         .when(F.col("available") == F.lit("f"), F.lit(False))
         .otherwise(F.lit(None)).alias("is_available"),
    )
    # Occupied only when explicitly unavailable; a NULL flag counts as not occupied.
    .withColumn("is_occupied", F.when(F.col("is_available") == F.lit(False), F.lit(True)).otherwise(F.lit(False)))
    .withColumn("city", F.lit(city.upper()))
    # Partition key; computed once, before the null drop below.
    .withColumn("yyyymm", F.date_format("date","yyyy-MM"))
    .filter(F.col("date").isNotNull())
    .filter(F.col("price").isNull() | (F.col("price") >= F.lit(0.0)))
    .filter(F.col("adjusted_price").isNull() | (F.col("adjusted_price") >= F.lit(0.0)))
)

# Drop ANY NULL across all columns
# NOTE(review): this also removes rows whose adjusted_price is NULL - confirm
# the bronze feed always populates that column.
silver_calendar = silver_calendar.na.drop("any")

# Keep one row per merge key: a Delta MERGE fails when several source rows
# match the same target row.
# NOTE(review): which duplicate survives is arbitrary - add an explicit
# ordering if bronze can hold several snapshots of the same listing/date.
silver_calendar = silver_calendar.dropDuplicates(["listing_id", "date"])

# First time: create partitioned table; next times: MERGE
if not table_exists(f"{CATALOG}.{SILVER_DB}", f"silver_calendar_{city}"):
    (silver_calendar.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema","true")
        .partitionBy("yyyymm")
        .saveAsTable(silver_calendar_tbl))
else:
    silver_calendar.createOrReplaceTempView("updates_calendar")
    spark.sql(f"""
        MERGE INTO {silver_calendar_tbl} AS t
        USING updates_calendar AS s
        ON  t.listing_id = s.listing_id AND t.date = s.date
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """)

# =======================================================
#                    REVIEWS   -> SILVER
# =======================================================
# Reviews are optional: the whole section is skipped when the city's bronze
# reviews table is absent. Otherwise cast the columns, drop incomplete rows,
# then create the silver Delta table on the first run or upsert into it
# (keyed on review_id) on later runs.
bronze_reviews_name = f"bronze_reviews_{city}"
if table_exists(f"{CATALOG}.{BRONZE_DB}", bronze_reviews_name):
    bronze_reviews = spark.table(fq(BRONZE_DB, bronze_reviews_name))
    silver_reviews_tbl = fq(SILVER_DB, f"silver_reviews_{city}")

    silver_reviews = (
        bronze_reviews.select(
            F.col("listing_id").cast("long").alias("listing_id"),
            F.col("id").cast("long").alias("review_id"),
            F.to_date("date").alias("review_date"),
            F.col("reviewer_id").cast("long").alias("reviewer_id"),
            F.col("reviewer_name"),
            F.col("comments"),
        )
        .withColumn("city", F.lit(city.upper()))
        .filter(F.col("review_id").isNotNull())
    )

    # Drop ANY NULL across all columns
    silver_reviews = silver_reviews.na.drop("any")

    # Keep one row per merge key: a Delta MERGE fails when several source rows
    # match the same target row.
    # NOTE(review): which duplicate survives is arbitrary - add an explicit
    # ordering if bronze can hold the same review more than once.
    silver_reviews = silver_reviews.dropDuplicates(["review_id"])

    # First time: create the table; next times: MERGE on review_id
    if not table_exists(f"{CATALOG}.{SILVER_DB}", f"silver_reviews_{city}"):
        (silver_reviews.write
            .format("delta")
            .mode("overwrite")
            .option("overwriteSchema","true")
            .saveAsTable(silver_reviews_tbl))
    else:
        silver_reviews.createOrReplaceTempView("updates_reviews")
        spark.sql(f"""
            MERGE INTO {silver_reviews_tbl} AS t
            USING updates_reviews AS s
            ON  t.review_id = s.review_id
            WHEN MATCHED THEN UPDATE SET *
            WHEN NOT MATCHED THEN INSERT *
        """)

# Completion message. The check mark is written as an escape (U+2705) so it
# survives re-encoding of the notebook file.
print(f"[SILVER \u2705] {city.upper()} written to {CATALOG}.{SILVER_DB}")


In [0]:
# List the silver schema's tables as a quick post-run check; the name is built
# from the notebook constants so it cannot drift from where the tables are written.
spark.sql(f"SHOW TABLES IN {CATALOG}.{SILVER_DB}").show()
