In [0]:
# Notebook parameter: the city whose bronze tables are promoted to silver.
dbutils.widgets.text("city", "la")
city = dbutils.widgets.get("city").lower().strip()
# Fail fast with an actionable message: the value is interpolated into table
# names further down, so an unexpected city must not get past this point.
assert city in {"la", "nyc"}, f"Unsupported city {city!r}; expected one of: la, nyc"

from pyspark.sql import functions as F

# Table names in this notebook are three-part: <CATALOG>.<schema>.<table>.
CATALOG = "airbnb_lab3"
BRONZE_DB = "airbnb_bronze"  # schema the bronze inputs are read from
SILVER_DB = "airbnb_silver"  # schema the silver outputs are written to

# Create the silver schema up front so the saveAsTable calls below have a target.
spark.sql("CREATE SCHEMA IF NOT EXISTS {}.{}".format(CATALOG, SILVER_DB))
def fq(db, t):
    """Return the fully qualified table name ``<CATALOG>.<db>.<t>``.

    Args:
        db: Schema name (e.g. ``BRONZE_DB`` or ``SILVER_DB``).
        t: Table name within that schema.
    """
    return "{}.{}.{}".format(CATALOG, db, t)

def money_to_double(col):
    """Convert a currency-formatted string column into a double column.

    A single leading ``$`` is removed, then every comma is removed, and the
    remaining text is cast to ``double``.

    Args:
        col: Column expression holding the formatted amount.

    Returns:
        Column expression of type double.
    """
    without_symbol = F.regexp_replace(col, r"^\$", "")
    without_separators = F.regexp_replace(without_symbol, ",", "")
    return without_separators.cast("double")

def table_exists(db_3part: str, table_name: str) -> bool:
    """Return True when a persisted table matching ``table_name`` is listed in ``db_3part``.

    Args:
        db_3part: Schema to search, interpolated after ``SHOW TABLES IN``
            (callers in this notebook pass ``<catalog>.<schema>``).
        table_name: Name to look for; it is used as the ``LIKE`` pattern.

    Returns:
        True if at least one non-temporary entry matches, False otherwise.
    """
    listing = spark.sql(f"SHOW TABLES IN {db_3part} LIKE '{table_name}'")
    # SHOW TABLES may also list session-scoped temporary views whose names
    # match the pattern. Those are not tables stored in db_3part, so they are
    # excluded to avoid a false positive.
    persisted = listing.where("isTemporary = false")
    return persisted.count() > 0


In [0]:
# ===================== LISTINGS -> SILVER =====================
# Source: the per-city listings table in the bronze schema.
bronze_listings = fq(BRONZE_DB, f"bronze_listings_{city}")
l = spark.table(bronze_listings)

# One projection does all the work: ids and metrics are cast, the price string
# is parsed, text dimensions are trimmed and lower-cased, and the upper-cased
# city code is appended as the final column.
silver_listings = l.select(
    F.col("id").cast("long").alias("listing_id"),
    F.col("host_id").cast("long").alias("host_id"),
    F.col("host_name"),
    # Prefer the cleansed neighbourhood and fall back to the raw one when it is NULL.
    F.lower(
        F.trim(F.coalesce(F.col("neighbourhood_cleansed"), F.col("neighbourhood")))
    ).alias("neighbourhood"),
    F.lower(F.trim(F.col("room_type"))).alias("room_type"),
    money_to_double(F.col("price")).alias("price"),
    F.col("number_of_reviews").cast("int").alias("number_of_reviews"),
    F.col("review_scores_rating").cast("double").alias("review_scores_rating"),
    F.col("latitude").cast("double").alias("latitude"),
    F.col("longitude").cast("double").alias("longitude"),
    F.lit(city.upper()).alias("city"),
)

# Replace the silver table wholesale, letting the schema change between runs.
silver_listings_tbl = fq(SILVER_DB, f"silver_listings_{city}")
(
    silver_listings.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(silver_listings_tbl)
)

# ===================== CALENDAR -> SILVER =====================
# Source: the per-city calendar table in the bronze schema.
bronze_calendar = fq(BRONZE_DB, f"bronze_calendar_{city}")
c = spark.table(bronze_calendar)

silver_calendar = (
    c.select(
        F.col("listing_id").cast("long").alias("listing_id"),
        F.to_date("date").alias("date"),
        money_to_double(F.col("price")).alias("price"),
        money_to_double(F.col("adjusted_price")).alias("adjusted_price"),
        F.col("minimum_nights").cast("int").alias("minimum_nights"),
        F.col("maximum_nights").cast("int").alias("maximum_nights"),
        # 't' -> True, 'f' -> False; a `when` chain with no `otherwise`
        # yields NULL for every other value.
        F.when(F.col("available") == "t", F.lit(True))
         .when(F.col("available") == "f", F.lit(False))
         .alias("is_available"),
    )
    # Null-safe equality: True only when is_available is False. A NULL
    # availability therefore produces False rather than NULL.
    .withColumn("is_occupied", F.col("is_available").eqNullSafe(F.lit(False)))
    .withColumn("city", F.lit(city.upper()))
)

# Replace the silver table wholesale, letting the schema change between runs.
silver_calendar_tbl = fq(SILVER_DB, f"silver_calendar_{city}")
(
    silver_calendar.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(silver_calendar_tbl)
)

# ===================== REVIEWS -> SILVER =====================
# Reviews are optional: the step runs only when a bronze reviews table exists
# for this city.
bronze_reviews_name = f"bronze_reviews_{city}"
# Resolve the target name unconditionally so that code running after this step
# can reference silver_reviews_tbl even when the branch below is skipped.
silver_reviews_tbl = fq(SILVER_DB, f"silver_reviews_{city}")
if table_exists(f"{CATALOG}.{BRONZE_DB}", bronze_reviews_name):
    r = spark.table(fq(BRONZE_DB, bronze_reviews_name))
    silver_reviews = (
        r.select(
            F.col("listing_id").cast("long").alias("listing_id"),
            F.col("id").cast("long").alias("review_id"),
            F.to_date("date").alias("review_date"),
            F.col("reviewer_id").cast("long").alias("reviewer_id"),
            F.col("reviewer_name"),
            F.col("comments")
        ).withColumn("city", F.lit(city.upper()))
    )
    # Replace the silver table wholesale, letting the schema change between runs.
    (silver_reviews.write.format("delta").mode("overwrite").option("overwriteSchema","true").saveAsTable(silver_reviews_tbl))


In [0]:
# Summarise the silver tables for this city. The reviews table name is derived
# here from the city, so this cell does not depend on a variable set by the
# optional reviews step; the line is shown when that silver table exists.
print(
    f"[SILVER ✅] {city.upper()} written:\n  {silver_listings_tbl}\n  {silver_calendar_tbl}"
    + (
        "\n  " + fq(SILVER_DB, f"silver_reviews_{city}")
        if table_exists(f"{CATALOG}.{SILVER_DB}", f"silver_reviews_{city}")
        else ""
    )
)

In [0]:
# List the contents of the silver schema. The name is built from the shared
# constants so it follows CATALOG / SILVER_DB instead of repeating their values.
spark.sql(f"SHOW TABLES IN {CATALOG}.{SILVER_DB}").show()
