In [0]:
from pyspark.sql import functions as F

# Catalog and schema names used to build every three-part table name in this notebook.
CATALOG   = "airbnb_lab3"
# Schema the per-city silver tables are read from.
SILVER_DB = "airbnb_silver"
# Schema the curated gold tables are written to.
GOLD_DB   = "airbnb_gold"

# Make sure the gold schema exists before any gold table is saved into it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{GOLD_DB}")
def fq(db, t):
    """Return the fully qualified ``<CATALOG>.<db>.<t>`` table name.

    Args:
        db: Schema name placed in the middle of the three-part name.
        t: Table name placed at the end of the three-part name.

    Returns:
        The dotted three-part name as a string.
    """
    qualified_name = f"{CATALOG}.{db}.{t}"
    return qualified_name
def city_tbl(db, base, city):
    """Return the fully qualified name of a per-city table.

    The table part is ``<base>_<city>``, prefixed with the catalog and schema.

    Args:
        db: Schema name.
        base: Table name prefix shared by all cities (e.g. ``"silver_listings"``).
        city: City suffix appended after an underscore (e.g. ``"la"``).

    Returns:
        The dotted three-part name as a string.
    """
    per_city_name = f"{CATALOG}.{db}.{base}_{city}"
    return per_city_name

In [0]:
# ---------- Monthly city metrics (incl. occupancy) ----------
def monthly_metrics_for(city):
    """Aggregate one city's silver calendar and listings into monthly city metrics.

    The calendar is first reduced to one row per (listing_id, yyyymm) holding the
    number of occupied days, the number of calendar rows and the mean price. Each
    of those rows is then left-joined to the listing's ``review_scores_rating``
    and rolled up to one row per (city, yyyymm).

    Args:
        city: City suffix of the silver tables (e.g. ``"la"``); its upper-cased
            form is written to the ``city`` column.

    Returns:
        A DataFrame with columns ``city``, ``yyyymm``, ``total_listings``,
        ``avg_nightly_price``, ``avg_review_score`` and ``avg_occupancy_rate``.
    """
    listings = spark.table(city_tbl(SILVER_DB, "silver_listings", city)).alias("l")
    calendar = spark.table(city_tbl(SILVER_DB, "silver_calendar", city)).alias("c")
    calendar_by_month = calendar.withColumn("yyyymm", F.date_format("date", "yyyy-MM"))

    # Per listing and month: occupied days, calendar rows and mean price.
    occupied_days = F.sum(F.col("is_occupied").cast("int")).alias("occupied_days")
    total_days = F.count(F.lit(1)).alias("total_days")
    mean_price = F.avg("price").alias("avg_price")
    listing_month = calendar_by_month.groupBy("listing_id", "yyyymm").agg(
        occupied_days, total_days, mean_price
    )

    # Left join keeps listing-months whose listing has no row in the listings table.
    ratings = listings.select("listing_id", "review_scores_rating")
    enriched = listing_month.join(ratings, "listing_id", "left")
    enriched = enriched.withColumn("city", F.lit(city.upper()))
    enriched = enriched.withColumn(
        "occupancy_rate", F.col("occupied_days") / F.col("total_days")
    )

    # Roll the listing-month rows up to one row per city and month.
    return enriched.groupBy("city", "yyyymm").agg(
        F.countDistinct("listing_id").alias("total_listings"),
        F.avg("avg_price").alias("avg_nightly_price"),
        F.avg("review_scores_rating").alias("avg_review_score"),
        F.avg("occupancy_rate").alias("avg_occupancy_rate"),
    )

In [0]:
# Target gold table for the combined LA + NYC monthly city metrics.
gold_city_month = fq(GOLD_DB, "gold_city_month_metrics")

# Stack both cities by column name, then overwrite the gold Delta table.
city_month = monthly_metrics_for("la").unionByName(
    monthly_metrics_for("nyc"),
    allowMissingColumns=True,
)
city_month.write.format("delta").mode("overwrite").saveAsTable(gold_city_month)

In [0]:
# ---------- Neighbourhood by month ----------
def neighbourhood_metrics_for(city):
    """Aggregate one city's calendar into monthly metrics per neighbourhood.

    The calendar is reduced to one row per (listing_id, yyyymm), left-joined to
    the listing's ``neighbourhood``, and grouped by city, month and neighbourhood.

    Args:
        city: City suffix of the silver tables (e.g. ``"la"``); its upper-cased
            form is written to the ``city`` column.

    Returns:
        A DataFrame with columns ``city``, ``yyyymm``, ``neighbourhood``,
        ``listing_count``, ``avg_price`` and ``avg_occupancy_rate``.
    """
    listings = spark.table(city_tbl(SILVER_DB, "silver_listings", city))
    calendar = spark.table(city_tbl(SILVER_DB, "silver_calendar", city))
    calendar = calendar.withColumn("yyyymm", F.date_format("date", "yyyy-MM"))

    # Per listing and month: occupied days, calendar rows and mean price.
    listing_month = calendar.groupBy("listing_id", "yyyymm").agg(
        F.sum(F.col("is_occupied").cast("int")).alias("occupied_days"),
        F.count(F.lit(1)).alias("total_days"),
        F.avg("price").alias("avg_price"),
    )

    # Left join: listing-months without a listings row get a null neighbourhood.
    neighbourhoods = listings.select("listing_id", "neighbourhood")
    with_neighbourhood = listing_month.join(neighbourhoods, "listing_id", "left")

    # The city is a constant per call, so it is supplied as a literal grouping column.
    city_col = F.lit(city.upper()).alias("city")
    occupancy = F.col("occupied_days") / F.col("total_days")
    return with_neighbourhood.groupBy(city_col, "yyyymm", "neighbourhood").agg(
        F.countDistinct("listing_id").alias("listing_count"),
        F.avg("avg_price").alias("avg_price"),
        F.avg(occupancy).alias("avg_occupancy_rate"),
    )

In [0]:
# Target gold table for the combined LA + NYC neighbourhood metrics.
gold_neigh = fq(GOLD_DB, "gold_neighbourhood_metrics")

# Stack both cities by column name, then overwrite the gold Delta table.
neigh_all = neighbourhood_metrics_for("la").unionByName(
    neighbourhood_metrics_for("nyc")
)
neigh_all.write.format("delta").mode("overwrite").saveAsTable(gold_neigh)

In [0]:
# ---------- Room type share by month (active listings) ----------
def roomtype_share_for(city):
    """Compute each room type's share of a city's listings for every month.

    A listing counts for a month when it has at least one calendar row in that
    month and also appears in the listings table (inner join).

    Args:
        city: City suffix of the silver tables (e.g. ``"la"``); its upper-cased
            form is written to the ``city`` column.

    Returns:
        A DataFrame with columns ``city``, ``yyyymm``, ``room_type``, ``cnt``
        (distinct listings of that room type), ``total`` (sum of ``cnt`` over the
        city-month) and ``share`` (``cnt / total``).
    """
    listings = spark.table(city_tbl(SILVER_DB, "silver_listings", city))
    calendar = spark.table(city_tbl(SILVER_DB, "silver_calendar", city))
    calendar = calendar.withColumn("yyyymm", F.date_format("date", "yyyy-MM"))

    # Distinct (listing, month) pairs, tagged with the listing's room type.
    listing_months = calendar.select("listing_id", "yyyymm").distinct()
    room_types = listings.select("listing_id", "room_type")
    active = listing_months.join(room_types, "listing_id")

    # Listings per room type within each city-month.
    city_col = F.lit(city.upper()).alias("city")
    by_rm = active.groupBy(city_col, "yyyymm", "room_type").agg(
        F.countDistinct("listing_id").alias("cnt")
    )

    # City-month totals, joined back so every row can be turned into a share.
    total = by_rm.groupBy("city", "yyyymm").agg(F.sum("cnt").alias("total"))
    with_total = by_rm.join(total, ["city", "yyyymm"])
    return with_total.withColumn("share", F.col("cnt") / F.col("total"))

In [0]:
# Target gold table for the combined LA + NYC room-type shares.
gold_roomtype = fq(GOLD_DB, "gold_roomtype_share")

# Stack both cities by column name, then overwrite the gold Delta table.
roomtype_all = roomtype_share_for("la").unionByName(
    roomtype_share_for("nyc")
)
roomtype_all.write.format("delta").mode("overwrite").saveAsTable(gold_roomtype)

In [0]:
def table_exists(db_3part: str, table_name: str) -> bool:
    """Report whether ``SHOW TABLES`` lists a table matching ``table_name``.

    Args:
        db_3part: Schema to search, interpolated after ``SHOW TABLES IN``.
        table_name: Name interpolated into the ``LIKE`` pattern.

    Returns:
        True when the ``SHOW TABLES`` query returns at least one row.
    """
    matches = spark.sql(f"SHOW TABLES IN {db_3part} LIKE '{table_name}'")
    match_count = matches.count()
    return match_count > 0


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

def reviews_monthly_for(city):
    """Count a city's reviews per month, tolerating a missing reviews table.

    Args:
        city: City suffix of the ``silver_reviews_<city>`` table; its upper-cased
            form is written to the ``city`` column.

    Returns:
        A DataFrame with columns ``city``, ``yyyymm`` and ``review_count``. When
        the silver reviews table does not exist, an empty DataFrame with that
        schema is returned instead.
    """
    silver_schema = f"{CATALOG}.{SILVER_DB}"
    reviews_table = f"silver_reviews_{city}"

    # Guard clause: no reviews table for this city -> empty, correctly typed frame.
    if not table_exists(silver_schema, reviews_table):
        fallback_fields = [
            StructField("city", StringType(), True),
            StructField("yyyymm", StringType(), True),
            StructField("review_count", LongType(), True),
        ]
        return spark.createDataFrame([], StructType(fallback_fields))

    reviews = spark.table(f"{silver_schema}.{reviews_table}")
    reviews = reviews.withColumn("yyyymm", F.date_format("review_date", "yyyy-MM"))
    city_col = F.lit(city.upper()).alias("city")
    return reviews.groupBy(city_col, "yyyymm").agg(
        F.count("*").alias("review_count")
    )

In [0]:


# Target gold table for the combined LA + NYC monthly review counts.
gold_reviews_m = fq(GOLD_DB, "gold_reviews_monthly")

# Stack both cities by column name, then overwrite the gold Delta table.
reviews_month = reviews_monthly_for("la").unionByName(
    reviews_monthly_for("nyc"),
    allowMissingColumns=True,
)
reviews_month.write.format("delta").mode("overwrite").saveAsTable(gold_reviews_m)

In [0]:
# ---------- Cross-city compare (month-aligned) ----------
# Read back the city-month gold table and target name for the comparison table.
g = spark.table(gold_city_month)
gold_cross = fq(GOLD_DB, "gold_cross_city_compare")

# Pivot to one row per month with LA_* and NYC_* metric columns, then add
# LA-minus-NYC deltas for each metric.
cross = (
    g.groupBy("yyyymm")
    .pivot("city", ["LA", "NYC"])
    .agg(
        F.first("total_listings").alias("total_listings"),
        F.first("avg_nightly_price").alias("avg_price"),
        F.first("avg_occupancy_rate").alias("occ_rate"),
        F.first("avg_review_score").alias("review"),
    )
    .withColumn("delta_listings", F.col("LA_total_listings") - F.col("NYC_total_listings"))
    .withColumn("delta_avg_price", F.col("LA_avg_price") - F.col("NYC_avg_price"))
    .withColumn("delta_occ_rate", F.col("LA_occ_rate") - F.col("NYC_occ_rate"))
    .withColumn("delta_review", F.col("LA_review") - F.col("NYC_review"))
)

# Overwrite the gold Delta table with the month-aligned comparison.
cross.write.format("delta").mode("overwrite").saveAsTable(gold_cross)

# Final status report: list the fully qualified name of every gold table written above.
# The check mark was stored as mojibake (UTF-8 bytes of U+2705 decoded as cp1252);
# the intended character is restored here.
print("[GOLD ✅] Curated all gold tables:")
print(" ", gold_city_month)
print(" ", gold_neigh)
print(" ", gold_roomtype)
print(" ", gold_reviews_m)
print(" ", gold_cross)