In [0]:
from pyspark.sql import SparkSession
# Obtain a SparkSession via the builder's getOrCreate() and bind it to `spark`
# for the rest of the notebook.
spark = SparkSession.builder.getOrCreate()
# Smoke test: run a trivial query through the session and collect its result.
# Left as the final expression of the cell so the notebook renders the output.
spark.sql("SELECT 1").collect()

In [0]:
# ---- Notebook parameter: the city whose files are ingested ----
# Registers a text widget named "city" (default "la"); accepted values are "la" and "nyc".
dbutils.widgets.text("city", "la")
city = dbutils.widgets.get("city")
# Normalise case and surrounding whitespace before validating.
city = city.lower().strip()
assert city in ("la", "nyc"), f"city must be 'la' or 'nyc', got {city!r}"

# ---- Unity Catalog destination: catalog name and bronze schema name ----
CATALOG = "airbnb_lab3"
BRONZE_DB = "airbnb_bronze"
# Create the bronze schema inside the catalog if it is not there yet.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{BRONZE_DB}")

def fq(db, table):
    """Return the fully qualified name ``<CATALOG>.<db>.<table>``.

    Args:
        db: Schema (database) name placed after the catalog.
        table: Table name placed last.

    Returns:
        The three dot-separated parts as a single string, with the
        module-level ``CATALOG`` as the first part.
    """
    qualified_name = f"{CATALOG}.{db}.{table}"
    return qualified_name

# ---- Location of the source files (UC Volume) ----
VOLUME_ROOT = "/Volumes/airbnb_lab3/default/airbnb"

# File-name suffix per city: LA files carry no suffix, NYC files end in '-2'.
CITY_SUFFIX = {
    "la": "",
    "nyc": "-2",
}

def build_path(dataset: str, city: str) -> str:
    """Build the glob for one dataset's source file(s) for a given city.

    Args:
        dataset: Base file name without suffix or extension (e.g. "listings").
        city: Key into ``CITY_SUFFIX`` ("la" or "nyc").

    Returns:
        ``<VOLUME_ROOT>/<dataset><suffix>.csv*``; the trailing ``*`` lets the
        glob match both ``.csv`` and ``.csv.gz``.

    Raises:
        KeyError: if ``city`` is not a key of ``CITY_SUFFIX``.
    """
    suffix = CITY_SUFFIX[city]
    file_glob = f"{dataset}{suffix}.csv*"
    return f"{VOLUME_ROOT}/{file_glob}"

# ---- Reader (CSV) ----
from pyspark.sql import functions as F

def read_csv_any(path_glob: str):
    """Read the CSV file(s) matching ``path_glob`` into a Spark DataFrame.

    Reader options applied: first line is a header, multi-line parsing is
    enabled, and the double quote serves as both the quote and the escape
    character.

    Args:
        path_glob: Path or glob handed to the Spark CSV reader.

    Returns:
        The DataFrame produced by ``spark.read.csv``.
    """
    reader = spark.read
    reader = reader.option("header", True)
    reader = reader.option("multiLine", True)
    reader = reader.option("escape", "\"")
    reader = reader.option("quote", "\"")
    return reader.csv(path_glob)

In [0]:
def with_uc_metadata(df, city_lit: str):
    """Append ingest-lineage columns to ``df``.

    Columns added:
        ingest_file_path: value of ``_metadata.file_path``.
        ingest_file_name: trailing segment of that path (everything after
            the last ``/``).
        ingest_city: the literal ``city_lit`` on every row.

    Args:
        df: DataFrame on which ``_metadata.file_path`` can be resolved.
        city_lit: City code stored in ``ingest_city``.

    Returns:
        A new DataFrame with the three columns added.
    """
    source_path = F.col("_metadata.file_path")
    base_name = F.regexp_extract(source_path, r'([^/]+)$', 1)

    tagged = df.withColumn("ingest_file_path", source_path)
    tagged = tagged.withColumn("ingest_file_name", base_name)
    return tagged.withColumn("ingest_city", F.lit(city_lit))

In [0]:
def _ingest_bronze(dataset: str, city_code: str):
    """Load one dataset for one city and overwrite its bronze Delta table.

    Steps: build the source glob with ``build_path``, read it with
    ``read_csv_any``, add the ingest columns with ``with_uc_metadata``, then
    write to ``bronze_<dataset>_<city_code>`` in the bronze schema using
    overwrite mode with ``overwriteSchema`` enabled.

    Args:
        dataset: Source file base name ("listings", "calendar" or "reviews").
        city_code: City key passed to ``build_path`` and used in the table name.

    Returns:
        Tuple ``(source_glob, dataframe, table_name)``.
    """
    source_glob = build_path(dataset, city_code)
    frame = with_uc_metadata(read_csv_any(source_glob), city_code)
    table_name = fq(BRONZE_DB, f"bronze_{dataset}_{city_code}")
    (frame.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_name))
    return source_glob, frame, table_name


# The three datasets are ingested one after another, in the same order as
# before; each call reads and then writes before the next one starts.
# All notebook-level names (*_path, df_*, tbl_*) are kept for later cells.

# ===================== LISTINGS =====================
listings_path, df_listings, tbl_listings = _ingest_bronze("listings", city)

# ===================== CALENDAR =====================
calendar_path, df_calendar, tbl_calendar = _ingest_bronze("calendar", city)

# ===================== REVIEWS ======================
reviews_path, df_reviews, tbl_reviews = _ingest_bronze("reviews", city)

In [0]:
# Summary: the ingested city followed by the three bronze table names, one per line.
print(
    f"[BRONZE ✅] Ingested {city}",
    f"  {tbl_listings}",
    f"  {tbl_calendar}",
    f"  {tbl_reviews}",
    sep="\n",
)

In [0]:
# List the tables in the bronze schema. The schema name is built from
# CATALOG / BRONZE_DB so this check follows the notebook configuration
# rather than a second hard-coded copy of the same names.
spark.sql(f"SHOW TABLES IN {CATALOG}.{BRONZE_DB}").show()


In [0]:
# Row-count check for every dataset/city bronze table.
# The inner loop variable is deliberately not named `city`: reusing that name
# would overwrite the notebook's `city` parameter and leave it set to "nyc"
# for any cell executed afterwards.
for ds in ["listings", "calendar", "reviews"]:
    for check_city in ["la", "nyc"]:
        label = f"{ds}_{check_city}"
        try:
            cnt = spark.table(fq(BRONZE_DB, f"bronze_{label}")).count()
            print(f"{label}: {cnt}")
        except Exception as exc:
            # Best-effort check: keep going on failure, but name the error type
            # so a failure that is not a missing table can be told apart.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            print(f"{label}: MISSING ❌ ({type(exc).__name__})")
