In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (getOrCreate returns an already-active
# session if one exists, otherwise starts a new one) under a recognisable
# application name.
spark = (
    SparkSession.builder
    .appName("DebugDataValidation")
    .getOrCreate()
)

# Restrict Spark's own logging to warnings and above so the validation
# output printed by the following cells is not buried in INFO messages.
spark.sparkContext.setLogLevel("WARN")

In [None]:
# Location of the cleaned listings dataset, relative to the notebook.
listings_path = "../output/cleaned_listings.parquet"

# Best-effort load: read the parquet data, report how many rows it holds and
# preview the first five rows untruncated. Any exception is reported as a
# message instead of interrupting the notebook.
try:
    listings_df = spark.read.parquet(listings_path)
    listings_row_count = listings_df.count()
    print(f"✅ Loaded {listings_row_count} rows from cleaned_listings.parquet")
    listings_df.show(n=5, truncate=False)
except Exception as err:
    print(f"❌ Failed to load {listings_path}\n{err}")

In [None]:
# Location of the cleaned calendar dataset, relative to the notebook.
calendar_path = "../output/cleaned_calendar.parquet"

# Stage 1 - load. Only errors raised while reading the parquet data or
# counting its rows are reported as a load failure.
try:
    calendar_df = spark.read.parquet(calendar_path)
    print(f"✅ Loaded {calendar_df.count()} rows from cleaned_calendar.parquet")
except Exception as e:
    print(f"❌ Failed to load {calendar_path}\n{e}")
else:
    # Stage 2 - validate. Runs only when the load succeeded. Problems here
    # (for example one of the selected columns being absent) are reported
    # separately so they are not mistaken for a missing or unreadable file.
    # Still best-effort: the error is printed rather than raised.
    try:
        calendar_df.select("listing_id", "date", "price", "available").show(5, truncate=False)

        # Check for null prices
        null_prices = calendar_df.filter(calendar_df["price"].isNull()).count()
        print(f"⚠️  {null_prices} rows have null price")
    except Exception as e:
        print(f"❌ Loaded {calendar_path} but validation failed\n{e}")

In [None]:
# Location of the per-listing revenue export, relative to the notebook.
revenue_path = "../output/revenue_by_listing.csv"

# Stage 1 - load. Only errors raised while reading the CSV (first line used
# as the header) or counting its rows are reported as a load failure.
try:
    revenue_df = spark.read.option("header", True).csv(revenue_path)
    print(f"✅ Loaded {revenue_df.count()} rows from revenue_by_listing.csv")
except Exception as e:
    print(f"❌ Failed to load {revenue_path}\n{e}")
else:
    # Stage 2 - validate. Runs only when the load succeeded. Schema or cast
    # problems are reported separately so they are not mistaken for a missing
    # or unreadable file. Still best-effort: the error is printed, not raised.
    try:
        # No schema is supplied to the CSV reader, so total_revenue is cast
        # to a numeric type explicitly before it is inspected.
        revenue_df = revenue_df.withColumn("total_revenue", revenue_df["total_revenue"].cast("double"))

        revenue_df.select("listing_id", "total_revenue", "occupancy_rate").show(5, truncate=False)

        # NOTE(review): depending on the Spark ANSI setting, a value that
        # cannot be parsed as a double may become null after the cast, so this
        # count may include malformed values as well as missing ones - confirm.
        null_revenue = revenue_df.filter(revenue_df["total_revenue"].isNull()).count()
        print(f"⚠️  {null_revenue} rows have null total_revenue")
    except Exception as e:
        print(f"❌ Loaded {revenue_path} but validation failed\n{e}")

In [None]:
# Shut down the Spark session created in the first cell. Cells that use
# `spark` cannot be re-run after this without re-running that first cell.
spark.stop()