In [None]:
# ============================================================
# DATA VISUALIZATION — CLEAN, SAFE, ERROR-PROOF VERSION
# ============================================================

# CSV VISUAL

import pandas as pd
import plotly.express as px
import numpy as np

# =====================
# Load raw Airbnb data
# =====================
# The three raw tables are read in a fixed order: listings, calendar, reviews.
# low_memory=False is passed to every read, exactly as before.
listings, calendar, reviews = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "../data/listings.csv",
        "../data/calendar.csv.gz",
        "../data/reviews.csv.gz",
    )
)

# Minimal preprocessing
# Dates that cannot be parsed become NaT instead of raising.
calendar["date"] = pd.to_datetime(calendar["date"], errors="coerce")
# Numeric availability flag: "t" -> 1, "f" -> 0; any other value maps to NaN.
calendar["available_bool"] = calendar["available"].map({"t": 1, "f": 0})

In [None]:
# ============================================================
# 1) ROOM TYPE DISTRIBUTION
# ============================================================
# Bar chart of the number of listings per room type. When the column is
# absent, a warning is printed and no figure is produced.
if "room_type" not in listings.columns:
    print("⚠️ Column 'room_type' not found.")
else:
    room_counts = listings["room_type"].value_counts().reset_index()
    # Set the column names explicitly so the plot does not depend on the
    # names value_counts()/reset_index() happen to produce.
    room_counts.columns = ["room_type", "count"]

    fig = px.bar(room_counts, x="room_type", y="count")
    # x=0.5 centres the title horizontally.
    fig.update_layout(title={"text": "Room Type Distribution", "x": 0.5})
    fig.show()


# ============================================================
# 2) SUPERHOST DISTRIBUTION (Auto-detect)
# ============================================================
# Pick the first listings column whose name contains "superhost"
# (case-insensitive); None when no column matches.
superhost_col = next(
    (name for name in listings.columns if "superhost" in name.lower()),
    None,
)

if not superhost_col:
    print("⚠️ No superhost column found in listings.csv.")
else:
    host_counts = listings[superhost_col].value_counts().reset_index()
    host_counts.columns = ["superhost_status", "count"]

    fig = px.bar(host_counts, x="superhost_status", y="count")
    # The detected column name is shown in the title so the reader can see
    # which field was used.
    fig.update_layout(
        title={
            "text": f"Superhost Distribution (Column: {superhost_col})",
            "x": 0.5,
        }
    )
    fig.show()


# ============================================================
# 3) DAILY AVAILABILITY TREND
# ============================================================
# Mean of the 0/1 availability flag per calendar date, i.e. the share of
# calendar rows marked available on each day.
availability_by_day = calendar.groupby("date", as_index=False)["available_bool"].mean()

fig = px.line(availability_by_day, x="date", y="available_bool")
fig.update_layout(title={"text": "Daily Availability Rate Over Time", "x": 0.5})
fig.show()


# ============================================================
# 4) AVAILABILITY BY DAY OF WEEK
# ============================================================
# Adds a "dayofweek" column to calendar (pandas numbering: Monday=0 ... Sunday=6)
# and plots the mean availability flag for each weekday.
calendar["dayofweek"] = calendar["date"].dt.weekday

availability_dow = calendar.groupby("dayofweek", as_index=False)["available_bool"].mean()

fig = px.bar(availability_dow, x="dayofweek", y="available_bool")
fig.update_layout(title={"text": "Average Availability by Day of Week", "x": 0.5})
fig.show()


# ============================================================
# 5) AVAILABILITY BY MONTH
# ============================================================
# Adds a "month" column to calendar (1-12, taken from the parsed date) and
# plots the mean availability flag for each month number.
calendar["month"] = calendar["date"].dt.month

availability_month = calendar.groupby("month", as_index=False)["available_bool"].mean()

fig = px.line(availability_month, x="month", y="available_bool")
fig.update_layout(title={"text": "Average Availability by Month", "x": 0.5})
fig.show()


# ============================================================
# 6) NEIGHBOURHOOD DISTRIBUTION (TOP 20)
# ============================================================
# Bar chart of the 20 neighbourhoods with the most listings.
#
# "neighbourhood_cleansed" is preferred. When it is absent, the plain
# "neighbourhood" column is used instead so the chart is still produced.
# NOTE(review): presumably the summary-style listings.csv exposes the same
# field under "neighbourhood" — confirm against the dataset's data dictionary.
neighbourhood_col = next(
    (
        name
        for name in ("neighbourhood_cleansed", "neighbourhood")
        if name in listings.columns
    ),
    None,
)

if neighbourhood_col is not None:
    neigh_counts = (
        listings[neighbourhood_col]
        .value_counts()  # sorted by count, descending; missing values are excluded
        .head(20)
        .reset_index()
    )
    neigh_counts.columns = ["neighbourhood", "count"]

    fig = px.bar(neigh_counts, x="neighbourhood", y="count")
    fig.update_layout(
        title_text="Top 20 Neighbourhoods by Listing Count",
        title_x=0.5
    )
    fig.show()
else:
    # Reached only when neither candidate column exists.
    print("⚠️ Column 'neighbourhood_cleansed' not found.")


# ============================================================
# 7) AVERAGE REVIEW LENGTH OVER TIME
# ============================================================
# Adds a "review_length" column (character count of each comment) to reviews
# and plots its mean per review date.
reviews["date"] = pd.to_datetime(reviews["date"], errors="coerce")

# astype(str) turns a missing comment into the literal string "nan", which
# would count as a 3-character review and skew the daily mean. Mask those
# rows back to NaN so mean() skips them.
reviews["review_length"] = (
    reviews["comments"]
    .astype(str)
    .str.len()
    .where(reviews["comments"].notna())
)

daily_review_len = (
    reviews.groupby("date")["review_length"]
           .mean()
           .reset_index()
)

fig = px.line(daily_review_len, x="date", y="review_length")
fig.update_layout(
    title_text="Average Review Length Over Time",
    title_x=0.5
)
fig.show()


# ============================================================
# 8) ACCOMMODATES VS AVAILABILITY
# ============================================================
# Joins every calendar row to its listing's "accommodates" value and plots the
# mean availability flag for each guest capacity.

# Detect the listing ID column.
# Any column whose name contains "id" is a candidate, but a bare substring
# match can also hit unrelated columns that merely contain those letters.
# Exact matches on "id" / "listing_id" are therefore tried first; the original
# first-match heuristic is kept as a fallback. Host/owner IDs are never used.
id_candidates = [col for col in listings.columns if "id" in col.lower()]
exact_matches = [
    col for col in id_candidates if col.lower() in ("id", "listing_id")
]
listing_id_col = next(
    (
        col
        for col in exact_matches + id_candidates
        if col not in ("host_id", "owner_id")
    ),
    None,
)

# The join key on the calendar side is checked too, so a missing column gives
# the same warning as the other missing-column cases instead of a KeyError.
if (
    listing_id_col
    and "accommodates" in listings.columns
    and "listing_id" in calendar.columns
):
    # Left join keeps every calendar row. Rows without a matching listing get
    # NaN for "accommodates" and are dropped by the groupby below.
    merged_cap = calendar.merge(
        listings[[listing_id_col, "accommodates"]],
        left_on="listing_id",
        right_on=listing_id_col,
        how="left"
    )

    cap_avail = (
        merged_cap.groupby("accommodates")["available_bool"]
                  .mean()
                  .reset_index()
    )

    fig = px.line(cap_avail, x="accommodates", y="available_bool")
    fig.update_layout(
        title_text="Average Availability by Number of Guests",
        title_x=0.5
    )
    fig.show()
else:
    print("⚠️ Could not generate 'accommodates vs availability' plot (missing columns).")


⚠️ No superhost column found in listings.csv.


⚠️ Column 'neighbourhood_cleansed' not found.


⚠️ Could not generate 'accommodates vs availability' plot (missing columns).
