In [None]:
# --- STEP 1: INSTALL & SETUP ---
!pip install kagglehub --quiet

import kagglehub
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# --- STEP 2: DOWNLOAD BOSTON DATA (Public & Open) ---
print("🇺🇸 Downloading Boston Airbnb Data...")
# This dataset is open access (no 403 errors!)
path = kagglehub.dataset_download("airbnb/boston")
print(f"Dataset downloaded to: {path}")

# Find the listings.csv file.
# First try the root of the download directory; if it is not there, search the
# sub-directories so a changed archive layout does not surface later as an
# opaque read error.
csv_file = os.path.join(path, "listings.csv")
if not os.path.isfile(csv_file):
    csv_file = next(
        (
            os.path.join(root, "listings.csv")
            for root, _dirs, files in os.walk(path)
            if "listings.csv" in files
        ),
        None,
    )
    if csv_file is None:
        raise FileNotFoundError(f"listings.csv not found anywhere under {path}")

# --- STEP 3: LOAD & CLEAN DATA ---
df = pd.read_csv(csv_file)

# --- CRITICAL FIX: CLEAN THE PRICE COLUMN ---
# Boston prices look like "$1,200.00". We must remove '$' and ',' to do math.
# Check for "not numeric" instead of dtype == 'object': pandas may load text
# columns with a dedicated string dtype, in which case an 'object' comparison
# is False and the cleaning would be skipped silently.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = (
        df['price']
        .astype(str)
        # regex=False: '$' is a regex end-of-string anchor, so match it literally
        # instead of relying on the version-dependent default of `regex`.
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

print("✅ Prices converted to numbers successfully.")

# --- FIX NEIGHBOURHOOD COLUMN ---
# Boston uses 'neighbourhood_cleansed' usually; fall back to 'neighbourhood'.
if 'neighbourhood_group' not in df.columns:
    for candidate in ('neighbourhood_cleansed', 'neighbourhood'):
        if candidate in df.columns:
            df['neighbourhood_group'] = df[candidate]
            break
    else:
        # Neither source column exists: fail with a message naming what was expected.
        raise KeyError(
            "None of 'neighbourhood_group', 'neighbourhood_cleansed' or "
            "'neighbourhood' is present in the listings data"
        )

# Standard Cleaning
# Missing reviews_per_month values are replaced with 0.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
# Filter for reasonable prices (safe now that 'price' is numeric).
# .copy() makes the result an independent frame rather than a slice of the original.
df = df[(df['price'] > 0) & (df['price'] < 1000)].copy()

print(f"✅ Data Loaded & Cleaned: {df.shape[0]} listings ready.")

# --- STEP 4: RUN THE PRICING MODEL ---
print("\n--- 🤖 Training Pricing AI ---")

# 1. Select Features
features_to_use = [
    'neighbourhood_group',
    'room_type',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
]

# 2. Filter dataset: keep only the modelling columns and drop incomplete rows
model_df = df[features_to_use + ['price']].dropna()
if model_df.empty:
    # Fail here with a clear message instead of inside train_test_split.
    raise ValueError("No complete rows left to train on after dropping missing values.")

# 3. One-Hot Encoding of the two categorical columns
# (drop_first=True omits the first category of each as the reference level)
model_df = pd.get_dummies(
    model_df,
    columns=['neighbourhood_group', 'room_type'],
    drop_first=True,
)

# 4. Split & Train: 80/20 split with a fixed seed so runs are reproducible
X = model_df.drop('price', axis=1)
y = model_df['price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

# Score on the held-out 20% only
y_pred = model.predict(X_test)
print(f"Model Accuracy (R2 Score): {r2_score(y_test, y_pred):.2f}")

# --- STEP 5: SAVE FOR DASHBOARD ---
# Write the current contents of df to the working directory, without the index column.
df.to_csv('Boston_Airbnb_Cleaned.csv', index=False)
print("\n🎉 SUCCESS: 'Boston_Airbnb_Cleaned.csv' is ready to download!")