# Airbnb Host Segmentation & Superhost Prediction: End-to-End Pipeline

This notebook demonstrates the full workflow: data loading, preprocessing, clustering, superhost prediction, and feature analysis.

In [None]:
# Imports and utility setup
import sys
sys.path.append('../src')
import pandas as pd
from data_utils import load_data, clean_data, scale_features
from segmentation import run_kmeans
from superhost_prediction import train_superhost_classifier, get_feature_importance
from visualization import plot_feature_importance, plot_cluster_summary

## 1. Load and Preprocess Data

In [None]:
# Read the processed (pre-split) dataset and pass it straight through the
# project's cleaning step. Adjust the CSV path to match your checkout.
data = clean_data(
    load_data('../data/processed_data_before_split.csv')
)

## 2. Market Segmentation (Clustering)

In [None]:
# Columns used as inputs to the clustering step
segmentation_features = [
    'numReviews_pastYear',
    'rating_ave_pastYear',
    'Nightly Rate',
    'available_days',
    'booked_days',
    'booked_days_avePrice',
    'available_days_aveListedPrice',
]

# Scale the clustering inputs; the fitted scaler is returned alongside the
# scaled frame so it stays available for later use.
data_scaled, scaler = scale_features(data, segmentation_features)

# Fit KMeans with six clusters on the scaled frame. The helper returns the
# frame (which carries a 'Cluster' column, used below) and the fitted model.
clustered_data, kmeans = run_kmeans(
    data_scaled,
    segmentation_features,
    n_clusters=6,
)

# Show how many rows landed in each cluster
print(clustered_data['Cluster'].value_counts())

## 3. Superhost Prediction

In [None]:
# Predictor columns and the label column for the superhost classifier
features = [
    'rating_ave_pastYear',
    'numReviews_pastYear',
    'numCancel_pastYear',
    'Nightly Rate',
    'available_days',
    'booked_days',
    'available_days_aveListedPrice',
    'booked_days_avePrice',
    'tract_superhosts_ratio',
    'tract_price_variance',
]
target = 'host_is_superhost_in_period'

# Keep only complete rows: dropna() removes every row that has a missing
# value in ANY of the selected columns (any feature or the target), not
# just rows whose target is missing.
model_data = clustered_data[[*features, target]].dropna()
X, y = model_data[features], model_data[target]

# Fit the classifier ('xgb' selects the model type inside the helper)
model = train_superhost_classifier(X, y, model_type='xgb')

# Rank the predictors by importance and plot the top ten
importances = get_feature_importance(model, features)
plot_feature_importance(
    importances,
    top_n=10,
    title="Global Feature Importances",
)

## 4. Cluster-wise Analysis

In [None]:
# Score every row with the trained classifier and keep the probability of
# the positive class (second column of predict_proba's output).
# NOTE(review): this scores all rows of clustered_data, including the rows
# the model was trained on and any rows removed from the training set by
# the earlier dropna() -- confirm that is intended.
class_probabilities = model.predict_proba(clustered_data[features])
clustered_data['Predicted_Probability_Superhost'] = class_probabilities[:, 1]

# A host counts as a "likely superhost" when the predicted probability is
# strictly greater than this cut-off.
LIKELY_SUPERHOST_THRESHOLD = 0.8


def _count_likely_superhosts(probabilities):
    """Return how many probabilities exceed LIKELY_SUPERHOST_THRESHOLD."""
    return (probabilities > LIKELY_SUPERHOST_THRESHOLD).sum()


# One row per cluster: mean predicted probability, number of rows in the
# cluster, and number of rows above the likely-superhost cut-off.
hosts_by_cluster = clustered_data.groupby('Cluster')
cluster_summary = hosts_by_cluster.agg(
    Average_Probability=('Predicted_Probability_Superhost', 'mean'),
    Total_Hosts=('Cluster', 'size'),
    Superhost_Likely_Count=('Predicted_Probability_Superhost', _count_likely_superhosts),
).reset_index()

plot_cluster_summary(
    cluster_summary,
    value_col='Average_Probability',
    title='Average Superhost Probability by Cluster',
)

print(cluster_summary)

## 5. Save Outputs

In [None]:
# Save the per-row predictions and the per-cluster summary as CSV files.
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (a no-op when it is already there).
output_dir = Path('../outputs')
output_dir.mkdir(parents=True, exist_ok=True)

clustered_data.to_csv(output_dir / 'segmented_data_with_predictions.csv', index=False)
cluster_summary.to_csv(output_dir / 'cluster_summary.csv', index=False)
print("Outputs saved to outputs/ directory.")