# NYC Airbnb 2023 Data Science Analysis

This notebook reproduces the data cleaning, exploratory analysis, modeling and clustering steps used in our New York City Airbnb pricing project. Upload the `listings.csv` file from the Inside Airbnb 2023 dataset to get started.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

# Apply one seaborn theme to every figure drawn below
sns.set(context='notebook', style='whitegrid')

In [None]:
# TODO: Upload the listings.csv file to Colab and point file_path at it.
# In Colab the upload dialog can be opened with:
#   from google.colab import files
#   uploaded = files.upload()
# after which the file can be read as 'listings.csv'.

file_path = 'listings.csv'  # update this if needed

# Read the listings export, then show a short preview and the row/column counts
print('Loading data...')
df_raw = pd.read_csv(file_path)
preview = df_raw.head()
print(preview)
print('Shape:', df_raw.shape)

In [None]:
# Clean the raw listings and engineer the modelling features:
# - convert price to numeric by removing dollar signs and commas
# - remove rows with missing price, latitude/longitude or core numeric fields
# - derive capped / log-transformed columns and a review-recency feature

df = df_raw.copy()

# Clean price. The pattern is a raw string so that `\$` reaches the regex
# engine untouched instead of being an invalid Python string escape.
price_col = df['price'].astype(str).str.replace(r'[\$,]', '', regex=True)
df['price'] = pd.to_numeric(price_col, errors='coerce')

# Drop rows with missing critical values (including the coordinates).
# NOTE(review): a missing reviews_per_month presumably marks listings that have
# never been reviewed, so this filter also removes them — confirm that is intended.
cols_to_check = ['price', 'latitude', 'longitude', 'minimum_nights', 'reviews_per_month',
                 'calculated_host_listings_count', 'availability_365']
df = df.dropna(subset=cols_to_check)

# Parse last_review to datetime and compute days_since_last_review
from datetime import datetime

df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')

# Measure recency against the newest review in the data rather than the wall
# clock, so that re-running the notebook on the same file yields the same feature.
reference_date = df['last_review'].max()
df['days_since_last_review'] = (reference_date - df['last_review']).dt.days

# Create additional features
# Cap price at $500 to remove extreme outliers
cap_value = 500
df['price_capped'] = df['price'].clip(upper=cap_value)

# Log transform price and capped price (add 1 to avoid log(0))
df['log_price'] = np.log1p(df['price'])
df['log_price_capped'] = np.log1p(df['price_capped'])

# Minimum nights transformation
cap_min_nights = 30
df['minimum_nights_capped'] = df['minimum_nights'].clip(upper=cap_min_nights)
df['log_minimum_nights'] = np.log1p(df['minimum_nights'])

# Host listing count transformation
df['log_host_listings_count'] = np.log1p(df['calculated_host_listings_count'])

# Reviews transformations
df['log_reviews_per_month'] = np.log1p(df['reviews_per_month'])

# Days since last review transformation
# Fill missing days_since_last_review with the largest observed gap
max_days = df['days_since_last_review'].max()
if pd.isna(max_days):
    # No parseable review date at all: fall back to 0 so the feature stays finite
    max_days = 0
df['days_since_last_review'] = df['days_since_last_review'].fillna(max_days)
df['log_days_since_last_review'] = np.log1p(df['days_since_last_review'])

# Display processed data
print(df.head())
print('Processed shape:', df.shape)

In [None]:
# Plot distribution of capped price and log price.
# The left panel uses price_capped: on the raw price a handful of extreme
# listings stretch the x-axis and squash almost every listing into one bin.
fig, axs = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(df['price_capped'], bins=50, kde=True, ax=axs[0])
axs[0].set_title(f'Distribution of Price (capped at {cap_value} USD)')
axs[0].set_xlabel('Price (USD)')

sns.histplot(df['log_price'], bins=50, kde=True, ax=axs[1])
axs[1].set_title('Distribution of Log Price')
axs[1].set_xlabel('Log Price')

fig.tight_layout()
plt.show()

# Pairwise correlations between the numeric listing attributes, drawn as an annotated heatmap
numeric_cols = [
    'price',
    'minimum_nights',
    'calculated_host_listings_count',
    'availability_365',
    'reviews_per_month',
    'days_since_last_review',
]

corr = df.loc[:, numeric_cols].corr()

fig_corr, ax_corr = plt.subplots(figsize=(6, 5))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax_corr)
ax_corr.set_title('Correlation Matrix')
plt.show()

In [None]:
# Predictors and response for the price models
feature_names = [
    'minimum_nights', 'minimum_nights_capped',
    'calculated_host_listings_count', 'log_host_listings_count',
    'availability_365', 'reviews_per_month', 'log_reviews_per_month',
    'days_since_last_review', 'log_days_since_last_review',
]
features = df[feature_names]
target = df['log_price_capped']

# Hold out 20% of the listings for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Baseline (mean): predict the training-set mean for every test listing
train_mean = y_train.mean()
y_pred_mean = np.full(len(y_test), train_mean)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_mean))
mae_baseline = mean_absolute_error(y_test, y_pred_mean)
print(f"Baseline (mean) RMSE: {rmse_baseline:.4f}, MAE: {mae_baseline:.4f}")

# Preprocessing shared by all models: standardise every feature column
numeric_features = features.columns
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)]
)

# Models
def evaluate_model(model, name):
    """Fit `model` behind the shared preprocessor and print its hold-out metrics.

    Relies on the notebook-level `preprocessor`, `X_train`, `y_train`,
    `X_test` and `y_test`.

    Args:
        model: estimator used as the final step of the pipeline.
        name: label that prefixes the printed metrics line.

    Prints RMSE, MAE and R2 on the test split; returns nothing.
    """
    fitted = Pipeline(
        steps=[('preprocessor', preprocessor), ('model', model)]
    ).fit(X_train, y_train)
    y_hat = fitted.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_hat))
    mae = mean_absolute_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print(f"{name}: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}")

# Linear family: ordinary least squares, then L2- and L1-regularised variants
linear_models = [
    (LinearRegression(), 'Linear Regression'),
    (Ridge(alpha=1.0), 'Ridge Regression'),
    (Lasso(alpha=0.001), 'Lasso Regression'),
]
for estimator, label in linear_models:
    evaluate_model(estimator, label)

In [None]:
# Random Forest and Gradient Boosting on log price
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42)

# Score both ensembles with the same helper as the linear models so that every
# model is fitted and reported through a single code path.
for model, name in [(rf, 'Random Forest'), (gb, 'Gradient Boosting')]:
    evaluate_model(model, name)

In [None]:
# KMeans clustering on selected features
# (StandardScaler and KMeans are already imported in the notebook's import cell.)
cluster_features = df[['price_capped', 'minimum_nights', 'calculated_host_listings_count',
                       'availability_365', 'reviews_per_month', 'days_since_last_review']]

# Standardise first so that no single feature dominates the Euclidean distances
scaler_cluster = StandardScaler()
cluster_scaled = scaler_cluster.fit_transform(cluster_features)

# Fit KMeans
k = 4
kmeans = KMeans(n_clusters=k, n_init=20, random_state=42)
df['kmeans_cluster'] = kmeans.fit_predict(cluster_scaled)

# Cluster summary: typical capped price and size of each cluster
cluster_summary = df.groupby('kmeans_cluster')['price_capped'].agg(['median','mean','count'])
print(cluster_summary)

# DBSCAN on lat/lon, using every raw listing that has both coordinates
geo_columns = ['latitude', 'longitude']
coords = df_raw[geo_columns].dropna()

# Standardise both axes, then density-cluster the scaled points
scaler_geo = StandardScaler()
coords_scaled = scaler_geo.fit(coords).transform(coords)
dbscan = DBSCAN(eps=0.2, min_samples=20)
coords['geo_cluster'] = dbscan.fit(coords_scaled).labels_

# View counts per cluster (DBSCAN reports noise points as label -1)
geo_cluster_counts = coords['geo_cluster'].value_counts()
print(geo_cluster_counts)