# In [None]:
# =========================
# Airbnb price prediction modelling
# =========================

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib  

import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's whitegrid theme to every figure drawn below.
# set_theme() is the preferred spelling; sns.set() is only a legacy alias
# for it that seaborn's documentation says may be removed.
sns.set_theme(style='whitegrid')

# =========================
# Step 1: Load Data
# =========================

# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact file exists.
bnb = pd.read_csv(r"C:\Users\cheru\Downloads\AB_NYC_2019.csv")
# A bare `bnb.head()` in the middle of a script/cell is evaluated and thrown
# away, so print the preview explicitly.
print(bnb.head())

# Basic info
print("Data Shape:")
print(bnb.shape)
print("Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
bnb.info()
print("\nData Description:")
print(bnb.describe(include='all'))

# =======================================
# Step 2: Exploratory Data Analysis (EDA)
# =======================================

# Numeric columns to profile. `price` is the modelling target; the same list
# is reused for the numeric correlation heatmap.
num_cols = [
    'price', 'latitude', 'longitude', 'minimum_nights',
    'number_of_reviews', 'reviews_per_month',
    'calculated_host_listings_count', 'availability_365',
]

# One histogram figure per numeric column.
for numeric_col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data=bnb[numeric_col], kde=False, ax=ax)
    ax.set_title(f'Histogram of {numeric_col}')
    ax.set_xlabel(numeric_col)
    ax.set_ylabel('Frequency')
    plt.show()

# Categorical columns whose level frequencies are plotted.
categorical_cols = ['neighbourhood_group', 'neighbourhood', 'room_type']

# One count plot per categorical column, with tick labels rotated so that
# long category names stay readable.
for cat_col in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(x=bnb[cat_col], ax=ax)
    ax.set_title(f'Count plot of {cat_col}')
    ax.set_xlabel(cat_col)
    plt.xticks(rotation=45)
    ax.set_ylabel('Count')
    plt.show()

room_types = bnb['room_type'].unique()
print("Unique room types:", room_types)

# Price distribution: 500 bins with a KDE overlay; the x-axis is clipped to
# 0-1200 for display only (the data itself is not filtered here).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=bnb['price'], bins=500, kde=True, ax=ax)
ax.set_title('Distribution of Airbnb Prices')
ax.set_xlim(0, 1200)
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
plt.show()

# Price boxplot, used to eyeball outliers before any are removed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=bnb['price'], ax=ax)
ax.set_title('Boxplot of Airbnb Prices')
ax.set_xlabel('Price ($)')
plt.show()

# Price against room type, one point per listing.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=bnb, x='room_type', y='price', ax=ax)
ax.set_title('Price vs. room type')
ax.set_xlabel('Room Type')
ax.set_ylabel('Price ($)')
plt.show()

# Pairwise correlation matrix of the numeric columns, drawn as an annotated
# heatmap (two decimals per cell).
corr = bnb.loc[:, num_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Selected Numerical Features')
plt.show()

# ==================
# Data Preprocessing
# ==================

# Check for nulls. The per-column counts are printed explicitly: a bare
# expression in the middle of a script/cell is evaluated and discarded.
print("Null counts per column:")
print(bnb.isnull().sum())

# Drop columns that are not used for modelling: identifier and name columns,
# plus last_review and reviews_per_month.
columns_to_drop = ['id', 'host_id', 'name', 'host_name', 'last_review', 'reviews_per_month']
bnb = bnb.drop(columns=columns_to_drop)

print("Shape after dropping columns:", bnb.shape)

# Check for duplicates (reported only; nothing is removed here)
print("Number of duplicate rows:", bnb.duplicated().sum())

# Define outlier removal function using IQR method
def remove_outliers_iqr(df, features):
    """
    Keep only the rows that lie inside the 1.5 * IQR fences of every feature.

    Features are handled in the order given, and each feature's quartiles are
    computed on the rows that survived the filters of the features before it.
    Rows whose value is NaN fail the range check and are removed as well.
    One line reporting the number of removed rows is printed per feature.
    The input frame is not modified; the filtered frame is returned (the
    input itself when ``features`` is empty).
    """
    filtered = df
    for column in features:
        values = filtered[column]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        whisker = 1.5 * (third_quartile - first_quartile)
        rows_before = len(filtered)
        # Series.between is inclusive on both ends, i.e. lower <= v <= upper.
        inside_fences = values.between(first_quartile - whisker, third_quartile + whisker)
        filtered = filtered[inside_fences]
        print(f"Removed {rows_before - len(filtered)} outliers from {column}")
    return filtered

# Remove outliers from the target and the count-like numeric features.
# Filters are applied one feature after another, in this order.
outlier_features = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'calculated_host_listings_count',
    'availability_365',
]
bnb = remove_outliers_iqr(bnb, outlier_features)

# Redraw the price boxplot to check the effect of the filtering.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=bnb['price'], ax=ax)
ax.set_title('Boxplot of Airbnb Prices')
ax.set_xlabel('Price ($)')
plt.show()

print("Shape after outlier removal:", bnb.shape)

# Encode categorical features as integer codes, one LabelEncoder per column.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so
# they can be saved alongside the model.
categorical_cols = ['neighbourhood_group', 'neighbourhood', 'room_type']

label_encoders = {cat_col: LabelEncoder() for cat_col in categorical_cols}
for cat_col, encoder in label_encoders.items():
    bnb[cat_col] = encoder.fit_transform(bnb[cat_col])

print(bnb.head())

# Correlation heatmap over every remaining column, now that all are numeric.
corr = bnb.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Selected Features')
plt.show()

# ==================
# Data splitting and Scaling
# ==================

# Target is `price`; every other remaining column is a feature.
y = bnb['price']
X = bnb.drop(columns=['price'], errors='ignore')

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Columns to standardise; the label-encoded categorical columns are left
# unscaled.
numeric_features = [
    'minimum_nights',
    'number_of_reviews',
    'calculated_host_listings_count',
    'availability_365',
    'longitude',
    'latitude',
]

# Fit the scaler on the training rows only, then apply the same fitted
# transform to the test rows.
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# ==================
# Model Evaluation
# ==================

# Candidate regressors as (display name, estimator) pairs, all with default
# hyper-parameters apart from the fixed random seeds.
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('XGBoost', XGBRegressor(objective='reg:squarederror', random_state=42)),
]

# Per-model metrics, keyed by display name.
results = {}

# Running winner: the model with the lowest test-set MSE seen so far.
best_model_name, best_model = None, None
best_model_score = np.inf

for label, estimator in models:
    # Fit on the training split, score on the held-out split.
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    test_mse = mean_squared_error(y_test, y_pred)
    test_r2 = r2_score(y_test, y_pred)
    results[label] = {'MSE': test_mse, 'R2': test_r2}
    print(f"{label} - MSE: {test_mse:.4f}, R2: {test_r2:.4f}")

    # Strict comparison: on a tie the earlier model in the list is kept.
    if test_mse < best_model_score:
        best_model_name, best_model_score, best_model = label, test_mse, estimator

print(f"\nBest Model: {best_model_name} with MSE: {best_model_score:.4f}")

# ==================
# Feature Importance
# ==================

# Only some estimators expose `feature_importances_`; when the best model
# does not, this section draws nothing.
importances = getattr(best_model, 'feature_importances_', None)
if importances is not None:
    feature_names = X_train.columns
    # Pair each score with its column name, most important first.
    feat_importances = (
        pd.Series(importances, index=feature_names)
        .sort_values(ascending=False)
    )
    fig, ax = plt.subplots(figsize=(10, 6))
    feat_importances.plot(kind='bar', ax=ax)
    ax.set_title('Feature Importances')
    ax.set_ylabel('Importance Score')
    plt.show()

# ==================
# Save Models
# ==================

# Persist the winning model together with the fitted scaler and the label
# encoders, in that order, into the current working directory.
# NOTE(review): the model file name is fixed to "Gradient Boosting2.pkl" even
# though `best_model` is whichever estimator had the lowest test MSE, so the
# name can misdescribe the saved estimator - confirm what consumers expect.
artifacts = (
    (best_model, 'Gradient Boosting2.pkl'),
    (scaler, 'scaler2.pkl'),
    (label_encoders, 'label_encoder2.pkl'),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)