# Import Libaray and Data

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,Dense, Dropout, BatchNormalization ,LeakyReLU
from tensorflow.keras.optimizers import Adam ,AdamW
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from keras.callbacks import ReduceLROnPlateau
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# **Data Loading**

In [51]:
df_selected_scaled = pd.read_csv("/kaggle/input/airbnb-df-scaled-selected-features/airbnb_df_selected_features.csv") 

In [52]:
df_selected_scaled

Unnamed: 0,price,bathrooms,beds,guests,bedrooms,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Azerbaijan,...,country_Uganda,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Vietnam,country_Åland Islands
0,8078,1,1,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4665,2,2,4,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5991,1,3,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11339,1,2,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6673,1,1,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11527,20311,2,4,8,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11528,4288,8,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11529,15200,0,1,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11530,9103,1,2,2,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


 To enhance feature representation, new ratio-based features such as price_per_bed, price_per_guest, bedroom_per_guest, bathroom_per_guest, and bed_bath_ratio were engineered. These derived features help capture relative value and density aspects that raw features might not fully convey. 

After handling any division-related missing values, standard feature scaling was applied to selected numerical columns to normalize their distribution. This ensures that all features contribute equally to model training, especially for distance-based or gradient-sensitive algorithms.

In [53]:
# Add these before train-test split
df_selected_scaled['price_per_bed'] = df_selected_scaled['price'] / df_selected_scaled['beds'].replace(0, np.nan)
df_selected_scaled['price_per_guest'] = df_selected_scaled['price'] / df_selected_scaled['guests'].replace(0, np.nan)
df_selected_scaled['bedroom_per_guest'] = df_selected_scaled['bedrooms'] / df_selected_scaled['guests'].replace(0, np.nan)
df_selected_scaled['bathroom_per_guest'] = df_selected_scaled['bathrooms'] / df_selected_scaled['guests'].replace(0, np.nan)
df_selected_scaled['bed_bath_ratio'] = df_selected_scaled['beds'] / df_selected_scaled['bathrooms'].replace(0, np.nan)
df_selected_scaled = df_selected_scaled.fillna(0) 

In [54]:
# Copy the encoded dataset
df_scaled = df_selected_scaled.copy()

#Identify numeric columns to scale (excluding target 'price')
numerical_cols = ['bathrooms', 'beds', 'guests', 'bedrooms']

# Initialize scaler
scaler = StandardScaler()

# Fit and transform numerical features
df_scaled[numerical_cols] = scaler.fit_transform(df_scaled[numerical_cols])

#check scaled values (mean should be ~0, std ~1)
print(df_scaled[numerical_cols].describe())

          bathrooms          beds        guests      bedrooms
count  1.153200e+04  1.153200e+04  1.153200e+04  1.153200e+04
mean   2.218135e-17 -2.218135e-17 -6.407947e-17  1.725216e-17
std    1.000043e+00  1.000043e+00  1.000043e+00  1.000043e+00
min   -9.742699e-01 -8.331654e-01 -1.105576e+00 -9.310629e-01
25%   -3.576244e-01 -5.465457e-01 -8.054322e-01 -4.209908e-01
50%   -3.576244e-01 -2.599261e-01 -2.051450e-01 -4.209908e-01
75%    2.590210e-01  2.669350e-02  3.951423e-01  8.908126e-02
max    2.985800e+01  2.725556e+01  3.396578e+00  2.457254e+01


# Spliting the Dataset

In [55]:
# Define features and target
X = df_scaled.drop('price', axis=1)
y = df_scaled['price']

In [56]:
# Split into train and test sets (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (9225, 125)
Test shape: (2307, 125)


# **Step 3: Model Creation:**
#  Custom ANN

In [58]:
def build_ann_model(input_dim):
    model = Sequential()

    model.add(Dense(512, kernel_initializer='he_normal', input_dim=input_dim))
    model.add(LeakyReLU(alpha=0.1))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(256, kernel_initializer='he_normal'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(128, kernel_initializer='he_normal'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.3))

    model.add(Dense(64, kernel_initializer='he_normal'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.2))

    model.add(Dense(1))  # Regression output

    optimizer = AdamW(learning_rate=0.002, weight_decay=1e-4)

    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

# K-Fold Cross-Validation

In [59]:
# Set up K-Fold Cross Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
mse_scores = []
mae_scores = []

mse_scores_rf = []
mae_scores_rf = []

mse_scores_xgb = []
mae_scores_xgb = []

# ANN Model Training And Evaluating

So here we build A custom Artificial Neural Network (ANN) designed for Airbnb price prediction, using multiple dense layers with LeakyReLU activation, batch normalization, and dropout for regularization. 

The AdamW optimizer was used with a specified learning rate and weight decay to improve convergence. To ensure robust performance evaluation, 10-fold cross-validation was applied. 

Early stopping and learning rate reduction callbacks were used during training to prevent overfitting and optimize training efficiency. 

Mean Squared Error (MSE) and Mean Absolute Error (MAE) were calculated for each fold to assess the model’s performance.

In [26]:
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = build_ann_model(input_dim=X.shape[1])

    early_stop = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10,
                                  verbose=1, min_lr=1e-6)

    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=200,
        batch_size=64,
        callbacks=[early_stop, reduce_lr],
        verbose=0
    )

    y_pred = model.predict(X_val, verbose=0)
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)

    print(f"Fold {fold + 1}: MSE = {mse:.2f}, MAE = {mae:.2f}")
    mse_scores.append(mse)
    mae_scores.append(mae)

print("\nFinal ANN Performance:")
print(f"Average MSE: {np.mean(mse_scores):.2f}")
print(f"Average MAE: {np.mean(mae_scores):.2f}")



Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 29: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 43: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 76: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 101: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 128: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 146: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 169: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 179: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.

Epoch 196: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.
Fold 1: MSE = 1586498.30, MAE = 585.41

Fold 2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 16: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 37: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 50: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 67: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 94: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 126: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 145: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 173: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.

Epoch 183: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.

Epoch 193: ReduceLROnPlateau reducing learning rate to 1.9531250927684596e-06.
Fold 2: MSE = 1074704.44, MAE = 558.12

Fold 3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 34: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 65: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 78: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 107: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 120: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 133: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 143: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.

Epoch 155: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.

Epoch 165: ReduceLROnPlateau reducing learning rate to 1.9531250927684596e-06.

Epoch 175: ReduceLROnPlateau reducing learning rate to 1e-06.
Fold 3: MSE = 1201139.57, MAE = 592.16

Fold 4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 15: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 35: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 85: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 99: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 122: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 160: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 171: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 181: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.

Epoch 191: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.
Fold 4: MSE = 971196.27, MAE = 571.18

Fold 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 23: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 37: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 64: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 79: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 115: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 148: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 165: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 175: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.

Epoch 185: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.
Fold 5: MSE = 833285.46, MAE = 519.92

Fold 6


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 26: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 42: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 65: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 77: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 96: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 121: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 131: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 167: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.
Fold 6: MSE = 914174.54, MAE = 532.26

Fold 7


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 31: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 49: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 73: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 96: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 127: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 183: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Fold 7: MSE = 1002378.13, MAE = 508.39

Fold 8


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 28: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 40: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 67: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 86: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 114: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 137: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 149: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 159: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.

Epoch 169: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.
Fold 8: MSE = 998776.87, MAE = 524.05

Fold 9


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 34: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 57: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 69: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 84: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 103: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 121: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 131: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 168: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.

Epoch 178: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.

Epoch 190: ReduceLROnPlateau reducing learning rate to 1.9531250927684596e-06.

Epoch 200: ReduceLROnPlateau reducing learning rate to 1e-06.
Fold 9: MSE = 1085119.39, MAE = 547.40

Fold 10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 27: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 50: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 85: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 104: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 134: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 160: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 186: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.
Fold 10: MSE = 846769.35, MAE = 519.84

📊 Final ANN Performance:
Average MSE: 1051404.23
Average MAE: 545.87


# ANN Cross-Validation Performance

In [45]:
# Use the unlogged version
original_price = df_scaled['price']
print(f"Average price: {original_price.mean():.2f}")
print(f"Median price: {original_price.median():.2f}")

Average price: 9274.60
Median price: 7248.50


# Random Forest

We considered A Random Forest Regressor that is optimized using RandomizedSearchCV to find the best combination of hyperparameters, such as the number of trees, maximum depth, and minimum sample requirements.

In [60]:
param_dist = {
    'n_estimators': [150, 200, 250, 300, 350],
    'max_depth': [10, 15, 20, 25, None],
    'min_samples_split': [2, 4, 6, 10],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': ['auto', 'sqrt']
}

# Create base model
rf = RandomForestRegressor(random_state=42)

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_absolute_error'
)

# Fit model
random_search.fit(X, y)

# Best parameters
print("Best parameters:", random_search.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters: {'n_estimators': 250, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 20}


The search here was performed with 5-fold cross-validation, using negative MAE as the scoring metric. 

Once the optimal parameters were selected, the model is now for training and evaluating using 10-fold cross-validation. 

Mean Squared Error (MSE) and Mean Absolute Error (MAE) were calculated for each fold to assess the model’s accuracy and generalization performance.

In [61]:
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = RandomForestRegressor(**random_search.best_params_, random_state=42, n_jobs=-1)
    model.fit(X_tr, y_tr)

    y_pred_rf1 = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred_rf1)
    mae = mean_absolute_error(y_val, y_pred_rf1)

    print(f"Fold {fold + 1}: MSE = {mse:.2f}, MAE = {mae:.2f}")
    mse_scores_rf.append(mse)
    mae_scores_rf.append(mae)



  warn(


Fold 1: MSE = 28504.80, MAE = 45.70


  warn(


Fold 2: MSE = 18607.96, MAE = 40.55


  warn(


Fold 3: MSE = 9735.00, MAE = 33.59


  warn(


Fold 4: MSE = 35387.17, MAE = 48.21


  warn(


Fold 5: MSE = 35121.55, MAE = 45.25


  warn(


Fold 6: MSE = 12561.66, MAE = 34.95


  warn(


Fold 7: MSE = 23623.16, MAE = 42.98


  warn(


Fold 8: MSE = 83897.44, MAE = 43.69


  warn(


Fold 9: MSE = 26342.37, MAE = 41.60


  warn(


Fold 10: MSE = 13469.43, MAE = 34.64


In [62]:
print("\nRandom Forest Final Performance:")
print(f"Average MSE: {np.mean(mse_scores_rf):.2f}")
print(f"Average MAE: {np.mean(mae_scores_rf):.2f}")


Random Forest Final Performance:
Average MSE: 28725.05
Average MAE: 41.12


To evaluate model performance more concretely, a comparison DataFrame was created displaying actual vs. predicted prices from the validation set. 

This allows for a direct, side-by-side assessment of how closely the model's predictions align with real values. The top 20 rows were shown to visually inspect prediction accuracy.

In [63]:
# Create DataFrame with actual and predicted prices
comparison_df = pd.DataFrame({
    'Actual Price': y_val.values.flatten(),
    'Predicted Price': y_pred_rf1.flatten()
})

# Show top 20
comparison_df.head(20)

Unnamed: 0,Actual Price,Predicted Price
0,6673,6666.261632
1,14729,14659.285819
2,30486,30005.804648
3,9028,9027.385128
4,6181,6172.255448
5,2740,2717.308576
6,5409,5448.799326
7,6505,6504.942295
8,5640,5629.030219
9,6165,6170.567657


# XGBoost Model

An XGBoost Regressor is built with carefully tuned hyperparameters to improve predictive accuracy and control overfitting.
The model is evaluated using 10-fold cross-validation to ensure reliable performance across different subsets of the data. 

In [64]:
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}")
    
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]  

    xgb = XGBRegressor(
        n_estimators=314,              
        learning_rate=0.07,            
        max_depth=9,
        subsample=0.94,
        colsample_bytree=0.97,
        gamma=0.26,
        reg_alpha=0.15,
        reg_lambda=1.46,
        random_state=42,
        n_jobs=-1
    )
    xgb.fit(X_tr, y_tr)

    # Predict and evaluate
    y_pred_xgb1 = xgb.predict(X_val)
    mse = mean_squared_error(y_val, y_pred_xgb1)
    mae = mean_absolute_error(y_val, y_pred_xgb1)

    print(f"Fold {fold + 1}: MSE = {mse:.2f}, MAE = {mae:.2f}")
    mse_scores_xgb.append(mse)
    mae_scores_xgb.append(mae)



Fold 1
Fold 1: MSE = 111196.60, MAE = 102.83

Fold 2
Fold 2: MSE = 37323.79, MAE = 80.22

Fold 3
Fold 3: MSE = 58822.49, MAE = 92.77

Fold 4
Fold 4: MSE = 87108.24, MAE = 95.22

Fold 5
Fold 5: MSE = 49053.64, MAE = 91.24

Fold 6
Fold 6: MSE = 41390.31, MAE = 79.62

Fold 7
Fold 7: MSE = 66331.76, MAE = 89.67

Fold 8
Fold 8: MSE = 72919.74, MAE = 85.72

Fold 9
Fold 9: MSE = 79513.33, MAE = 93.47

Fold 10
Fold 10: MSE = 37971.45, MAE = 77.55


So, In each fold, the model was trained on 90% of the data and validated on the remaining 10%. 

Performance was assessed using Mean Squared Error (MSE) and Mean Absolute Error (MAE), with average scores reported to summarize the model’s overall effectiveness in predicting Airbnb listing prices.

In [65]:
# Final average performance
print("\nXGBoost Cross-Validation Performance:")
print(f"Average MSE: {np.mean(mse_scores_xgb):.2f}")
print(f"Average MAE: {np.mean(mae_scores_xgb):.2f}")


XGBoost Cross-Validation Performance:
Average MSE: 64163.13
Average MAE: 88.83


In [66]:
# Create DataFrame with actual and predicted prices
comparison_df = pd.DataFrame({
    'Actual Price': y_val.values.flatten(),
    'Predicted Price': y_pred_xgb1.flatten()
})

# Show top 20
comparison_df.head(20)

Unnamed: 0,Actual Price,Predicted Price
0,6673,6651.904297
1,14729,14914.351562
2,30486,30616.869141
3,9028,9014.174805
4,6181,6247.50293
5,2740,2777.513184
6,5409,5380.056152
7,6505,6491.900391
8,5640,5747.643555
9,6165,6182.474609


[CV] END max_depth=None, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=200; total time=  11.4s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=250; total time=  16.9s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  15.4s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  15.9s
[CV] END max_depth=25, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=300; total time=  18.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=  10.4s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   2.5s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   2.9s
[CV] END max_depth=20, max_features

# **4.4Ensemble Methods**
**Combined Model** 

In [37]:
# Loop over K-Folds
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # === Train ANN ===
    ann = build_ann_model(input_dim=X.shape[1])
    early_stop = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, min_lr=1e-6)
    ann.fit(X_train, y_train, validation_data=(X_val, y_val),
            epochs=200, batch_size=64, callbacks=[early_stop, reduce_lr], verbose=0)
    y_pred_ann = ann.predict(X_val, verbose=0).flatten()

    # === Train Random Forest ===
    rf = RandomForestRegressor(n_estimators=250, max_depth=15, random_state=42)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_val)

    # === Train XGBoost ===
    xgb = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=8, random_state=42)
    xgb.fit(X_train, y_train)
    y_pred_xgb = xgb.predict(X_val)

    # === Ensemble: Average predictions ===
    y_pred_ensemble = (y_pred_ann + y_pred_rf + y_pred_xgb) / 3

    mse = mean_squared_error(y_val, y_pred_ensemble)
    mae = mean_absolute_error(y_val, y_pred_ensemble)

    print(f"Fold {fold + 1} 📊 MSE: {mse:.2f}, MAE: {mae:.2f}")
    mse_scores.append(mse)
    mae_scores.append(mae)


Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 1 📊 MSE: 250210.37, MAE: 213.68

Fold 2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 2 📊 MSE: 135724.30, MAE: 183.50

Fold 3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 3 📊 MSE: 168761.29, MAE: 225.62

Fold 4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 4 📊 MSE: 166210.60, MAE: 233.35

Fold 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 5 📊 MSE: 129196.28, MAE: 190.40

Fold 6


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 6 📊 MSE: 114268.72, MAE: 188.80

Fold 7


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 7 📊 MSE: 109891.54, MAE: 168.59

Fold 8


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 8 📊 MSE: 147777.67, MAE: 174.84

Fold 9


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 9 📊 MSE: 124276.00, MAE: 181.64

Fold 10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 10 📊 MSE: 117032.53, MAE: 182.86


In [38]:
# === Final Evaluation ===
print("\nEnsemble Performance Summary:")
print(f"Average MSE: {np.mean(mse_scores):.2f}")
print(f"Average MAE: {np.mean(mae_scores):.2f}")


Ensemble Performance Summary:
Average MSE: 146334.93
Average MAE: 194.33


In [67]:
# Create DataFrame with actual and predicted prices
comparison_df = pd.DataFrame({
    'Actual Price': y_val.values.flatten(),
    'Predicted Price': y_pred_ensemble.flatten()
})

# Show top 20
comparison_df.head(20)

Unnamed: 0,Actual Price,Predicted Price
0,6673,6732.985805
1,14729,14841.88343
2,30486,28967.393778
3,9028,8983.167637
4,6181,6207.954385
5,2740,2512.275627
6,5409,5393.661038
7,6505,6586.58531
8,5640,5645.24526
9,6165,6175.109374


# **Conclusion**

Among the three models, Random Forest achieved the best performance with the lowest MAE (~ 41), indicating highly accurate predictions.
XGBoost followed closely with a moderate MAE (~ 88) and good generalization across folds.
The custom ANN model, while performing consistently, had a higher MAE ( ~ 546), suggesting room for further tuning or feature enhancement.
and the combined MAE is 194.33


Overall, tree-based models (especially Random Forest) proved more effective for this Airbnb price prediction task.