<a href="https://colab.research.google.com/github/Alinehbg/EnsembleLearning/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# 0. Data Preprocessing

In [None]:
# Read the preprocessed dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="data_processed.csv")
# Preview the first five rows to check the columns that were loaded
df.head(n=5)

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,neighbourhood_1,neighbourhood_2,neighbourhood_3
0,40.64749,-73.97237,149,1,9,0.21,6,365,0,1,0,0,1,0,0,0,0.0,0.0,1.0
1,40.75362,-73.98377,225,1,45,0.38,2,355,1,0,0,0,0,1,0,0,0.0,0.0,1.0
2,40.80902,-73.9419,150,3,0,0.0,1,365,0,1,0,0,0,1,0,0,-1.0,0.0,0.0
3,40.68514,-73.95976,89,1,270,4.64,1,194,1,0,0,0,1,0,0,0,1.0,0.0,0.0
4,40.79851,-73.94399,80,10,9,0.1,1,0,1,0,0,0,0,1,0,0,0.0,0.0,1.0


In [None]:
# Separate the features from the target variable.
# "Unnamed: 0" (values 0, 1, 2, ... in the df.head() preview) looks like the row
# index that was written into the CSV when it was saved. It is a row number, not
# a property of a listing, so it is removed from the features along with the target.
# errors="ignore" keeps this cell working when that index column is not present.
X = df.drop(columns=["price", "Unnamed: 0"], errors="ignore")
y = df["price"]

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 1. Model

In [None]:
# Define the XGBoost Regressor model (squared-error objective, GPU histogram training).
# tree_method="gpu_hist" is deprecated since XGBoost 2.0; the supported spelling is
# tree_method="hist" together with device="cuda" (requires xgboost >= 2.0).
# random_state is the scikit-learn wrapper's name for the seed.
# NOTE(review): the grid-search cell in this notebook builds its own estimator and
# does not reference xgb_model — confirm whether this instance is still needed.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",
    device="cuda",
    random_state=42,
)

*Note: `reg:squarederror` sets the training objective to the squared-error loss, so the model is fitted by minimizing the mean squared error (MSE) between its predictions and the true values. This is appropriate for regression problems, where the goal is to predict a continuous numerical value, such as the price of a listing.*

In [None]:
# Search space for the cross-validated grid search: each key is an estimator
# hyperparameter and each list holds the candidate values to try for it.
parameters = dict(
    max_depth=[8, 9, 10],
    min_child_weight=[0, 1],
    reg_alpha=[3, 4, 5],
    reg_lambda=[0.99, 1, 1.5],
    subsample=[0.9, 1],
)

In [None]:
# Exhaustive grid search with 5-fold cross-validation over `parameters`.
# The base estimator pins the hyperparameters that are not being tuned.
# tree_method="gpu_hist" is deprecated since XGBoost 2.0; the supported spelling is
# tree_method="hist" together with device="cuda" (requires xgboost >= 2.0).
# random_state is the scikit-learn wrapper's name for the seed.
# No `scoring` is passed, so candidates are ranked by the estimator's own score method.
# NOTE(review): the saved output of the fit cell reports 3 folds, which does not
# match cv=5 here — re-run the notebook to refresh the output.
xgb_cv = GridSearchCV(
    estimator=xgb.XGBRegressor(
        objective="reg:squarederror",
        gamma=0,
        max_delta_step=0,
        max_leaves=0,
        colsample_bytree=0.55,
        learning_rate=0.045,
        max_bin=300,
        n_estimators=140,
        scale_pos_weight=0,
        tree_method="hist",
        device="cuda",
        random_state=42,
    ),
    param_grid=parameters,
    verbose=2,
    cv=5,
)

In [None]:
# Run the grid search on the training split (every candidate is cross-validated)
xgb_cv.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   0.9s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   0.7s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   0.7s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=1; total time=   0.7s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=1; total time=   0.7s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=1; total time=   0.7s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=1, subsample=0.9; total time=   0.7s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=1, subsample=0.9; total time=   0.7s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=1, subsample=0.9; total time=   0.7

In [None]:
# Report the hyperparameter combination selected by the cross-validated search
best_params = xgb_cv.best_params_
print("Best Hyperparameters: ", best_params)

Best Hyperparameters:  {'max_depth': 9, 'min_child_weight': 1, 'reg_alpha': 4, 'reg_lambda': 1, 'subsample': 1}


In [None]:
# Predict the target for the held-out test rows with the model chosen by the search
y_pred = xgb_cv.predict(X=X_test)

In [None]:
# Score the test-set predictions with Mean Squared Error (lower is better)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error: ", mse)

Mean Squared Error:  3048.3330730702573


In [None]:
# Root Mean Squared Error: the square root of the MSE, which puts the error
# back in the same units as the target (price).
# numpy is imported once in the notebook's top import cell.
rmse = np.sqrt(mse)
print("Root Mean Squared Error: ", rmse)

Root Mean Squared Error:  55.211711376031964


In [None]:
# R-squared (coefficient of determination) of the test-set predictions.
# r2_score is imported once in the notebook's top import cell.
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)

R-squared:  0.5513634911981316
