<a href="https://colab.research.google.com/github/Alinehbg/EnsembleLearning/blob/Pauline/XGBoost_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd

# 0. Data Preprocessing

In [None]:
# Load the data into a pandas DataFrame
df = pd.read_csv("data_processed_v4.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Bronx,...,neighbourhood_1,neighbourhood_2,neighbourhood_3,neighbourhood_4,neighbourhood_5,neighbourhood_6,neighbourhood_7,neighbourhood_8,neighbourhood_9,neighbourhood_10
0,0,40.64749,-73.97237,149.0,1.0,9.0,0.21,6.0,365.0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,40.75362,-73.98377,225.0,1.0,45.0,0.38,2.0,355.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,40.80902,-73.9419,150.0,3.0,0.0,0.0,1.0,365.0,0,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,40.68514,-73.95976,89.0,1.0,270.0,4.64,1.0,194.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,40.79851,-73.94399,80.0,10.0,9.0,0.1,1.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Removing outliers

price_median = df["price"].median()
price_std = df["price"].std()

lb = 10.0
hb = price_median + 3*price_std 

df = df[(df['price'] >= lb) & (df['price'] <= hb)]

In [None]:
# Separate the features from the target variable
X = df.drop("price", axis=1)
y = df["price"]

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Model

In [None]:
# Define the XGBoost Regressor model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist', seed=42)

*Note: `reg:squarederror` specifies that the model should use the mean squared error (MSE) as the objective function to minimize during training. This is appropriate for regression problems, where the goal is to predict a continuous numerical value, such as the price of a real estate property.*

In [16]:
# Define the hyperparameters to tune using cross-validation
parameters = {
              
              'max_depth': [8, 9, 10],
              'min_child_weight': [0, 1],
              'reg_alpha': [3, 4, 5],
              'reg_lambda': [0.99, 1, 1.5],
              'subsample': [0.9, 1, 1.5]
              }

In [17]:
xgb_cv = GridSearchCV(
         estimator = xgb.XGBRegressor(objective='reg:squarederror',
                                      gamma = 0,
                                      max_delta_step = 0,
                                    
                                      max_leaves = 0,
                                      colsample_bytree = 0.55,
                                      learning_rate = 0.045,
                                      max_bin = 300,
                                      n_estimators = 140,

                                      scale_pos_weight = 0,
                                      
                                      tree_method='gpu_hist',
                                      seed=42),
         param_grid = parameters,
         verbose = 2,
         cv = 10
)

In [18]:
# Fit the GridSearchCV object to the training data
xgb_cv.fit(X_train, y_train)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   1.0s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   1.0s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   1.2s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   1.2s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   1.0s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   0.8s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   0.8s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; total time=   0.8s
[CV] END max_depth=8, min_child_weight=0, reg_alpha=3, reg_lambda=0.99, subsample=0.9; 

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.9/dist-packages/xgboost/sklearn.py", line 1025, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.9/dist-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.9/dist-packages/xgboost/training.py", line 185, 

In [19]:
# Print the best hyperparameters found by cross-validation
print("Best Hyperparameters: ", xgb_cv.best_params_)

Best Hyperparameters:  {'max_depth': 10, 'min_child_weight': 0, 'reg_alpha': 5, 'reg_lambda': 1.5, 'subsample': 0.9}


In [20]:
xgb_cv.best_score_

0.5081019457521949

In [21]:
# Make predictions on the test data using the best model
y_pred = xgb_cv.predict(X_test)

In [22]:
# Evaluate the model's performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)

Mean Squared Error:  5340.518252712967


In [23]:
import numpy as np

rmse = np.sqrt(mse)
print("Root Mean Squared Error: ", rmse)

Root Mean Squared Error:  73.07884955794643


In [24]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)

R-squared:  0.5147517701587153
