## Modeling (Part A)

## Loading Data

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xg 
import numpy as np 

In [3]:
data = pd.read_csv("data/cleaned_data.csv")

data.head()

Unnamed: 0,Price,Shared Room,Private Room,Person Capacity,Superhost,Multiple Rooms,Business,Cleanliness Rating,Guest Satisfaction,Bedrooms,...,City_Budapest,City_Lisbon,City_Paris,City_Rome,City_Vienna,Day_Weekday,Day_Weekend,Room_Type_Entire home/apt,Room_Type_Private room,Room_Type_Shared room
0,194.033698,False,True,2.0,False,1,0,10.0,93.0,1,...,False,False,False,False,False,True,False,False,True,False
1,344.245776,False,True,4.0,False,0,0,8.0,85.0,1,...,False,False,False,False,False,True,False,False,True,False
2,433.529398,False,True,4.0,False,0,1,9.0,90.0,2,...,False,False,False,False,False,True,False,False,True,False
3,485.552926,False,True,2.0,True,0,0,10.0,98.0,1,...,False,False,False,False,False,True,False,False,True,False
4,552.808567,False,True,3.0,False,0,0,8.0,100.0,2,...,False,False,False,False,False,True,False,False,True,False


In [6]:
X = data.drop('Price', axis=1)
y = data['Price']

# Splitting the data into train, validation, and test sets
# 60% train, 20% validation, 20% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


## Random Forest Regressor

In [8]:
# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)

# Train the model on the training set
rf_model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = rf_model.predict(X_val)

# Calculate the Mean Squared Error on the validation set
rmse = mean_squared_error(y_val, y_val_pred)

print("RMSE : % f" %(rmse)) 

5547.864448238621

### Tuning Radom Forest

Doing a GridSearch in order to find the best parameters

In [9]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the grid search 
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
best_params


Fitting 3 folds for each of 81 candidates, totalling 243 fits


{'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 150}

Tunned Random Forest

In [22]:
# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42, max_depth= 30, 
                                 min_samples_leaf= 1, min_samples_split= 2, n_estimators= 150)

# Train the model on the training set
rf_model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = rf_model.predict(X_val)

# Calculate the Mean Squared Error on the validation set
rmse = mean_squared_error(y_val, y_val_pred)

print("RMSE : % f" %(rmse)) 

RMSE :  5527.145102


## XGBoost Regressor

In [25]:
# Instantiation 
xgb_r = xg.XGBRegressor(objective ='reg:squarederror', seed = 42) 
  
# Fitting the model 
xgb_r.fit(X_train, y_train) 
  
# Predict the model 
y_val_pred = xgb_r.predict(X_val) 
  
# RMSE Computation 
rmse = mean_squared_error(y_val, y_val_pred)
print("RMSE : % f" %(rmse)) 

RMSE :  6341.909488


### Tuning XGBoost Regressor

In [29]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5]
}

# Initialize the XGBoost Regressor
xgb_reg = xg.XGBRegressor(objective='reg:squarederror')

# Initialize the Grid Search
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit Grid Search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_parameters = grid_search.best_params_
best_parameters

Fitting 3 folds for each of 108 candidates, totalling 324 fits


KeyboardInterrupt: 

Tunned XGBoost

In [28]:
# Instantiation 
xgb_r = xg.XGBRegressor(objective ='reg:squarederror', seed = 42,
                       learning_rate= 0.1, max_depth= 7, min_child_weight= 1, n_estimators= 300) 
  
# Fitting the model 
xgb_r.fit(X_train, y_train) 
  
# Predict the model 
y_val_pred = xgb_r.predict(X_val) 
  
# RMSE Computation 
rmse = mean_squared_error(y_val, y_val_pred)
print("RMSE : % f" %(rmse)) 

RMSE :  6012.555554
