# Activate Environment and import necessary modules

In [1]:
!source /Users/amirejibiilia/Desktop/Thesis/new_venv/bin/activate

In [2]:
import time
import numpy as np
import pandas as pd 

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, train_test_split # Model evaluation
from sklearn.preprocessing import MinMaxScaler # Data normalization

from xgboost import XGBRegressor, DMatrix, plot_importance # XGBoost

import tensorflow as tf
from tensorflow.keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

import matplotlib.dates as mdates
import optuna
from sklearn.metrics import mean_squared_error

2024-06-20 23:30:48.581781: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# data import
df_train = pd.read_excel("/Users/amirejibiilia/Desktop/Thesis/version_4.xlsx", index_col = 0) 
cols_list = list(df_train.columns) 

In [4]:
df_train.head().round(2)

Unnamed: 0_level_0,HousingIndex,Capitalisation,ExchangeRate,InterestRateOnDeposits,CPI,HouseAffordabilityIndex,RentIndex,MortgageRate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-01-01,163.43,8.53,1.82,7.66,102.09,40.0,111.9,0.16
2005-02-01,159.72,9.56,1.83,7.49,100.32,40.0,122.59,0.16
2005-03-01,178.2,8.62,1.84,7.04,100.59,40.0,123.29,0.16
2005-04-01,183.23,8.81,1.83,7.08,100.42,40.0,129.64,0.16
2005-05-01,175.51,9.45,1.83,7.58,98.71,40.0,133.09,0.16


# XGBoost

## XGBoost - Data Preparation

In [5]:
df_train.head().round(2)

Unnamed: 0_level_0,HousingIndex,Capitalisation,ExchangeRate,InterestRateOnDeposits,CPI,HouseAffordabilityIndex,RentIndex,MortgageRate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-01-01,163.43,8.53,1.82,7.66,102.09,40.0,111.9,0.16
2005-02-01,159.72,9.56,1.83,7.49,100.32,40.0,122.59,0.16
2005-03-01,178.2,8.62,1.84,7.04,100.59,40.0,123.29,0.16
2005-04-01,183.23,8.81,1.83,7.08,100.42,40.0,129.64,0.16
2005-05-01,175.51,9.45,1.83,7.58,98.71,40.0,133.09,0.16


## Hyperparameters Tuning of XGBoost 

In [6]:
# Data split
y_full = df_train['HousingIndex'][:-15]
X_full = df_train.drop(columns=["HousingIndex"])[:-15]

y_test = df_train['HousingIndex'][-15:]
X_test = df_train.drop(columns=["HousingIndex"])[-15:]

In [7]:
# Define objective function for Optuna with Walk-Forward Validation
def objective(trial):
    # Define hyperparameters to optimize
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)
    }

    # Create XGBoost model with the suggested parameters
    model = XGBRegressor(**params, objective='reg:squarederror', random_state=123)

    # Walk-Forward Validation
    initial_train_size = int(len(X_full) * 0.6)
    test_size = int(len(X_full) * 0.1)
    n_records = len(X_full)
    rmse_scores = []

    for i in range(initial_train_size, n_records, test_size):
        train_end = i
        test_end = i + test_size if (i + test_size) < n_records else n_records

        X_train_split, X_test_split = X_full[:train_end], X_full[train_end:test_end]
        y_train_split, y_test_split = y_full[:train_end], y_full[train_end:test_end]

        model.fit(X_train_split, y_train_split)
        y_pred = model.predict(X_test_split)
        rmse = np.sqrt(mean_squared_error(y_test_split, y_pred))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)
    return mean_rmse

# Create study object and optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)                 

# Get best hyperparameters
best_params = study.best_params

# Update the model with the best hyperparameters
model = XGBRegressor(**best_params, objective='reg:squarederror', random_state=123)

# Train the model on the full training data
model.fit(X_full, y_full)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Test RMSE: {test_rmse}")


[I 2024-06-20 23:31:24,232] A new study created in memory with name: no-name-f5bed37b-4a0e-4e8e-beb6-50dcd730cb34
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.004798, 0.004798),
  'gamma': trial.suggest_loguniform('gamma', 0.01531, 0.01531),
  'subsample': trial.suggest_uniform('subsample', 0.579605, 0.579605),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.821926, 0.821926),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 2.039509e-07, 2.039509e-07),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 8.260234e-07, 8.260234e-07)
[I 2024-06-20 23:31:24,655] Trial 0 finished with value: 78.8717260298449 and parameters: {'max_depth': 4, 'learning_rate': 0.004798, 'n_estimators': 211, 'gamma': 0.01531, 'min_child_weight': 17, 'subsample': 0.579605, 'colsample_bytree': 0.821926, 'reg_lambda': 2.039509e-07, 'reg_alpha': 8.260234e-07}. Best is trial 0 with value: 78.8717260298449.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.0047

Test RMSE: 165.3115814870471


In [8]:
# Best Params Themselves!
best_params
df_parameters_nicely = pd.DataFrame([best_params])
df_parameters_nicely

Unnamed: 0,max_depth,learning_rate,n_estimators,gamma,min_child_weight,subsample,colsample_bytree,reg_lambda,reg_alpha
0,4,0.004798,211,0.01531,17,0.579605,0.821926,2.039509e-07,8.260234e-07
