# Getting Started

## Prerequisites

### Importing Libraries

In [18]:
# data manipulation
import pickle
import numpy as np
import pandas as pd

# ensemble models and auto-tuning
import optuna
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# time-series models and auto-tuning
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# evaluation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split

# WARNING
import warnings

### Loading Data

In [2]:
def load_object(file_name):
    with open(file_name, 'rb') as f:
        data= pickle.load(f)
    return data

In [3]:
train_data = load_object('notebook_artifacts/train_data.pkl')
test_data = load_object('notebook_artifacts/test_data.pkl')

In [4]:
train_data.head()

Unnamed: 0_level_0,meantemp,humidity,wind_speed,meanpressure,meantemp (t-1)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,7.4,92.0,2.98,1017.8,10.0
2013-01-03,7.167,87.0,4.633,1018.667,7.4
2013-01-04,8.667,71.333,1.233,1017.167,7.167
2013-01-05,6.0,86.833,3.7,1016.5,8.667
2013-01-06,7.0,82.8,1.48,1018.0,6.0


# Performing Model Selection

## Ensemble Models

In [28]:
# Load dataset
X_train, y_train = train_data.drop('meantemp', axis=1), train_data['meantemp']
# Define objective function
def objective(trial):
    # Choose regressor
    regressor_name = trial.suggest_categorical('regressor', ['XGBoost', 'LightGBM', 'CatBoost', 'RandomForest', 'AdaBoost'])
    
    # Define hyperparameters
    if regressor_name == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
            'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
        }
        regressor = XGBRegressor(**params, verbosity=0)
    elif regressor_name == 'LightGBM':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
            'num_leaves': trial.suggest_int('num_leaves', 2, 50)
        }
        regressor = LGBMRegressor(**params, verbosity=-1)
    elif regressor_name == 'CatBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
            'depth': trial.suggest_int('depth', 3, 10)
        }
        regressor = CatBoostRegressor(**params, verbose= False)
    elif regressor_name == 'RandomForest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10)
        }
        regressor = RandomForestRegressor(**params)
    elif regressor_name == 'AdaBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1)
        }
        regressor = AdaBoostRegressor(**params)
    
    # Evaluate the regressor using cross-validation
    score = -cross_val_score(regressor, X, y, scoring='neg_mean_squared_error', cv=5).mean()
    
    return score

In [29]:
# Optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-02-12 20:04:47,219] A new study created in memory with name: no-name-64a5aea5-f554-4c02-8705-a6ad13b449f1
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-02-12 20:04:47,681] Trial 0 finished with value: 2.120110864531192 and parameters: {'regressor': 'LightGBM', 'n_estimators': 376, 'max_depth': 4, 'learning_rate': 0.02939079206156143, 'num_leaves': 49}. Best is trial 0 with value: 2.120110864531192.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2024-02-12 20:04:49,373] Trial 1 finished with value: 2.8793049412760343 and parameters: {'regressor': 'AdaBoost', 'n_estimators': 115, 'learning_rate': 0.049947702302951494}. Best is trial 0 with value: 2.120110864531192.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2024-02-12 20:04:51,299] Trial 2 finished with value: 3.0922905550288826 and parameters: {'regressor': 'AdaBoost', 'n_estimators': 129, 'learning_rate': 0.005579092171936288}. Bes

In [30]:
# Print the best hyperparameters
print('Best trial:')
best_trial = study.best_trial
print('Value: ', best_trial.value)
print('Params: ')
for key, value in best_trial.params.items():
    print(f'    {key}: {value}')

Best trial:
Value:  2.043845633746705
Params: 
    regressor: RandomForest
    n_estimators: 476
    max_depth: 7


## Timer Series Models

In [31]:
X_split_train, X_split_test, y_split_train, y_split_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [32]:
def auto_tune_time_series(train_data):
    warnings.filterwarnings("ignore", category=UserWarning, module="statsmodels")
    X_train, y_train = train_data.drop('meantemp', axis=1), train_data['meantemp']
    arima_model = auto_arima(y_split_train, X=X_split_train, seasonal=True, suppress_warnings=True)
    best_model = SARIMAX(y_split_train, order=arima_model.order, seasonal_order=arima_model.seasonal_order, exog=X_split_train).fit(disp=False)
    predictions = best_model.get_forecast(steps=len(X_split_test), exog=X_split_test).predicted_mean
    warnings.resetwarnings()
    return best_model, predictions

In [33]:
best_model, predictions = auto_tune_time_series(train_data)
mse = mean_squared_error(y_split_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 2.615749346108933


  return get_prediction_index(


# Final Model

- Selected Model: Random Forest Regressor.
- Below are the hyperparameters:
  - n_estimators: 476
  -  max_depth: 7
- MSE: 2.0447