In [10]:
# --- Model Tuning Notebook ---
import pandas as pd
import numpy as np
import os
os.chdir("/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred")
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, mean_squared_error
from features.build_features import FeatureEngineering
from models.arima import ARIMAModel
from models.prophet import ProphetModel
from models.lstm import LSTMModel
from models.xgboost_model import XGBoostModel
import logging

In [11]:
# Project Directory Setup (adjust as necessary)
project_dir = os.getcwd()
raw_data_dir = os.path.join(project_dir, "data", "raw", "1023")

In [12]:
# Load Data
def load_data(filepath):
    try:
        df = pd.read_csv(filepath)
        print(f"Successfully loaded data from: {filepath}")
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except Exception as e:
        print(f"An error occurred loading data: {e}")
        return None

In [13]:
sp500_companies_path = os.path.join(raw_data_dir, "sp500_companies.csv")
sp500_index_path = os.path.join(raw_data_dir, "sp500_index.csv")
sp500_stocks_path = os.path.join(raw_data_dir, "sp500_stocks.csv")

sp500_companies_df = load_data(sp500_companies_path)
sp500_index_df = load_data(sp500_index_path)
sp500_stocks_df = load_data(sp500_stocks_path)
sp500_stocks_df['Date'] = pd.to_datetime(sp500_stocks_df['Date'])

Successfully loaded data from: /Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/data/raw/1023/sp500_companies.csv
Successfully loaded data from: /Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/data/raw/1023/sp500_index.csv
Successfully loaded data from: /Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/data/raw/1023/sp500_stocks.csv


In [14]:
def preprocess_stocks_data(sp500_stocks_df):
    if sp500_stocks_df is None:
        return None
    sp500_stocks_df = sp500_stocks_df.dropna(subset=["Adj Close"])
    sp500_stocks_df = sp500_stocks_df.sort_values(by="Date").reset_index(drop=True)
    return sp500_stocks_df

In [15]:
def create_and_merge_features(sp500_stocks_df, sp500_companies_df):
    if sp500_stocks_df is None:
        return None
    feature_eng = FeatureEngineering()
    lags_to_use = [1, 7, 30]
    sp500_stocks_df = feature_eng.create_lag_features(
        sp500_stocks_df, "Adj Close", lags_to_use
    )
    windows_to_use = [7, 30, 90]
    sp500_stocks_df = feature_eng.create_rolling_features(
        sp500_stocks_df, "Adj Close", windows_to_use
    )
    sp500_stocks_df = feature_eng.create_calendar_features(sp500_stocks_df, "Date")
    sp500_stocks_df = feature_eng.merge_company_data(
        sp500_stocks_df, sp500_companies_df
    )
    return sp500_stocks_df

In [16]:
def split_data(sp500_stocks_df):
    if sp500_stocks_df is None:
        return None, None, None
    feature_eng = FeatureEngineering()
    train_data, val_data, test_data = feature_eng.time_series_split(
        sp500_stocks_df, "Date"
    )
    return train_data, val_data, test_data

In [17]:
sp500_stocks_df = preprocess_stocks_data(sp500_stocks_df)
sp500_stocks_df = create_and_merge_features(sp500_stocks_df, sp500_companies_df)
train_data, val_data, test_data = split_data(sp500_stocks_df)

Merged the stock data with company data based on Symbol column.
Data has been split in a time-based fashion.


In [19]:
# Define a custom RMSE scorer
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [23]:
# 1. ARIMA Hyperparameter Tuning
if train_data is not None:
    print("\n--- ARIMA Hyperparameter Tuning ---")

    class ARIMAGridSearchCVWrapper:
        def __init__(self, order):
            self.order = order
            self.model = ARIMAModel(order=self.order)

        def fit(self, X, y=None): # X will be train_data when GridSearchCV uses it
                self.model.train(X, target_column="Adj Close")
                return self

        def predict(self, X):
            return self.model.predict(X, target_column = "Adj Close")

        def get_params(self, deep=True):
            return {"order" : self.order}

        def set_params(self, **params):
            self.order = params['order']
            self.model = ARIMAModel(order=self.order)
            return self

    param_grid = {'order': [(p, d, q) for p in range(1, 3) for d in range(1, 2) for q in range(1, 3)]}
    tscv = TimeSeriesSplit(n_splits=3)
    arima_wrapper = ARIMAGridSearchCVWrapper(order=(5,1,0)) # pass in any order as we will be searching across parameters
    grid = GridSearchCV(arima_wrapper, param_grid, cv=tscv, scoring=rmse_scorer, verbose = 0)
    grid.fit(train_data)
    best_order = grid.best_params_['order']
    best_score = -grid.best_score_
    print(f"Best ARIMA order: {best_order} and RMSE: {best_score:.2f}")


--- ARIMA Hyperparameter Tuning ---
ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.


Traceback (most recent call last):
  File "/Users/gagigetsadze/Desktop/Python programs/Time_Series_Pred/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'



ARIMA model has been trained.
Best ARIMA order: (1, 1, 1) and RMSE: nan


In [27]:
# 2. Prophet Hyperparameter Tuning
if train_data is not None:
    print("\n--- Prophet Hyperparameter Tuning ---")
    # We can't do a grid search using prophet directly. Therefore, we will use an example hyperparameter tuning
    # process where we tune the changepoint_prior_scale

    param_grid = {'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5]}
    tscv = TimeSeriesSplit(n_splits=3)

    best_score = float('inf')
    best_param = None
    for param in param_grid['changepoint_prior_scale']:
        scores = []
        for train_index, val_index in tscv.split(train_data):
            train, val = train_data.iloc[train_index], train_data.iloc[val_index]
            model = ProphetModel(changepoint_prior_scale=param)
            model.train(train, target_column='Adj Close')
            if model.trained:
                df_prophet_val = val.rename(columns={'Date': 'ds'})
                forecast = model.model.predict(df_prophet_val)
                val_predictions = forecast['yhat'].values
                score = rmse(val['Adj Close'].values, val_predictions)
                scores.append(score)
            else:
                scores.append(float('inf'))
        mean_score = np.mean(scores)
        if mean_score < best_score:
            best_score = mean_score
            best_param = param
    print(f"Best Prophet Parameter:  changepoint_prior_scale = {best_param} and RMSE: {best_score:.2f}")


--- Prophet Hyperparameter Tuning ---


TypeError: __init__() got an unexpected keyword argument 'changepoint_prior_scale'

In [22]:
# 3. LSTM Hyperparameter Tuning
if train_data is not None:
    print("\n--- LSTM Hyperparameter Tuning ---")

    param_grid = {
    'hidden_size': [20, 50, 100],
    'num_layers': [1, 2],
    'learning_rate': [0.001, 0.005, 0.01],
    'num_epochs' : [50,100]
    }
    tscv = TimeSeriesSplit(n_splits=3)


    best_score = float('inf')
    best_params = None


    for hidden_size in param_grid['hidden_size']:
        for num_layers in param_grid['num_layers']:
            for learning_rate in param_grid['learning_rate']:
                for num_epochs in param_grid['num_epochs']:
                    scores = []
                    for train_index, val_index in tscv.split(train_data):
                        train, val = train_data.iloc[train_index], train_data.iloc[val_index]
                        model = LSTMModel(hidden_size = hidden_size, num_layers = num_layers, learning_rate=learning_rate, num_epochs=num_epochs )
                        model.train(train, val, target_column='Adj Close')

                        if model.trained:
                            val_predictions = model.predict(val, target_column='Adj Close')
                            if val_predictions is not None:
                                score = rmse(val['Adj Close'].iloc[10:].values, val_predictions)
                                scores.append(score)
                            else:
                                scores.append(float('inf'))
                        else:
                            scores.append(float('inf'))
                    mean_score = np.mean(scores)
                    if mean_score < best_score:
                        best_score = mean_score
                        best_params = {'hidden_size' : hidden_size, 'num_layers': num_layers, 'learning_rate' : learning_rate, 'num_epochs' : num_epochs}
    if best_params is not None:
        print(f"Best LSTM Parameters:  hidden_size={best_params['hidden_size']}, num_layers={best_params['num_layers']}, learning_rate={best_params['learning_rate']}, num_epochs={best_params['num_epochs']} and RMSE : {best_score:.2f}")


--- LSTM Hyperparameter Tuning ---
Epoch: 1/50, Training Loss:0.0881, Validation Loss: 0.0259
Epoch: 2/50, Training Loss:0.0845, Validation Loss: 0.0245
Epoch: 3/50, Training Loss:0.0900, Validation Loss: 0.0268
Epoch: 4/50, Training Loss:0.0917, Validation Loss: 0.0275
Epoch: 5/50, Training Loss:0.0881, Validation Loss: 0.0260
Epoch: 6/50, Training Loss:0.0876, Validation Loss: 0.0258
Epoch: 7/50, Training Loss:0.0873, Validation Loss: 0.0256
Epoch: 8/50, Training Loss:0.0855, Validation Loss: 0.0249
Epoch: 9/50, Training Loss:0.0923, Validation Loss: 0.0278
Epoch: 10/50, Training Loss:0.0865, Validation Loss: 0.0250
Epoch: 11/50, Training Loss:0.0884, Validation Loss: 0.0260
Epoch: 12/50, Training Loss:0.0879, Validation Loss: 0.0257
Epoch: 13/50, Training Loss:0.0892, Validation Loss: 0.0264
Epoch: 14/50, Training Loss:0.0889, Validation Loss: 0.0262
Epoch: 15/50, Training Loss:0.0877, Validation Loss: 0.0258
Epoch: 16/50, Training Loss:0.0874, Validation Loss: 0.0257
Epoch: 17/50,

KeyboardInterrupt: 

In [25]:
# 4. XGBoost Hyperparameter Tuning
if train_data is not None:
    print("\n--- XGBoost Hyperparameter Tuning ---")
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 1.0]
    }
    tscv = TimeSeriesSplit(n_splits=3)
    xgb_model = XGBoostModel()
    features = [
        col
        for col in train_data.columns
        if col != "Adj Close" and col != "Date" and train_data[col].dtype in ['int64', 'float64']
    ]
    if features:
        grid = GridSearchCV(xgb_model, param_grid, cv=tscv, scoring=rmse_scorer, verbose = 0)
        grid.fit(train_data[features], train_data['Adj Close'])
        best_params = grid.best_params_
        best_score = -grid.best_score_
        print(f"Best XGBoost Parameters: {best_params} and RMSE: {best_score:.2f}")
    else:
        print("No numerical features available for XGBoost Tuning")


--- XGBoost Hyperparameter Tuning ---


InvalidParameterError: The 'estimator' parameter of GridSearchCV must be an object implementing 'fit'. Got <models.xgboost_model.XGBoostModel object at 0x148342df0> instead.