**Table of contents**<a id='toc0_'></a>    
- [Load and Preprocess Data](#toc1_1_)    
- [Helper Functions](#toc1_2_)    
- [Hyperparameter Tuning](#toc1_3_)    
- [Model Training and Evaluation](#toc1_4_)    
- [Run All Models Across All Pairs](#toc1_5_)    
- [Results Analysis](#toc1_6_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import glob
import pickle
import logging
from tabulate import tabulate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from lightgbm import LGBMRegressor
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Dropout
from keras.callbacks import EarlyStopping, History, ModelCheckpoint, CSVLogger, TensorBoard
from keras.optimizers import Adam
from kerastuner import HyperModel, HyperParameters, RandomSearch
from tqdm import tqdm
from datetime import datetime

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')
# plt.style.use('./../.templates/custom_onedark.mplstyle')

# Logging: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  from kerastuner import HyperModel, HyperParameters, RandomSearch


## <a id='toc1_1_'></a>[Load and Preprocess Data](#toc0_)

In [None]:
# Configurable parameters
DATA_DIR = '../data/'
PRICE_HISTORY_DIR = os.path.join(DATA_DIR, 'price_history/')
SENTIMENT_DIR = os.path.join(DATA_DIR, 'sentiments/')
TI_DIR = os.path.join(DATA_DIR, 'technical_indicators/')

MA_HIGH = 45  # long rolling window (rows) for the spread mean/std in the z-score
MA_LOW = 5  # short rolling window (rows) for the spread mean in the z-score
SPREAD_WINDOW = None  # not referenced anywhere in the visible code
# Used both as the forward shift of the target in create_dataset() and as the
# middle ("time step") dimension of the LSTM reshape in run_models_on_pair().
LOOK_BACK = 3
ADD_SENTIMENT_MA = True  # add rolling means of the sentiment columns

# Load ticker pairs
# Derive each ticker from its file name. splitext() removes exactly one
# extension; the previous `.strip('.csv')` removed any run of the characters
# '.', 'c', 's', 'v' from BOTH ends of the name, and `split('/')` assumed
# POSIX separators.
tickers_list = [
    os.path.splitext(os.path.basename(file))[0]
    for file in glob.glob(os.path.join(PRICE_HISTORY_DIR, '*.csv'))
]
pairs_df = pd.read_csv(os.path.join(DATA_DIR, 'ranked_pairs_snp.csv'))
# One row per pair: [ticker_0, ticker_1].
pairs = pairs_df[['ticker_0', 'ticker_1']].values

class TickerData:
    """Bundle of the three per-ticker CSV tables used by this notebook.

    For a given ``ticker`` the constructor reads ``<ticker>.csv`` from
    ``PRICE_HISTORY_DIR``, ``SENTIMENT_DIR`` and ``TI_DIR`` (in that order),
    each indexed by its ``Date`` column.
    """

    def __init__(self, ticker):
        self.ticker = ticker
        csv_name = f'{ticker}.csv'

        def _read_table(directory):
            # NOTE(review): `date_format` is passed without `parse_dates`;
            # pandas documents date_format as applying together with
            # parse_dates, so the index may be left as plain strings here.
            # Confirm against the data before relying on a DatetimeIndex.
            return pd.read_csv(
                os.path.join(directory, csv_name),
                index_col='Date',
                date_format='%Y-%m-%d',
            )

        self.history = _read_table(PRICE_HISTORY_DIR)  # price history table
        self.sentiment = _read_table(SENTIMENT_DIR)  # sentiment table
        self.ti = _read_table(TI_DIR)  # technical-indicator table

logger.info("Loading ticker data...")
# Map ticker symbol -> TickerData; every CSV found in PRICE_HISTORY_DIR is
# expected to have matching sentiment and technical-indicator files.
ticker_progress = tqdm(tickers_list, desc="Loading ticker data")
data = {symbol: TickerData(symbol) for symbol in ticker_progress}
logger.info(f"Loaded data for {len(data)} tickers")

## <a id='toc1_2_'></a>[Helper Functions](#toc0_)

In [3]:
def add_moving_averages(df, moving_averages, columns):
    """Return a copy of *df* with rolling-mean columns appended.

    For every window ``ma`` in *moving_averages* and every name ``col`` in
    *columns*, a new column ``f'{col}_ma{ma}'`` is added holding
    ``df[col].rolling(window=ma).mean()``.  The whole frame is then
    forward-filled, so interior gaps take the previous row's value while
    leading NaNs (including the first ``ma - 1`` rows of each rolling mean)
    stay NaN.

    Args:
        df: Source DataFrame; it is not modified.
        moving_averages: Iterable of integer window sizes (in rows).
        columns: Iterable of column names in *df* to average.

    Returns:
        A new, forward-filled DataFrame containing the original columns
        followed by the moving-average columns (grouped by window).
    """
    df = df.copy()
    # Snapshot the names up front so a one-shot iterable (or a live
    # ``df.columns``) is reused unchanged for every window.
    source_columns = list(columns)
    for ma in moving_averages:
        for col in source_columns:
            df[f'{col}_ma{ma}'] = df[col].rolling(window=ma).mean()
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    return df.ffill()

def get_spread(S1_train, S2_train, S1_full, S2_full):
    """Compute the pair spread using a slope fitted on the training window.

    An OLS regression of ``S2_train`` on ``S1_train`` (with an added constant)
    is fitted; the coefficient looked up under ``S1_train.name`` is ``beta``.
    The spread is then ``S2_full - beta * S1_full`` (the fitted intercept is
    not subtracted).

    Args:
        S1_train: Training-window series for the first leg; must carry a
            ``name`` because the coefficient is looked up by it.
        S2_train: Training-window series for the second leg.
        S1_full: Full-period series for the first leg.
        S2_full: Full-period series for the second leg.

    Returns:
        ``(spread_full, beta)``.
    """
    design = sm.add_constant(S1_train)
    fitted = sm.OLS(S2_train, design).fit()
    beta = fitted.params[S1_train.name]

    # Apply the training-only beta to the whole period.
    return S2_full - beta * S1_full, beta

def create_dataset(data, look_back, target_col="zscore_spread", split_sizes=(0.4, 0.1, 0.5), drop_cols=None):
    """Build chronological train/val/test frames with a forward-shifted target.

    Steps, in order:

    1. Copy *data* and drop every row that contains a NaN.
    2. Derive ``train_size`` and ``val_size`` from ``split_sizes[0]`` and
       ``split_sizes[1]``.  ``split_sizes[2]`` is never read; the test set is
       whatever remains.
    3. Fit ``beta`` with ``get_spread`` on the first ``train_size`` rows only,
       then compute ``spread`` for all rows with that beta.
    4. Add ``zscore_spread``: (``MA_LOW``-row rolling mean minus
       ``MA_HIGH``-row rolling mean) divided by the ``MA_HIGH``-row rolling
       standard deviation of the spread.
    5. Add ``target = data[target_col].shift(-look_back)``.
    6. Optionally drop *drop_cols*, drop rows with NaN again (the rolling
       warm-up rows and the last *look_back* rows), and slice by position.

    Args:
        data: DataFrame containing at least ``'Adj Close_S1'`` and
            ``'Adj Close_S2'``.
        look_back: Number of rows the target is shifted into the future.
        target_col: Column to forecast; it is looked up after ``spread`` and
            ``zscore_spread`` have been added.
        split_sizes: Sequence whose first two entries are the train and
            validation fractions.
        drop_cols: Optional column labels to remove before the final split.

    Returns:
        ``(train_data, val_data, test_data, beta)``.
    """
    data = data.copy().dropna()

    train_size = int(len(data) * split_sizes[0])
    val_size = int(len(data) * split_sizes[1])
    val_end = train_size + val_size

    logger.info(f"Train size: {train_size}, Val size: {val_size}, Test size: {len(data) - train_size - val_size}")

    # Calculate spread using only training data, so validation/test prices
    # never influence beta.
    fit_window = data.iloc[:train_size]
    spread_full, beta = get_spread(
        fit_window['Adj Close_S1'],
        fit_window['Adj Close_S2'],
        data['Adj Close_S1'],
        data['Adj Close_S2'],
    )
    data['spread'] = spread_full

    # Calculate z-score
    rolling_high = data['spread'].rolling(window=MA_HIGH)
    ma_high = rolling_high.mean()
    std_high = rolling_high.std()
    ma_low = data['spread'].rolling(window=MA_LOW).mean()
    data['zscore_spread'] = (ma_low - ma_high) / std_high

    # The value to predict is the target column `look_back` rows ahead.
    data["target"] = data[target_col].shift(-look_back)

    if drop_cols:
        data = data.drop(drop_cols, axis=1)

    data = data.dropna()

    # NOTE: the boundaries below are positions in the cleaned frame. Because
    # the dropna() above removes the rolling warm-up rows, the partitions are
    # shifted relative to the window beta was fitted on, and the test
    # partition is shorter than the size logged earlier.
    train_data = data.iloc[:train_size]
    val_data = data.iloc[train_size:val_end]
    test_data = data.iloc[val_end:]

    logger.info(f"Final dataset shapes - Train: {train_data.shape}, Val: {val_data.shape}, Test: {test_data.shape}")

    return train_data, val_data, test_data, beta

def prepare_dataset(train_data, val_data, test_data):
    """Separate features from the ``"target"`` column in each partition.

    Returns:
        The 6-tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``
        where each ``X_*`` is the partition without ``"target"`` and each
        ``y_*`` is that partition's ``"target"`` column.
    """
    pieces = []
    for frame in (train_data, val_data, test_data):
        pieces.extend((frame.drop(columns=["target"]), frame["target"]))
    return tuple(pieces)

def calculate_metrics(y_true, y_pred):
    """Score predictions against the ground truth.

    Returns:
        Dict with keys ``"r2"``, ``"mae"``, ``"mse"`` and ``"rmse"``, where
        ``rmse`` is the square root of the mean squared error.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return dict(r2=r2, mae=mae, mse=mse, rmse=np.sqrt(mse))

def preprocess_pair_data(pair, end_date='2024-01-01'):
    """Assemble the joint feature frame for one ticker pair.

    The two tickers' sentiment tables are joined on their index (pandas'
    default inner join) with ``_S1`` / ``_S2`` suffixes and, when
    ``ADD_SENTIMENT_MA`` is true, extended with 5/10/20/40/80-row moving
    averages.  The technical-indicator tables are joined the same way, and the
    sentiment frame is then left-joined onto them, so indicator rows without
    sentiment are kept with NaN sentiment values.

    Args:
        pair: Two-element sequence ``(ticker_0, ticker_1)``; both must be keys
            of the module-level ``data`` dict.
        end_date: Exclusive upper bound on the index; only rows with
            ``index < end_date`` are kept.  Defaults to the cutoff that was
            previously hard-coded.

    Returns:
        The merged, date-filtered DataFrame.
    """
    pair_sentiment_df = pd.merge(
        data[pair[0]].sentiment,
        data[pair[1]].sentiment,
        left_index=True,
        right_index=True,
        suffixes=("_S1", "_S2"),
    )
    if ADD_SENTIMENT_MA:
        pair_sentiment_df = add_moving_averages(pair_sentiment_df, [5, 10, 20, 40, 80], columns=pair_sentiment_df.columns)

    pair_ti_df = pd.merge(
        data[pair[0]].ti,
        data[pair[1]].ti,
        left_index=True,
        right_index=True,
        suffixes=("_S1", "_S2"),
    )
    pair_df = pd.merge(pair_ti_df, pair_sentiment_df, left_index=True, right_index=True, how="left")

    # NOTE(review): whether this compares dates or strings depends on the
    # dtype of the index produced when the CSVs were loaded — confirm.
    pair_df = pair_df[pair_df.index < end_date]

    logger.info(f"Preprocessed pair data shape: {pair_df.shape}")

    return pair_df

## <a id='toc1_3_'></a>[Hyperparameter Tuning](#toc0_)

In [4]:
class LSTMHyperModel(HyperModel):
    """KerasTuner hypermodel describing a stacked-LSTM regressor.

    Search space (names are the hyperparameter keys registered on ``hp``):

    * ``num_lstm_layers``: 1-3 LSTM layers, each followed by Dropout.
    * ``lstm_{i}_units``: 50-200 in steps of 50.
    * ``lstm_{i}_dropout``: 0.1-0.5 in steps of 0.1.
    * ``num_dense_layers``: 1-2 ReLU Dense layers.
    * ``dense_{i}_units``: 50-200 in steps of 50.
    * ``learning_rate``: one of 1e-2, 1e-3, 1e-4 (Adam).

    The model ends in a single linear unit and is compiled with MAE loss.
    """

    def __init__(self, input_shape):
        # Let the HyperModel base class initialise its own state first.
        super().__init__()
        # Shape of one sample; the caller in this file passes
        # (X.shape[1], X.shape[2]).
        self.input_shape = input_shape

    def build(self, hp):
        """Build and compile one candidate model for the sampled *hp* values."""
        model = Sequential()

        # Read the layer count once instead of re-querying it every iteration.
        num_lstm_layers = hp.Int('num_lstm_layers', 1, 3)
        for i in range(num_lstm_layers):
            # Every LSTM except the last must emit the full sequence so the
            # next LSTM receives 3-D input.
            return_sequences = i < num_lstm_layers - 1
            units = hp.Int(f'lstm_{i}_units', 50, 200, step=50)
            # Only the first layer defines the model's input shape.
            first_layer_kwargs = {'input_shape': self.input_shape} if i == 0 else {}
            model.add(LSTM(units, return_sequences=return_sequences, **first_layer_kwargs))
            model.add(Dropout(hp.Float(f'lstm_{i}_dropout', 0.1, 0.5, step=0.1)))

        for i in range(hp.Int('num_dense_layers', 1, 2)):
            units = hp.Int(f'dense_{i}_units', 50, 200, step=50)
            model.add(Dense(units, activation='relu'))

        model.add(Dense(1, activation='linear'))

        optimizer = Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4]))
        model.compile(optimizer=optimizer, loss='mae', metrics=['mae', 'mse'])

        return model

def perform_hyperparameter_tuning(X_train_lstm, y_train, *, max_trials=100, executions_per_trial=3, epochs=50, overwrite=False):
    """Run a KerasTuner random search over ``LSTMHyperModel``.

    Args:
        X_train_lstm: 3-D training array; ``shape[1]`` and ``shape[2]`` are
            used as the model input shape.
        y_train: Training targets.
        max_trials: Number of hyperparameter combinations to try.
        executions_per_trial: Models trained per combination.
        epochs: Maximum epochs per execution (early stopping may end sooner).
        overwrite: Forwarded to ``RandomSearch``.  The results directory and
            project name are fixed, so with the default ``False`` an existing
            project on disk is reloaded; pass ``True`` to start a fresh
            search, e.g. when tuning on different data.

    Returns:
        ``(best_model, best_hyperparameters)`` as reported by the tuner.
    """
    logger.info("Starting hyperparameter tuning...")
    hypermodel = LSTMHyperModel(input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]))

    # Timestamped directory so each tuning run gets its own TensorBoard logs.
    log_dir = os.path.join("logs", "hyperparameter_tuning", datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

    tuner = RandomSearch(
        hypermodel,
        objective='val_loss',
        max_trials=max_trials,
        executions_per_trial=executions_per_trial,
        directory='tuner_results',
        project_name='lstm_tuner',
        overwrite=overwrite,
    )

    early_stopping = EarlyStopping(monitor='val_loss', patience=4)

    # 20% of the supplied training data is held out for the 'val_loss'
    # objective.
    tuner.search(X_train_lstm, y_train, epochs=epochs, validation_split=0.2,
                 callbacks=[early_stopping, tensorboard_callback],
                 verbose=1)

    best_model = tuner.get_best_models(num_models=1)[0]
    best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

    logger.info(f"Best hyperparameters: {best_hyperparameters.values}")

    return best_model, best_hyperparameters

## <a id='toc1_4_'></a>[Model Training and Evaluation](#toc0_)

In [5]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit an estimator with ``fit``/``predict`` and score it on the test set.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets.
        model_name: Label used in the log messages.

    Returns:
        Dict with ``'metrics'`` (output of ``calculate_metrics``) and
        ``'y_pred'`` (the raw test predictions).
    """
    logger.info(f"Training {model_name}...")

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    scores = calculate_metrics(y_test, predictions)

    logger.info(f"{model_name} metrics: {scores}")

    outcome = dict(metrics=scores, y_pred=predictions)
    return outcome

def train_and_evaluate_lstm(model, X_train, y_train, X_val, y_val, X_test, y_test, model_name, epochs=100, batch_size=16):
    """Train a compiled Keras model with early stopping and score it on test data.

    Side effects: writes TensorBoard logs under ``logs/<model_name>/<timestamp>``,
    a CSV training log ``<model_name.lower()>_training.log`` and a checkpoint
    ``best_model_<model_name.lower()>.keras`` in the working directory.

    Args:
        model: An already compiled Keras model.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation data monitored through ``val_loss``.
        X_test, y_test: Held-out data used for the reported metrics.
        model_name: Label used for log messages and output file names.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size passed to ``fit``.

    Returns:
        Dict with ``'metrics'`` (computed on the flattened predictions) and
        ``'y_pred'`` (the array exactly as returned by ``model.predict``).
    """
    logger.info(f"Training {model_name}...")
    log_dir = os.path.join("logs", model_name, datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

    # restore_best_weights: evaluate the epoch with the lowest val_loss rather
    # than the weights left after `patience` further non-improving epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)
    history = History()
    csv_logger = CSVLogger(f'{model_name.lower()}_training.log')
    # save_best_only: without it the checkpoint is rewritten after every epoch,
    # so the "best_model_*" file would simply hold the most recent epoch.
    model_checkpoint = ModelCheckpoint(
        f'best_model_{model_name.lower()}.keras',
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    )

    model.fit(
        X_train, y_train,
        epochs=epochs,
        validation_data=(X_val, y_val),
        verbose=1,
        callbacks=[early_stopping, history, csv_logger, model_checkpoint, tensorboard_callback],
        batch_size=batch_size
    )

    y_pred = model.predict(X_test)
    # Flatten before scoring so the predictions line up with the 1-D y_test.
    metrics = calculate_metrics(y_test, y_pred.flatten())

    logger.info(f"{model_name} metrics: {metrics}")

    return {
        'metrics': metrics,
        'y_pred': y_pred,
    }

## <a id='toc1_5_'></a>[Run All Models Across All Pairs](#toc0_)

In [None]:
def run_models_on_pair(pair):
    """Train and evaluate every model on a single ticker pair.

    Builds the pair's dataset, fits four tabular regressors on the raw
    features, then standardises the features (scaler fitted on the training
    split only) and trains three LSTM architectures.

    Args:
        pair: Two-element sequence ``(ticker_0, ticker_1)``.

    Returns:
        ``(model_results, beta)`` where ``model_results`` maps a model name to
        its ``{'metrics', 'y_pred'}`` dict (LSTM entries additionally carry
        ``'X_test_og'``, the unscaled test features) and ``beta`` is the
        spread slope returned by ``create_dataset``.
    """
    logger.info(f"Processing pair: {pair}")
    pair_df = preprocess_pair_data(pair)
    train_data, val_data, test_data, beta = create_dataset(pair_df, look_back=LOOK_BACK, target_col="zscore_spread")
    X_train, y_train, X_val, y_val, X_test, y_test = prepare_dataset(train_data, val_data, test_data)

    model_results = {}

    # Traditional ML models
    base_models = [
        LinearRegression(),
        GradientBoostingRegressor(random_state=42),
        RandomForestRegressor(n_estimators=100, random_state=42),
        LGBMRegressor(random_state=42, verbose=-1),
    ]

    for model in tqdm(base_models, desc="Training traditional ML models"):
        model_name = model.__class__.__name__
        model_results[model_name] = train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name)

    # Prepare data for LSTM models
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # NOTE(review): this reshape splits each row's feature vector into
    # LOOK_BACK equal chunks; it does not stack LOOK_BACK consecutive rows,
    # and it raises if the feature count is not divisible by LOOK_BACK.
    # Confirm this is the intended "sequence" layout.
    X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], LOOK_BACK, -1))
    X_val_lstm = X_val_scaled.reshape((X_val_scaled.shape[0], LOOK_BACK, -1))
    X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], LOOK_BACK, -1))

    # LSTM models
    # (The original cell contained a duplicated, unterminated fragment of the
    # 'Vanilla LSTM' entry here, which made the cell a syntax error.)
    lstm_models = {
        'Vanilla LSTM': Sequential([
            LSTM(50, return_sequences=True, input_shape=(LOOK_BACK, X_train_lstm.shape[2])),
            LSTM(50),
            Dense(1)
        ]),
        'LSTM': Sequential([
            LSTM(150, return_sequences=True, input_shape=(LOOK_BACK, X_train_lstm.shape[2])),
            Dropout(0.4),
            LSTM(150),
            Dropout(0.3),
            Dense(200, activation='relu'),
            Dense(1, activation='linear')
        ]),
        'BiLSTM': Sequential([
            Bidirectional(LSTM(100, return_sequences=True), input_shape=(LOOK_BACK, X_train_lstm.shape[2])),
            Dropout(0.2),
            Bidirectional(LSTM(100, return_sequences=True)),
            Dropout(0.2),
            Bidirectional(LSTM(50)),
            Dropout(0.2),
            Dense(50, activation='relu'),
            Dense(1)
        ])
    }

    for model_name, model in tqdm(lstm_models.items(), desc="Training LSTM models"):
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mae', metrics=['mae', 'mse'])
        model_results[model_name] = train_and_evaluate_lstm(
            model, X_train_lstm, y_train, X_val_lstm, y_val, X_test_lstm, y_test, model_name
        )
        # Keep the unscaled test features next to the LSTM predictions.
        model_results[model_name]['X_test_og'] = X_test

    logger.info(f"Completed processing pair: {pair}")
    return model_results, beta

# Run for all pairs
# Maps (ticker_0, ticker_1) -> per-model results dict, plus a 'beta' entry.
ticker_results = {}

for pair in tqdm(pairs, desc="Processing pairs"):
    logger.info(f"Starting processing for pair: {pair}")
    try:
        pair_results, beta = run_models_on_pair(pair)
    except Exception as e:
        # Best-effort batch run: a failing pair is skipped, but the full
        # traceback is logged so the cause can be diagnosed afterwards.
        logger.exception(f"Error processing pair {pair}: {str(e)}")
        continue
    pair_results['beta'] = beta
    ticker_results[tuple(pair)] = pair_results
    logger.info(f"Successfully processed pair: {pair}")

# Save results
logger.info("Saving results...")
with open('all_models_all_tickers.pkl', 'wb') as f:
    pickle.dump(ticker_results, f)

logger.info("All models have been trained and evaluated. Results saved in 'all_models_all_tickers.pkl'.")

## <a id='toc1_6_'></a>[Results Analysis](#toc0_)

In [None]:
def run_models_on_pair(pair):
    """Fit and score all tabular and LSTM models for one ticker pair.

    NOTE(review): this cell redefines ``run_models_on_pair`` from the
    previous section with the same logic.

    Args:
        pair: Two-element sequence ``(ticker_0, ticker_1)``.

    Returns:
        ``(model_results, beta)``: a dict keyed by model name holding each
        model's ``{'metrics', 'y_pred'}`` (plus ``'X_test_og'`` for the LSTM
        variants), and the spread slope from ``create_dataset``.
    """
    logger.info(f"Processing pair: {pair}")
    splits = create_dataset(preprocess_pair_data(pair), look_back=LOOK_BACK, target_col="zscore_spread")
    train_data, val_data, test_data, beta = splits
    X_train, y_train, X_val, y_val, X_test, y_test = prepare_dataset(train_data, val_data, test_data)

    model_results = {}

    # --- Tabular regressors, trained on the unscaled features -------------
    regressors = [
        LinearRegression(),
        GradientBoostingRegressor(random_state=42),
        RandomForestRegressor(n_estimators=100, random_state=42),
        LGBMRegressor(random_state=42, verbose=-1),
    ]
    for regressor in tqdm(regressors, desc="Training traditional ML models"):
        label = type(regressor).__name__
        model_results[label] = train_and_evaluate_model(regressor, X_train, y_train, X_test, y_test, label)

    # --- Sequence inputs: standardise with training statistics only --------
    scaler = StandardScaler()
    scaler.fit(X_train)

    def _as_sequences(features):
        # NOTE(review): the reshape slices the feature axis into LOOK_BACK
        # chunks rather than stacking consecutive rows — confirm intent.
        scaled = scaler.transform(features)
        return scaled.reshape((scaled.shape[0], LOOK_BACK, -1))

    X_train_lstm = _as_sequences(X_train)
    X_val_lstm = _as_sequences(X_val)
    X_test_lstm = _as_sequences(X_test)
    seq_shape = (LOOK_BACK, X_train_lstm.shape[2])

    # --- LSTM architectures (all built before any of them is trained) ------
    lstm_models = {}
    lstm_models['Vanilla LSTM'] = Sequential([
        LSTM(50, return_sequences=True, input_shape=seq_shape),
        LSTM(50),
        Dense(1),
    ])
    lstm_models['LSTM'] = Sequential([
        LSTM(150, return_sequences=True, input_shape=seq_shape),
        Dropout(0.4),
        LSTM(150),
        Dropout(0.3),
        Dense(200, activation='relu'),
        Dense(1, activation='linear'),
    ])
    lstm_models['BiLSTM'] = Sequential([
        Bidirectional(LSTM(100, return_sequences=True), input_shape=seq_shape),
        Dropout(0.2),
        Bidirectional(LSTM(100, return_sequences=True)),
        Dropout(0.2),
        Bidirectional(LSTM(50)),
        Dropout(0.2),
        Dense(50, activation='relu'),
        Dense(1),
    ])

    for label, network in tqdm(lstm_models.items(), desc="Training LSTM models"):
        network.compile(optimizer=Adam(learning_rate=0.001), loss='mae', metrics=['mae', 'mse'])
        outcome = train_and_evaluate_lstm(
            network, X_train_lstm, y_train, X_val_lstm, y_val, X_test_lstm, y_test, label
        )
        # Store the unscaled test features alongside the predictions.
        outcome['X_test_og'] = X_test
        model_results[label] = outcome

    logger.info(f"Completed processing pair: {pair}")
    return model_results, beta

# Run for all pairs
# NOTE(review): this cell repeats the run-all cell from the previous section;
# executing both retrains every model and rewrites the same pickle file.
ticker_results = {}

for pair in tqdm(pairs, desc="Processing pairs"):
    logger.info(f"Starting processing for pair: {pair}")
    try:
        pair_results, beta = run_models_on_pair(pair)
    except Exception as e:
        # Skip the failing pair but keep the traceback in the log; str(e)
        # alone does not show where the failure happened.
        logger.exception(f"Error processing pair {pair}: {str(e)}")
    else:
        pair_results['beta'] = beta
        ticker_results[tuple(pair)] = pair_results
        logger.info(f"Successfully processed pair: {pair}")

# Save results
logger.info("Saving results...")
with open('all_models_all_tickers.pkl', 'wb') as f:
    pickle.dump(ticker_results, f)

logger.info("All models have been trained and evaluated. Results saved in 'all_models_all_tickers.pkl'.")

In [None]:
# NOTE(review): in the visible code `best_model` is only bound as a local
# inside perform_hyperparameter_tuning(); unless it was assigned at module
# scope elsewhere, evaluating it here raises NameError.
best_model