<hr>
<br>
<br>
<br>
<h1><center>Predicting Financial Markets with Machine Learning      </center></h1>
<h1><center>-      </center></h1>
<h2><center>Non-Linear Models      </center></h2>
<br>
<br>
<hr>
<br>

<br>
<br>
<h2>Purpose</h2>
<br>
<hr>
A notebook to develop an AI system aiming at trading intraday on cryptocurrencies
<br>
<br>

<br>
<br>
<h2>Imports</h2>
<br>
<hr>
<br>

In [1]:
# Pandas and Python
import pandas as pd

pd.options.display.float_format = "{:.4f}".format
import numpy as np
from tqdm import tqdm
import os
import time
from functools import partial
from multiprocessing import Pool
import sys

# Graphic Libraries
import plotly.io as pio

pio.templates.default = "simple_white"
pd.options.plotting.backend = "plotly"
import matplotlib as plt
from IPython.display import display, HTML, clear_output


# AI and stats
import statsmodels.api as sm
import xgboost
from xgboost import XGBRegressor
import torch
import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

<br>
<br>
<h2>Notebook Parameters</h2>
<br>
<hr>
<br>

In [5]:
# Define data path
data_path = "data/"

# Risk free rate assumption
risk_free_rate = 0.05  # % per year
rfr_hourly = (1 + risk_free_rate) ** (1 / (24 * 365)) - 1

# Suggested training set
start_date_train = "2023-01-24"
last_date_train = "2024-01-24"

# Suggested validation set
start_date_validate = "2024-01-25"
last_date_validate = "2024-07-24"

# Test set (Unavailable)
# start_date_test = "2024-07-25"
# last_date_test = "2025-01-24"

# Maximum number of features to use
max_nb_features = 50

# Set a level of transaction costs
tc = 0.0000


<br>
<br>
<h2>Data Loading</h2>
<br>
<hr>
<br>

In [6]:
# Main data
data = pd.read_csv(
    f"{data_path}data_in_sample.csv",
    index_col=0,
    header=[0, 1],
)

# Make sure that the index is in the right format
data.index = pd.to_datetime(data.index)


In [7]:
# Load pre-processed features
features = {}
for dirpath, dirnames, filenames in os.walk(data_path):
    for filename in filenames[-max_nb_features:]:
        if "feature" in filename:
            print(f"Loading {filename}")

            # Load feature
            feature = pd.read_csv(
                f"{data_path}{filename}",
                index_col=0,
                header=[0],
            )

            # Make sure that the index is in the right format
            feature.index = pd.to_datetime(feature.index)

            # Store in the feature dict
            features[filename.replace(".csv", "")] = feature

Loading feature_836573281383.csv
Loading feature_838590701664.csv
Loading feature_838899441076.csv
Loading feature_839336425189.csv
Loading feature_840341828987.csv
Loading feature_844025771320.csv
Loading feature_847894150363.csv
Loading feature_854741339252.csv
Loading feature_856641545563.csv
Loading feature_858306317837.csv
Loading feature_859096531804.csv
Loading feature_867578514787.csv
Loading feature_881677815989.csv
Loading feature_884536962935.csv
Loading feature_885225954118.csv
Loading feature_887251521296.csv
Loading feature_889641854357.csv
Loading feature_890728760526.csv
Loading feature_902830881058.csv
Loading feature_903557802651.csv
Loading feature_906048433887.csv
Loading feature_906113031314.csv
Loading feature_907679094852.csv
Loading feature_909460334104.csv
Loading feature_914682300606.csv
Loading feature_929491324974.csv
Loading feature_936020771498.csv
Loading feature_936377278430.csv
Loading feature_938545158299.csv
Loading feature_941272397662.csv
Loading fe

In [8]:
list(features.keys())

['feature_836573281383',
 'feature_838590701664',
 'feature_838899441076',
 'feature_839336425189',
 'feature_840341828987',
 'feature_844025771320',
 'feature_847894150363',
 'feature_854741339252',
 'feature_856641545563',
 'feature_858306317837',
 'feature_859096531804',
 'feature_867578514787',
 'feature_881677815989',
 'feature_884536962935',
 'feature_885225954118',
 'feature_887251521296',
 'feature_889641854357',
 'feature_890728760526',
 'feature_902830881058',
 'feature_903557802651',
 'feature_906048433887',
 'feature_906113031314',
 'feature_907679094852',
 'feature_909460334104',
 'feature_914682300606',
 'feature_929491324974',
 'feature_936020771498',
 'feature_936377278430',
 'feature_938545158299',
 'feature_941272397662',
 'feature_950281828683',
 'feature_950691475264',
 'feature_952144189258',
 'feature_955845031686',
 'feature_961547632365',
 'feature_962177675988',
 'feature_962320961442',
 'feature_965788928669',
 'feature_977512281584',
 'feature_983617492082',


<br>
<br>
<h2>Analytics</h2>
<br>
<hr>
Basic Portfolio analytics to invest in some predictions of the future instruments returns
<br>
<br>


In [9]:
def expected_returns_to_positions(expected_returns):
    """
    Normalize expected returns to make it an investable portfolio

    :param expected_returns: pd.DataFrame containing expectations
                             about future instruments prices variations
    """

    # Positions will be proportional to ranked alpha
    positions = expected_returns.rank(axis=1)

    # Re-scale the leverage
    positions = positions.div(positions.abs().sum(axis=1), axis=0)

    # Make the portfolio dollar neutral
    positions = positions.sub(positions.mean(axis=1), axis=0)

    return positions


def get_sharpe(pnl_portfolio, rfr_hourly):
    """
    Compute the sharpe ratio

    :param pnl_portfolio: pd.Series of returns of the portfolio considered
    :param rfr_hourly: float, the hourly risk free rate
    """

    # Compute excess returns
    excess_returns = pnl_portfolio - rfr_hourly

    # Compute sharpe ratio
    std_dev = excess_returns.std()
    if std_dev == 0:
        sharpe_ratio = np.nan  # Assign NaN instead of inf
    else:
        sharpe_ratio = excess_returns.mean() / std_dev * np.sqrt(252)

    # Output
    return round(sharpe_ratio, 2)


def pnl_analytics(positions, returns, rfr_hourly, lag, tc=0):
    """
    Compute the p&l analytics of the strategy

    :param positions: pd.DataFrame, some positions that have been reached
    :param returns: pd.DataFrame containing returns of instruments
    :param rfr_hourly: float, the hourly risk free rate
    :param lag: int, the number of hours to reach the positions
    :param tc: float, the transaction costs

    """

    # Compute gross p&l
    pnl = positions.shift(1 + lag).mul(returns).sum(axis=1)

    # Compute transaction costs
    trades = positions.fillna(0).diff()
    costs = trades.abs().sum(axis=1) * tc

    # Net p&l: deduce costs from gross p&l
    pnl = pnl.sub(costs, fill_value=0)

    # Compute sharpe
    sharpe = get_sharpe(pnl, rfr_hourly)

    return {"sharpe": sharpe, "pnl": pnl}


def analyze_expected_returns(
    expected_returns,
    returns,
    rfr_hourly,
    title="a Nice Try",
    lags=[0, 1, 2, 3, 6, 12],
    tc=0,
    output_sharpe=False,
    display_results=True,
):
    """
    Provide an economic analysis of some expected_returns

    :param expected_returns: pd.DataFrame containing expectations
                             about future instruments prices variations
    """

    # Take positions as a function of expected returns
    positions = expected_returns_to_positions(expected_returns)

    # Compute p&l and sharpe for different lags
    pnl_lags = {}
    for lag in lags:
        analytics_lag = pnl_analytics(
            positions=positions, returns=returns, rfr_hourly=rfr_hourly, lag=lag, tc=tc
        )
        lag_label = f"Lag {lag}, sharpe={analytics_lag['sharpe']}"
        pnl_lags[lag_label] = analytics_lag["pnl"]

    # Display returns
    pnl_lags = pd.concat(pnl_lags, axis=1).dropna()
    if display_results:
        fig = (
            (1 + pnl_lags)
            .cumprod()
            .plot(
                title=f"Cumulative returns of {title}",
            )
        )
        fig.update_layout(yaxis_type="log")
        fig.show()

    if output_sharpe:
        for lag_label in pnl_lags.columns:
            if "Lag 0" in lag_label:
                return lag_label.split("sharpe=")[-1]


<br>
<br>
<h2>Features Standard Pre-Processing</h2>
<br>
<hr>

<br>


In [10]:
label = data["return"].loc[start_date_train:last_date_train].shift(-1).stack()

In [11]:
features_normalized = {}

for feature_name in features.keys():
    print(f"Processing {feature_name}")

    # Extract the feature
    feature_normalized = features[feature_name]

    # Rank the feature to remove outliers
    feature_normalized = feature_normalized.rank(axis=1, pct=True) - 0.5

    # Stack the feature
    feature_normalized = feature_normalized.stack().sort_index()

    # Store this normalized version
    features_normalized[feature_name] = feature_normalized

# Convert normalized features dict to a single dataframe
features_normalized = pd.concat(features_normalized, axis=1)

# Replace NaNs by average values, as OLS cannot handle NaNs effectively
features_normalized = features_normalized.fillna(0)

Processing feature_836573281383
Processing feature_838590701664
Processing feature_838899441076
Processing feature_839336425189
Processing feature_840341828987
Processing feature_844025771320
Processing feature_847894150363
Processing feature_854741339252
Processing feature_856641545563
Processing feature_858306317837
Processing feature_859096531804
Processing feature_867578514787
Processing feature_881677815989
Processing feature_884536962935
Processing feature_885225954118
Processing feature_887251521296
Processing feature_889641854357
Processing feature_890728760526
Processing feature_902830881058
Processing feature_903557802651
Processing feature_906048433887
Processing feature_906113031314
Processing feature_907679094852
Processing feature_909460334104
Processing feature_914682300606
Processing feature_929491324974
Processing feature_936020771498
Processing feature_936377278430
Processing feature_938545158299
Processing feature_941272397662
Processing feature_950281828683
Processi

In [12]:
import gc

gc.collect()

617

<br>
<br>
<h2>XGBoost: Gradient Boosted Decision Trees</h2>
<br>
<hr>
Gradient Boosted Decision Trees are another way to introduce non-linearity in our model. This non-linearity is present in the link between the label and features, but also among the features themselves. Overfitting is limited thanks to a variety of strategies, resulting in potentially better generalization.
<br>
<br>


<br>
<h4>Defining hyper-parameters</h4>
<br>



In [23]:
# Define hyperparameters
hyperparameters = {
    "learning_rate": 0.001,
    "n_estimators": 500,
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "base_score": 0,
    "max_depth": 7,
    "min_child_weight": 10,
    "subsample": 0.05,
    "colsample_bytree": 0.3,
    "min_split_loss": 0,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "n_jobs": -1,
    "random_state": 0,
}

<br>
<h4>Training the models</h4>
<br>

In [24]:
# # Measure time
# t1 = time.time()

# # Recompute the model every month, skip the first 2 months
# rebalancing_dates = pd.date_range(
#     start=start_date_train, end=last_date_validate, freq="ME"
# )[2:]


# def train_predict_period(
#     last_date_train_fold,
#     returns,
#     hyperparameters,
# ):
#     # Define training and validation dates
#     start_date_train_fold = last_date_train_fold - pd.Timedelta(days=30 * 12)
#     start_date_validate_fold = last_date_train_fold + pd.Timedelta(days=1)
#     last_date_validate_fold = last_date_train_fold + pd.Timedelta(days=31 * 1)

#     # Log progress
#     print(f"Training from {start_date_train_fold} to {last_date_train_fold}")
#     print(f"Predicting from {start_date_validate_fold} to {last_date_validate_fold}\n")
#     sys.stdout.flush()

#     # Create label
#     label_fold = (
#         returns.loc[start_date_train_fold:last_date_train_fold].shift(-1).stack()
#     )

#     # Select features for training and validation
#     features_normalized_train_fold = features_normalized.reindex(label_fold.index)
#     features_normalized_validate_fold = features_normalized.sort_index().loc[
#         start_date_validate_fold:last_date_validate_fold
#     ]

#     # Create XGBoost model
#     model = XGBRegressor(**hyperparameters)

#     # Fit the model
#     model.fit(y=label_fold, X=features_normalized_train_fold)

#     # Make predictions
#     predictions = model.predict(features_normalized_validate_fold)
#     predictions = pd.Series(
#         predictions, index=features_normalized_validate_fold.index
#     ).unstack()

#     # Log completion
#     print(f"Finished processing {last_date_train_fold}\n")
#     sys.stdout.flush()

#     # Return results
#     return pd.concat({str(last_date_train_fold): predictions}, axis=1)


# # # Fix all but one function parameters to iterate on the last one
# # partial_train_predict_period = partial(
# #     train_predict_period,
# #     returns=data["return"],
# #     hyperparameters=hyperparameters,
# # )

# # # Train using one core per date
# # with Pool(16) as pool:
# #     predictions = pool.map(
# #         partial_train_predict_period,  # function to multiprocess
# #         rebalancing_dates,  # values to iterate on
# #     )

# # Parallel execution using Joblib
# predictions = Parallel(n_jobs=num_workers)(
#     delayed(train_predict_period)(date, data["return"], hyperparameters)
#     for date in rebalancing_dates
# )

# # Reformat predictions
# predictions = pd.concat(predictions, axis=1).T.groupby(level=1).sum().T


# # Training finished, print time used for it
# t2 = time.time()
# print(f"Total Training time is {t2 - t1} seconds")


In [25]:
# Measure time
t1 = time.time()

# Recompute the model every month, skipping the first 2 months
rebalancing_dates = pd.date_range(
    start=start_date_train, end=last_date_validate, freq="ME"
)[2:]

# Store predictions
all_predictions = []

for last_date_train_fold in rebalancing_dates:
    try:
        # Define training and validation dates
        start_date_train_fold = last_date_train_fold - pd.Timedelta(days=30 * 12)
        start_date_validate_fold = last_date_train_fold + pd.Timedelta(days=1)
        last_date_validate_fold = last_date_train_fold + pd.Timedelta(days=31 * 1)

        # Log progress
        print(f"Training from {start_date_train_fold} to {last_date_train_fold}")
        print(
            f"Predicting from {start_date_validate_fold} to {last_date_validate_fold}\n"
        )
        sys.stdout.flush()

        # Create label
        label_fold = (
            data["return"]
            .loc[start_date_train_fold:last_date_train_fold]
            .shift(-1)
            .stack()
        )

        # Select features for training and validation
        features_normalized_train_fold = features_normalized.reindex(label_fold.index)
        features_normalized_validate_fold = features_normalized.sort_index().loc[
            start_date_validate_fold:last_date_validate_fold
        ]

        # Create XGBoost model
        model = XGBRegressor(**hyperparameters)

        # Fit the model
        model.fit(y=label_fold, X=features_normalized_train_fold)

        # Make predictions
        predictions = model.predict(features_normalized_validate_fold)
        predictions = pd.Series(
            predictions, index=features_normalized_validate_fold.index
        ).unstack()

        # Store results
        all_predictions.append(
            pd.concat({str(last_date_train_fold): predictions}, axis=1)
        )

        # Log completion
        print(f"✅ Finished processing {last_date_train_fold}\n")
        sys.stdout.flush()

    except Exception as e:
        print(f"❌ Error in {last_date_train_fold}: {e}")
        sys.stdout.flush()

# Reformat predictions
if all_predictions:
    predictions = pd.concat(all_predictions, axis=1).T.groupby(level=1).sum().T
else:
    predictions = None

# Measure time
t2 = time.time()
print(f"Total Training time: {t2 - t1:.2f} seconds")


Training from 2022-04-05 00:00:00 to 2023-03-31 00:00:00
Predicting from 2023-04-01 00:00:00 to 2023-05-01 00:00:00

✅ Finished processing 2023-03-31 00:00:00

Training from 2022-05-05 00:00:00 to 2023-04-30 00:00:00
Predicting from 2023-05-01 00:00:00 to 2023-05-31 00:00:00

✅ Finished processing 2023-04-30 00:00:00

Training from 2022-06-05 00:00:00 to 2023-05-31 00:00:00
Predicting from 2023-06-01 00:00:00 to 2023-07-01 00:00:00

✅ Finished processing 2023-05-31 00:00:00

Training from 2022-07-05 00:00:00 to 2023-06-30 00:00:00
Predicting from 2023-07-01 00:00:00 to 2023-07-31 00:00:00

✅ Finished processing 2023-06-30 00:00:00

Training from 2022-08-05 00:00:00 to 2023-07-31 00:00:00
Predicting from 2023-08-01 00:00:00 to 2023-08-31 00:00:00

✅ Finished processing 2023-07-31 00:00:00

Training from 2022-09-05 00:00:00 to 2023-08-31 00:00:00
Predicting from 2023-09-01 00:00:00 to 2023-10-01 00:00:00

✅ Finished processing 2023-08-31 00:00:00

Training from 2022-10-05 00:00:00 to 202

In [13]:
# import matplotlib.pyplot as plt


# def analyze_expected_returns_fixed(
#     expected_returns,
#     returns,
#     rfr_hourly,
#     title="a Nice Try",
#     lags=[0, 1, 2, 3, 6, 12],
#     tc=0,
#     output_sharpe=False,
#     display_results=True,
# ):
#     positions = expected_returns_to_positions(expected_returns)
#     pnl = pnl_analytics(positions, returns, rfr_hourly, lag=1)["pnl"]
#     print(pnl)
#     # Convert cumulative returns to Matplotlib
#     plt.figure(figsize=(10, 5))
#     (1 + pnl).cumprod().plot()
#     plt.yscale("log")
#     plt.title(title)
#     plt.xlabel("Time")
#     plt.ylabel("Cumulative Returns")
#     plt.grid(True)
#     plt.show()


In [26]:
# Analyse our predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Gradient Boosted Trees, Walk-Forward Cross-Validation, Validation Set",
    lags=[0, 1, 2, 3, 6, 12],
    tc=tc,
)

In [17]:
import nbformat

nbformat.version_info

(5, 10, 4)

In [27]:
# Measure time
t1 = time.time()

# Define number of models for bagging
NUM_MODELS = 10


In [28]:
from sklearn.ensemble import BaggingRegressor


In [31]:
# Recompute the model every month, skipping the first 2 months
rebalancing_dates = pd.date_range(
    start=start_date_train, end=last_date_validate, freq="ME"
)[2:]

# Store predictions
all_predictions = []

for last_date_train_fold in rebalancing_dates:
    try:
        # Define training and validation dates
        start_date_train_fold = last_date_train_fold - pd.Timedelta(days=30 * 12)
        start_date_validate_fold = last_date_train_fold + pd.Timedelta(days=1)
        last_date_validate_fold = last_date_train_fold + pd.Timedelta(days=31 * 1)

        # Log progress
        print(f"Training from {start_date_train_fold} to {last_date_train_fold}")
        print(
            f"Predicting from {start_date_validate_fold} to {last_date_validate_fold}\n"
        )
        sys.stdout.flush()

        # Create label
        label_fold = (
            data["return"]
            .loc[start_date_train_fold:last_date_train_fold]
            .shift(-1)
            .stack()
        )

        # Select features for training and validation
        features_normalized_train_fold = features_normalized.reindex(label_fold.index)
        features_normalized_validate_fold = features_normalized.sort_index().loc[
            start_date_validate_fold:last_date_validate_fold
        ]

        # Define Base Model (XGBoost)
        base_xgb = XGBRegressor(**hyperparameters)

        # Define Bagging Regressor with XGBoost as Base Estimator
        bagged_model = BaggingRegressor(
            estimator=base_xgb,
            n_estimators=NUM_MODELS,  # Number of bootstrapped models
            max_samples=0.8,  # Each model is trained on 80% of the data
            max_features=0.8,  # Each model sees only 80% of the features
            bootstrap=True,  # Sample with replacement
            random_state=42,
            n_jobs=-1,  # Use all available cores
        )

        # Train the bagged model
        bagged_model.fit(X=features_normalized_train_fold, y=label_fold)

        # Make predictions
        predictions = bagged_model.predict(features_normalized_validate_fold)
        predictions = pd.Series(
            predictions, index=features_normalized_validate_fold.index
        ).unstack()

        # Store results
        all_predictions.append(
            pd.concat({str(last_date_train_fold): predictions}, axis=1)
        )

        # Log completion
        print(f"✅ Finished processing {last_date_train_fold}\n")
        sys.stdout.flush()

    except Exception as e:
        print(f"❌ Error in {last_date_train_fold}: {e}")
        sys.stdout.flush()

# Reformat predictions
if all_predictions:
    predictions = pd.concat(all_predictions, axis=1).T.groupby(level=1).sum().T
else:
    predictions = None

# Measure time
t2 = time.time()
print(f"Total Training time: {t2 - t1:.2f} seconds")


Training from 2022-04-05 00:00:00 to 2023-03-31 00:00:00
Predicting from 2023-04-01 00:00:00 to 2023-05-01 00:00:00

✅ Finished processing 2023-03-31 00:00:00

Training from 2022-05-05 00:00:00 to 2023-04-30 00:00:00
Predicting from 2023-05-01 00:00:00 to 2023-05-31 00:00:00

✅ Finished processing 2023-04-30 00:00:00

Training from 2022-06-05 00:00:00 to 2023-05-31 00:00:00
Predicting from 2023-06-01 00:00:00 to 2023-07-01 00:00:00

✅ Finished processing 2023-05-31 00:00:00

Training from 2022-07-05 00:00:00 to 2023-06-30 00:00:00
Predicting from 2023-07-01 00:00:00 to 2023-07-31 00:00:00

✅ Finished processing 2023-06-30 00:00:00

Training from 2022-08-05 00:00:00 to 2023-07-31 00:00:00
Predicting from 2023-08-01 00:00:00 to 2023-08-31 00:00:00

✅ Finished processing 2023-07-31 00:00:00

Training from 2022-09-05 00:00:00 to 2023-08-31 00:00:00
Predicting from 2023-09-01 00:00:00 to 2023-10-01 00:00:00

✅ Finished processing 2023-08-31 00:00:00

Training from 2022-10-05 00:00:00 to 202

In [32]:
# Analyse our predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Bagged XGBoost, Walk-Forward Cross-Validation, Validation Set",
    lags=[0, 1, 2, 3, 6, 12],
    tc=tc,
)


# LSTM

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


In [15]:
# Check for GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using {device}")

Using cuda


In [18]:
# LSTM Model from PyTorch
# Define LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers, batch_first=True, dropout=0.2
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        predictions = self.fc(lstm_out[:, -1, :])  # Take last time step output
        return predictions


# Function to create sequences for LSTM
def create_sequences(features, labels, seq_length=10):
    X, y = [], []
    for i in range(len(features) - seq_length):
        if i + seq_length >= len(labels):
            break
        X.append(
            features.iloc[i : i + seq_length].values
        )  # Use .iloc[] for safe slicing
        y.append(labels.iloc[i + seq_length])

    return np.array(X), np.array(y)


In [19]:
# Recompute the model every month, skipping the first 2 months
rebalancing_dates = pd.date_range(
    start=start_date_train, end=last_date_validate, freq="ME"
)[2:]

# Store predictions
all_predictions = []

for last_date_train_fold in rebalancing_dates:
    try:
        # Define training and validation dates
        start_date_train_fold = last_date_train_fold - pd.Timedelta(days=30 * 12)
        start_date_validate_fold = last_date_train_fold + pd.Timedelta(days=1)
        last_date_validate_fold = last_date_train_fold + pd.Timedelta(days=31 * 1)

        # Log progress
        print(f"Training from {start_date_train_fold} to {last_date_train_fold}")
        print(
            f"Predicting from {start_date_validate_fold} to {last_date_validate_fold}\n"
        )
        sys.stdout.flush()

        # Create label
        label_fold = (
            data["return"]
            .loc[start_date_train_fold:last_date_train_fold]
            .shift(-1)
            .stack()
        )

        # Select features for training and validation
        features_train_fold = features_normalized.reindex(label_fold.index)
        features_validate_fold = features_normalized.sort_index().loc[
            start_date_validate_fold:last_date_validate_fold
        ]

        # Convert to LSTM sequences
        seq_length = 10  # Number of time steps to look back
        X_train, y_train = create_sequences(features_train_fold, label_fold, seq_length)
        X_val, y_val = create_sequences(features_validate_fold, label_fold, seq_length)

        # Convert to PyTorch tensors
        X_train, y_train = (
            torch.tensor(X_train, dtype=torch.float32).to(device),
            torch.tensor(y_train, dtype=torch.float32).to(device),
        )
        X_val, y_val = (
            torch.tensor(X_val, dtype=torch.float32).to(device),
            torch.tensor(y_val, dtype=torch.float32).to(device),
        )

        # Define DataLoader
        train_loader = DataLoader(
            TensorDataset(X_train, y_train), batch_size=32, shuffle=True
        )

        # Define Model
        input_dim = X_train.shape[2]  # Number of features
        hidden_dim = 64
        num_layers = 2
        output_dim = 1

        model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Train Model
        num_epochs = 10
        for epoch in range(num_epochs):
            model.train()
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                y_pred = model(X_batch).squeeze()
                loss = criterion(y_pred, y_batch)
                loss.backward()
                optimizer.step()

            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

        # Predict on validation set
        model.eval()
        with torch.no_grad():
            predictions = model(X_val).cpu().numpy().flatten()

        # Convert predictions to DataFrame
        predictions_df = pd.Series(
            predictions, index=features_validate_fold.index[seq_length:]
        ).unstack()

        # Store results
        all_predictions.append(
            pd.concat({str(last_date_train_fold): predictions_df}, axis=1)
        )

        # Log completion
        print(f"✅ Finished processing {last_date_train_fold}\n")
        sys.stdout.flush()

    except Exception as e:
        print(f"❌ Error in {last_date_train_fold}: {e}")
        sys.stdout.flush()

# Reformat predictions
if all_predictions:
    predictions = pd.concat(all_predictions, axis=1).T.groupby(level=1).sum().T
else:
    predictions = None

# Measure time
t2 = time.time()
print(f"Total Training time: {t2 - t1:.2f} seconds")


Training from 2022-04-05 00:00:00 to 2023-03-31 00:00:00
Predicting from 2023-04-01 00:00:00 to 2023-05-01 00:00:00

Epoch 1/10, Loss: 0.0000
Epoch 2/10, Loss: 0.0001
Epoch 3/10, Loss: 0.0001
Epoch 4/10, Loss: 0.0000
Epoch 5/10, Loss: 0.0000
Epoch 6/10, Loss: 0.0002
Epoch 7/10, Loss: 0.0003
Epoch 8/10, Loss: 0.0001
Epoch 9/10, Loss: 0.0000
Epoch 10/10, Loss: 0.0001
❌ Error in 2023-03-31 00:00:00: CUDA out of memory. Tried to allocate 14.08 GiB. GPU 0 has a total capacity of 8.00 GiB of which 4.33 GiB is free. Of the allocated memory 2.36 GiB is allocated by PyTorch, and 220.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Training from 2022-05-05 00:00:00 to 2023-04-30 00:00:00
Predicting from 2023-05-01 00:00:00 to 2023-05-31 00:00:00



KeyboardInterrupt: 