<hr>
<br>
<br>
<br>
<h1><center>Predicting Financial Markets with Machine Learning      </center></h1>
<h1><center>-      </center></h1>
<h2><center>Regularization & Cross-validation      </center></h2>
<br>
<br>
<hr>
<br>

<br>
<br>
<h2>Purpose</h2>
<br>
<hr>
A notebook to develop an AI system for intraday trading of cryptocurrencies
<br>
<br>

<br>
<br>
<h2>Imports</h2>
<br>
<hr>
<br>

In [None]:
# Pandas and Python
import pandas as pd
pd.options.display.float_format = '{:.4f}'.format
import numpy as np
from tqdm import tqdm
import os

# Graphic Libraries
import plotly.io as pio
pio.templates.default = "simple_white"
pd.options.plotting.backend = "plotly"
import matplotlib as plt
from IPython.display import display, HTML, clear_output


# AI and stats
import statsmodels.api as sm
import xgboost
import torch
import sklearn




<br>
<br>
<h2>Notebook Parameters</h2>
<br>
<hr>
<br>

In [None]:
# Define data path
# NOTE(review): absolute, machine-specific path — adjust to the local data location
data_path = "/home/tbarrau/notebooks/HEC_Course/data_students/in_sample/"

# Risk free rate assumption
risk_free_rate = 0.05 # annual rate as a decimal fraction (0.05 = 5% per year)
# De-annualize by compounding over the 24 * 365 hours of a year
rfr_hourly = (1 + risk_free_rate)**(1 / (24*365)) - 1

# Suggested training set
start_date_train = "2023-01-24"
last_date_train = "2024-01-24"

# Suggested validation set
start_date_validate = "2024-01-25"
last_date_validate = "2024-07-24"

# Test set (Unavailable)
# start_date_test = "2024-07-25"
# last_date_test = "2025-01-24"

# Maximum number of features to use
max_nb_features = 20

# Set a level of transaction costs
# (pnl_analytics multiplies the summed absolute position changes by this value)
tc = 0.0000


<br>
<br>
<h2>Data Loading</h2>
<br>
<hr>
<br>

In [None]:
# Read the main dataset: first column is the index, the first two rows
# form a two-level column header
data = pd.read_csv(
    f"{data_path}data_in_sample.csv",
    header=[0,1],
    index_col=0,
)

# Parse the index into datetimes so that date-based slicing works
data.index = pd.to_datetime(data.index)

# Display the table
data


In [None]:
# List the distinct entries of the first column level (the available fields)
first_level_labels = data.columns.get_level_values(0)
first_level_labels.drop_duplicates()


In [None]:
# Load pre-processed features
features = {}
for dirpath, dirnames, filenames in os.walk(data_path):

    # Keep only the feature files BEFORE applying the cap, so that unrelated
    # files (e.g. the main dataset) do not use up feature slots. Sorting makes
    # the selection reproducible: os.walk yields file names in arbitrary order.
    feature_filenames = sorted(
        filename for filename in filenames if "feature" in filename
    )

    # A cap of 0 must select nothing (a plain [-0:] slice would select everything)
    if max_nb_features > 0:
        selected_filenames = feature_filenames[-max_nb_features:]
    else:
        selected_filenames = []

    for filename in selected_filenames:

        print(f"Loading {filename}")

        # Load feature. The path is built from dirpath so that files found in
        # sub-directories of data_path are read from where they actually are.
        feature = pd.read_csv(
            os.path.join(dirpath, filename),
            index_col=0,
            header=[0],
        )

        # Make sure that the index is in the right format
        feature.index = pd.to_datetime(feature.index)

        # Store in the feature dict, keyed by the file name without extension
        features[filename.replace(".csv", "")] = feature
           

<br>
<br>
<h2>Analytics</h2>
<br>
<hr>
Basic portfolio analytics to turn predictions of future instrument returns into positions and evaluate the resulting strategy
<br>
<br>


In [None]:
def expected_returns_to_positions(expected_returns):
    """
    Turn expected returns into portfolio positions.

    Each row is ranked cross-sectionally, the ranks are divided by the row's
    sum of absolute ranks, and the row mean is then subtracted so that the
    positions of a row sum to zero.

    :param expected_returns: pd.DataFrame containing expectations
                             about future instruments prices variations
    :return: pd.DataFrame of positions, same shape as expected_returns
    """

    # Cross-sectional ranks (one ranking per row)
    ranked = expected_returns.rank(axis=1)

    # Divide each row by its total absolute rank
    row_gross = ranked.abs().sum(axis=1)
    scaled = ranked.div(row_gross, axis=0)

    # Remove the row average so that longs and shorts offset each other
    row_average = scaled.mean(axis=1)
    return scaled.sub(row_average, axis=0)


def get_sharpe(pnl_portfolio, rfr_hourly):
    """
    Annualized Sharpe ratio of a series of hourly returns.

    :param pnl_portfolio: pd.Series of returns of the portfolio considered
    :param rfr_hourly: float, the hourly risk free rate
    :return: the Sharpe ratio, rounded to 2 decimals
    """

    # Returns in excess of the risk free rate
    excess = pnl_portfolio - rfr_hourly

    # Annualize with the square root of the number of hours in a year
    annualization = np.sqrt(24 * 365)

    return round(excess.mean() / excess.std() * annualization, 2)


def pnl_analytics(positions,
                  returns,
                  rfr_hourly,
                  lag,
                  tc=0):
    """
    Net p&l series and Sharpe ratio of a set of positions.

    :param positions: pd.DataFrame, some positions that have been reached
    :param returns: pd.DataFrame containing returns of instruments
    :param rfr_hourly: float, the hourly risk free rate
    :param lag: int, the number of hours to reach the positions
    :param tc: float, the transaction costs
    :return: dict with keys "sharpe" (rounded Sharpe ratio) and
             "pnl" (pd.Series of net p&l)
    """

    # A position row is matched with the returns found 1 + lag rows later
    delayed_positions = positions.shift(1 + lag)
    gross_pnl = delayed_positions.mul(returns).sum(axis=1)

    # Costs are proportional to the absolute change in (NaN-free) positions
    turnover = positions.fillna(0).diff().abs().sum(axis=1)
    costs = turnover * tc

    # Net p&l; dates present on one side only are treated as 0 on the other
    net_pnl = gross_pnl.sub(costs, fill_value=0)

    return {"sharpe": get_sharpe(net_pnl, rfr_hourly),
           "pnl": net_pnl}


def analyze_expected_returns(
    expected_returns,
    returns,
    rfr_hourly,
    title = "a Nice Try",
    lags = (0,1,2,3,6,12),
    tc = 0,
    output_sharpe=False,
    display_results=True,
):
    """
    Provide an economic analysis of some expected_returns

    The expected returns are converted to positions, then the p&l and the
    Sharpe ratio are computed for every execution lag requested.

    :param expected_returns: pd.DataFrame containing expectations
                             about future instruments prices variations
    :param returns: pd.DataFrame containing returns of instruments
    :param rfr_hourly: float, the hourly risk free rate
    :param title: str, appended to the plot title
    :param lags: iterable of int, execution lags (in hours) to evaluate
    :param tc: float, the transaction costs
    :param output_sharpe: bool, if True return the Sharpe ratio of lag 0
    :param display_results: bool, if True plot the cumulative returns
    :return: float Sharpe ratio of lag 0 when output_sharpe is True and
             0 is one of the lags, None otherwise
    """

    # Take positions as a function of expected returns
    positions = expected_returns_to_positions(expected_returns)

    # Compute p&l and sharpe for different lags
    pnl_lags = {}
    sharpe_by_lag = {}
    for lag in lags:
        analytics_lag = pnl_analytics(
            positions=positions,
            returns=returns,
            rfr_hourly=rfr_hourly,
            lag=lag,
            tc=tc)
        # Keep the numeric value: parsing it back from the label would
        # hand callers a string
        sharpe_by_lag[lag] = analytics_lag["sharpe"]
        lag_label = f"Lag {lag}, sharpe={analytics_lag['sharpe']}"
        pnl_lags[lag_label] = analytics_lag["pnl"]

    # Display returns
    if display_results:
        pnl_lags = pd.concat(pnl_lags, axis=1).dropna()
        fig = (1+pnl_lags).cumprod().plot(
            title=f"Cumulative returns of {title}",
        )
        fig.update_layout(yaxis_type="log")
        fig.show()

    # Return a float so that callers can compare / maximize it numerically
    if output_sharpe and 0 in sharpe_by_lag:
        return float(sharpe_by_lag[0])
        
    


<br>
<br>
<h2>Regularized Linear Models: Ridge, LASSO and Elastic Net Predictions</h2>
<br>
<hr>
We explore hereafter various regularization methods of linear multivariate models
<br>
<br>


<h4>Label definition</h4>
What do we want to predict?
<br> 

In [None]:
# The label of a row is the return of the NEXT row (shift(-1)) inside the
# training window, stacked into a single long Series
returns_train_window = data["return"].loc[start_date_train:last_date_train]
label = returns_train_window.shift(-1).stack()

<h4>Features Pre-processing</h4>
How to process the features ?
<br> 

In [None]:
normalized_by_name = {}

for feature_name, raw_feature in features.items():

    print(f"Processing {feature_name}")

    # Row-wise percentile ranks tame outliers; subtracting 0.5 centres them on 0
    centred_ranks = raw_feature.rank(axis=1, pct=True) - 0.5

    # Stack into long format and keep under the feature's name
    normalized_by_name[feature_name] = centred_ranks.stack()

# One column per feature; missing values are set to 0, the centre of the
# rank scale, because the models used below cannot handle NaNs
features_normalized = pd.concat(normalized_by_name, axis=1).fillna(0)

# Align the features on the rows of the training label
features_normalized_train = features_normalized.reindex(label.index)
                                                        

<h4>Model Creation</h4>
<br> 

In [None]:
# Three ElasticNet configurations: the L1/L2 mix (l1_ratio) and the
# penalty strength (alpha) differ from one to the next
ridge_model = sklearn.linear_model.ElasticNet(
    fit_intercept=True,
    l1_ratio=0.01,
    alpha=0.0001,
)
LASSO_model = sklearn.linear_model.ElasticNet(
    fit_intercept=True,
    l1_ratio=1,
    alpha=0.000005,
)
elastic_net_model = sklearn.linear_model.ElasticNet(
    fit_intercept=True,
    l1_ratio=0.5,
    alpha=0.000015,
)

# Registry of models, keyed by display name
models = {
    "Ridge" : ridge_model,
    "LASSO" : LASSO_model,
    "Elastic Net" : elastic_net_model,
}

# Fit each model on the training set and store the fitted estimator back
for model_name, model in models.items():
    model = model.fit(
        X=features_normalized_train,
        y=label,
    )
    models[model_name] = model



<h4>Model Predictions</h4>
<br> 
<h5>Train Set</h5>
<br> 

In [None]:
# Evaluate every fitted model on the training set
for model_name, model in models.items():

    # Predict, then unstack the long prediction vector into a wide table
    predictions = pd.Series(
        model.predict(features_normalized_train),
        index=features_normalized_train.index,
    ).unstack()

    # Economic analysis of the predictions
    analyze_expected_returns(
        expected_returns=predictions,
        returns=data["return"].loc[start_date_train:last_date_train],
        rfr_hourly=rfr_hourly,
        title=f"Regularized {model_name} model, Training Set",
        lags=[0,1,2,3,6,12],
        tc=tc,
    )

<br> 
<h5>Validation Set</h5>
<br> 

In [None]:
# Restrict the normalized features to the validation window
# (the index is sorted first, then sliced by date)
features_normalized_sorted = features_normalized.sort_index()
features_normalized_validate = features_normalized_sorted.loc[
    start_date_validate:last_date_validate]

In [None]:
# Evaluate every fitted model on the validation set
for model_name, model in models.items():

    # Predict, then unstack the long prediction vector into a wide table
    predictions = pd.Series(
        model.predict(features_normalized_validate),
        index=features_normalized_validate.index,
    ).unstack()

    # Economic analysis of the predictions
    analyze_expected_returns(
        expected_returns=predictions,
        returns=data["return"].loc[start_date_validate:last_date_validate],
        rfr_hourly=rfr_hourly,
        title=f"Regularized {model_name} model, Validation Set",
        lags=[0,1,2,3,6,12],
        tc=tc,
    )

<br>
<br>
<h2>Various Flavors of Cross-Validation</h2>
<br>
<hr>
We do need a methodology to set the value of the hyperparameters, particularly "alpha"
<br>
<br>


<h4>Searching the optimal hyperparameter</h4>
<br> 

In [None]:
# We will store the sharpes of each alpha
sharpes = pd.Series(dtype=float)

# The validation returns do not depend on alpha: slice them once
returns_validate = data["return"].loc[start_date_validate:last_date_validate]

for alpha in np.linspace(1e-6, 2e-5, 30):

    # Create model
    model = sklearn.linear_model.ElasticNet(
        alpha=alpha,
        l1_ratio=0.5,
        fit_intercept=True,
    )

    # Fit model
    model = model.fit(
        y=label,
        X=features_normalized_train,
    )

    # Predict on the validation set
    predictions = model.predict(
        features_normalized_validate)
    predictions = pd.Series(
        predictions,
        index=features_normalized_validate.index
    ).unstack()

    # Analyse our predictions.
    # float() guarantees that a number is stored, so that idxmax below
    # compares Sharpe ratios numerically and not as text
    sharpes.loc[alpha] = float(analyze_expected_returns(
        expected_returns=predictions,
        returns=returns_validate,
        rfr_hourly=rfr_hourly,
        title=f"Regularized Elastic Net model, lambda={alpha} Validation Set",
        lags=[0,1,2,3,6,12],
        tc=tc,
        output_sharpe=True,
        display_results=False,
    ))

# Select the optimal alpha
# NOTE(review): alpha is chosen on the same validation set that is used to
# report the result below, so that result is optimistic
optimal_alpha = sharpes.idxmax()

# Create model
model = sklearn.linear_model.ElasticNet(
    alpha=optimal_alpha,
    l1_ratio=0.5,
    fit_intercept=True,
)

# Fit model
model = model.fit(
    y=label,
    X=features_normalized_train,
)

# Predict on the validation set
predictions = model.predict(
    features_normalized_validate)
predictions = pd.Series(
    predictions,
    index=features_normalized_validate.index
).unstack()

# Analyse our predictions
# (the title reports the selected alpha, not the last value of the grid)
analyze_expected_returns(
    expected_returns=predictions,
    returns=returns_validate,
    rfr_hourly=rfr_hourly,
    title=f"Regularized Elastic Net model, lambda={optimal_alpha} Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,
)
    

<br> 
<h4>K-Folds Cross-Validation</h4>
<br> 

In [None]:
# Cross-validation folds that respect the ordering of the rows
ts_splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=5)

# Elastic Net that picks its own alpha from the grid by cross-validation
model = sklearn.linear_model.ElasticNetCV(
    alphas=np.linspace(1e-6, 2e-5, 30),
    cv=ts_splitter,
    l1_ratio=0.1,
    fit_intercept=True,
)
model = model.fit(
    X=features_normalized_train,
    y=label,
)

# Predict on the validation set and unstack into a wide table
predictions = pd.Series(
    model.predict(features_normalized_validate),
    index=features_normalized_validate.index,
).unstack()

# Economic analysis of the predictions
analyze_expected_returns(
    expected_returns=predictions,
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Regularized Elastic Net model, lambda calibrated with CV, Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,
)

<br> 
<h4>Walk Forward Cross-Validation</h4>
<br> 

In [None]:
# Recompute the model every month, skip the first 2 months
# (freq="ME" yields month-end dates between the two bounds)
rebalancing_dates = pd.date_range(
    start=start_date_train,
    end=last_date_validate,
    freq="ME"
)[2:]

# Predictions made over successive validation sets will be aggregated into a DataFrame
# (initialised at 0 so that each fold can simply be added to it)
predictions = pd.DataFrame(
    0,
    index=data["return"].index,
    columns=data["return"].columns,
)

# Launch the training loop
for last_date_train_fold in rebalancing_dates:

    # Define training and validation dates

    # Train the model over roughly the last 12 months (31 * 12 days)
    start_date_train_fold = last_date_train_fold - pd.Timedelta(days = 31 * 12)

    # The model cannot be used before the first day following the training
    # (no look-forward bias)
    start_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 1)

    # The trained model will be used for 1 month
    # NOTE(review): the window is a fixed 31 days after each month-end, so
    # consecutive windows do not tile the calendar exactly: depending on the
    # month length they can overlap (overlapping predictions are summed by
    # .add below) or leave timestamps without prediction — confirm intended
    last_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 31 * 1)

    # Log informations
    print(f"Train a model from date {start_date_train_fold} to date {last_date_train_fold}")
    print(f"Predict from date {start_date_validate_fold} to date {last_date_validate_fold}")
    print("")


    # Create label: next-row return inside the training window, in long format
    label_fold = data["return"].loc[start_date_train_fold:last_date_train_fold
        ].shift(-1).stack()

    # Only keep dates of the train and validation sets for the features
    features_normalized_train_fold = features_normalized.reindex(label_fold.index)
    features_normalized_validate_fold = features_normalized.sort_index().loc[
        start_date_validate_fold:last_date_validate_fold]

    # Split the data along the time axis
    ts_splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=5)

    # Create model (alpha is selected by cross-validation within the fold)
    model = sklearn.linear_model.ElasticNetCV(
        l1_ratio=0.5,
        fit_intercept=True,
        cv=ts_splitter,
        alphas=np.logspace(-8, -5, 50),
    )

    # Fit model
    model = model.fit(
        y=label_fold,
        X=features_normalized_train_fold,
    )

    # Predict on the validation set
    predictions_fold = model.predict(
        features_normalized_validate_fold)
    predictions_fold = pd.Series(
        predictions_fold,
        index=features_normalized_validate_fold.index
    ).unstack()

    # Aggregate with other predictions
    predictions = predictions.add(predictions_fold, fill_value=0)

# Analyse our predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Regularized Elastic Net model, Walk-Forward Cross-Validation, Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,

)

<br>
<br>
<h2>Alternative Expected Returns Design</h2>
<br>
<hr>
We try to create hereafter some alternatives to the expected returns obtained by our model
<br>
<br>
<h4>Coming Returns</h4>
<br> 

In [None]:
# Look-ahead signal by construction: at each row, the mean of the NEXT 13
# returns (13-row rolling mean, shifted 13 rows back in time)
rolling_mean_returns = data["return"].rolling(13).mean()
predictions = rolling_mean_returns.shift(-13)

# Economic analysis of the predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Coming Returns",
    lags=[0,1,2,3,6,12],
    tc=tc,
)

<br>
<h4>Top Gainers / Losers</h4>
<br> 

In [None]:
# Average return of each column of the "return" field over the whole dataset
data["return"].mean()

In [None]:
# Average return per column over the data up to the end of 2023,
# sorted in ascending order, without the undefined averages
average_returns = data["return"].loc[:"2023"].mean()
sorted_returns = average_returns.sort_values().dropna()

# The 25 lowest averages are the loosers, the 25 highest are the gainers
top_loosers = sorted_returns.index[:25]
top_gainers = sorted_returns.index[-25:]

In [None]:
# Constant signal: +1 on the top gainers, -1 on the top loosers
all_returns = data["return"]
predictions = pd.DataFrame(
    np.nan,
    index=all_returns.index,
    columns=all_returns.columns,
)
predictions[top_gainers] = 1
predictions[top_loosers] = -1

# Every token outside the two groups gets a neutral signal of 0
predictions = predictions.fillna(0)

# Economic analysis of the predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=all_returns.loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Top Gainers / Loosers",
    lags=[0,1,2,3,6,12],
    tc=tc,
)


<br>
<br>
<h2>Linear Polymodels or Multi-Univariate Linear Model</h2>
<br>
<hr>
We try to reduce complexity further by replacing the multivariate OLS with a collection of univariate models
<br>
<br>


<br>
<h4>Basic Training / Validation Experiment</h4>
<br> 

In [None]:
# Start from a table of zeros and add the prediction of one univariate
# model per feature
predictions = pd.DataFrame(
    0,
    index=data["return"].index,
    columns=data["return"].columns,
)

for feature_name in features_normalized_train.columns:

    # Regressors: the feature alone plus an intercept
    exog_train = sm.add_constant(features_normalized_train[feature_name])
    exog_validate = sm.add_constant(features_normalized_validate[feature_name])

    # Fit the univariate OLS on the training set
    model = sm.OLS(
        endog=label,
        exog=exog_train,
    ).fit()

    # Predict on the validation set and unstack into a wide table
    predictions_feature = model.predict(exog_validate).unstack()

    # Accumulate over features
    predictions = predictions.add(predictions_feature, fill_value=0)

# Economic analysis of the predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Linear Polymodel, Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,
)
    

<br>
<h4>Walk-Forward Implementation</h4>
<br> 

In [None]:
# Recompute the model every month, skip the first 2 months
# (freq="ME" yields month-end dates between the two bounds)
rebalancing_dates = pd.date_range(
    start=start_date_train,
    end=last_date_validate,
    freq="ME"
)[2:]

# Predictions made over successive validation sets will be aggregated into a DataFrame
# (initialised at 0 so that each fold / feature can simply be added to it)
predictions = pd.DataFrame(
    0,
    index=data["return"].index,
    columns=data["return"].columns,
)

# Launch the training loop
for last_date_train_fold in rebalancing_dates:

    # Define training and validation dates

    # Train the model over roughly the last 12 months (30 * 12 days)
    start_date_train_fold = last_date_train_fold - pd.Timedelta(days = 30 * 12)

    # The model cannot be used before the first day following the training
    # (no look-forward bias)
    start_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 1)

    # The trained model will be used for 1 month
    # NOTE(review): fixed 31-day windows after each month-end can overlap or
    # leave gaps between consecutive folds depending on the month length;
    # overlapping predictions are summed by .add below — confirm intended
    last_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 31 * 1)

    # Log informations
    print(f"Train a model from date {start_date_train_fold} to date {last_date_train_fold}")
    print(f"Predict from date {start_date_validate_fold} to date {last_date_validate_fold}")
    print("")


    # Create label: next-row return inside the training window, in long format
    label_fold = data["return"].loc[start_date_train_fold:last_date_train_fold
        ].shift(-1).stack()

    # Only keep dates of the train and validation sets for the features
    features_normalized_train_fold = features_normalized.reindex(label_fold.index)
    features_normalized_validate_fold = features_normalized.sort_index().loc[
        start_date_validate_fold:last_date_validate_fold]

    # One univariate model per feature; their predictions are summed
    for feature_name in features_normalized_train_fold.columns:


        # Split the data along the time axis
        ts_splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=5)

        # Create model (alpha is selected by cross-validation within the fold)
        model = sklearn.linear_model.ElasticNetCV(
            l1_ratio=0.01,
            fit_intercept=True,
            cv=ts_splitter,
            alphas=np.logspace(-8, -5, 50),
        )

        # Fit model (to_frame: the estimator is given a one-column table)
        model = model.fit(
            y=label_fold,
            X=features_normalized_train_fold[feature_name].to_frame(),
        )

        # Predict on the validation set
        predictions_feature = model.predict(
            features_normalized_validate_fold[feature_name].to_frame())
        predictions_feature = pd.Series(
            predictions_feature,
            index=features_normalized_validate_fold[feature_name].index
        ).unstack()

        # Aggregate with other predictions
        predictions = predictions.add(predictions_feature, fill_value=0)

# Analyse our predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Linear Polymodel, Walk-Forward Cross-Validation, Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,

)

<br>
<br>
<h2>Improving Computation Time</h2>
<br>
<hr>
Our ability to run several experiments in a given amount of time determines the speed at which we learn and improve our models.
Computation time is always a constraint in practice, but it is one that can be pushed back.
<br>
<br>


<br>
<h4>Measuring Computation Time</h4>
<br> 

In [None]:
import time

In [None]:
# Measure time: wall-clock start of the whole training loop
t1 = time.time()

# Recompute the model every month, skip the first 2 months
# (freq="ME" yields month-end dates between the two bounds)
rebalancing_dates = pd.date_range(
    start=start_date_train,
    end=last_date_validate,
    freq="ME"
)[2:]

# Predictions made over successive validation sets will be aggregated into a DataFrame
predictions = pd.DataFrame(
    0,
    index=data["return"].index,
    columns=data["return"].columns,
)

# Launch the training loop
for last_date_train_fold in rebalancing_dates:

    # Define training and validation dates

    # Train the model over roughly the last 12 months (30 * 12 days)
    start_date_train_fold = last_date_train_fold - pd.Timedelta(days = 30 * 12)

    # The model cannot be used before the first day following the training
    # (no look-forward bias)
    start_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 1)

    # The trained model will be used for 1 month
    # NOTE(review): fixed 31-day windows after each month-end can overlap or
    # leave gaps between consecutive folds depending on the month length;
    # overlapping predictions are summed by .add below — confirm intended
    last_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 31 * 1)

    # Log informations
    print(f"Train a model from date {start_date_train_fold} to date {last_date_train_fold}")
    print(f"Predict from date {start_date_validate_fold} to date {last_date_validate_fold}")
    print("")


    # Create label: next-row return inside the training window, in long format
    label_fold = data["return"].loc[start_date_train_fold:last_date_train_fold
        ].shift(-1).stack()

    # Only keep dates of the train and validation sets for the features
    features_normalized_train_fold = features_normalized.reindex(label_fold.index)
    features_normalized_validate_fold = features_normalized.sort_index().loc[
        start_date_validate_fold:last_date_validate_fold]

    # One univariate model per feature; their predictions are summed
    for feature_name in features_normalized_train_fold.columns:


        # Split the data along the time axis
        ts_splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=5)

        # Create model (alpha is selected by cross-validation within the fold)
        model = sklearn.linear_model.ElasticNetCV(
            l1_ratio=0.01,
            fit_intercept=True,
            cv=ts_splitter,
            alphas=np.logspace(-8, -5, 50),
        )

        # Fit model (to_frame: the estimator is given a one-column table)
        model = model.fit(
            y=label_fold,
            X=features_normalized_train_fold[feature_name].to_frame(),
        )

        # Predict on the validation set
        predictions_feature = model.predict(
            features_normalized_validate_fold[feature_name].to_frame())
        predictions_feature = pd.Series(
            predictions_feature,
            index=features_normalized_validate_fold[feature_name].index
        ).unstack()

        # Aggregate with other predictions
        predictions = predictions.add(predictions_feature, fill_value=0)


# Training finished, print time used for it
t2 = time.time()
print(f"Total Training time is {t2-t1} seconds")

# Analyse our predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Linear Polymodel, Walk-Forward Cross-Validation, Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,

)

<br>
<h4>Multiprocessing</h4>
<br> 

<h5>Library Embedded Multiprocessing</h5>
<br> 

In [None]:
# Measure time: wall-clock start of the whole training loop
t1 = time.time()

# Recompute the model every month, skip the first 2 months
# (freq="ME" yields month-end dates between the two bounds)
rebalancing_dates = pd.date_range(
    start=start_date_train,
    end=last_date_validate,
    freq="ME"
)[2:]

# Predictions made over successive validation sets will be aggregated into a DataFrame
predictions = pd.DataFrame(
    0,
    index=data["return"].index,
    columns=data["return"].columns,
)

# Launch the training loop
for last_date_train_fold in rebalancing_dates:

    # Define training and validation dates

    # Train the model over roughly the last 12 months (30 * 12 days)
    start_date_train_fold = last_date_train_fold - pd.Timedelta(days = 30 * 12)

    # The model cannot be used before the first day following the training
    # (no look-forward bias)
    start_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 1)

    # The trained model will be used for 1 month
    # NOTE(review): fixed 31-day windows after each month-end can overlap or
    # leave gaps between consecutive folds depending on the month length;
    # overlapping predictions are summed by .add below — confirm intended
    last_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 31 * 1)

    # Log informations
    print(f"Train a model from date {start_date_train_fold} to date {last_date_train_fold}")
    print(f"Predict from date {start_date_validate_fold} to date {last_date_validate_fold}")
    print("")


    # Create label: next-row return inside the training window, in long format
    label_fold = data["return"].loc[start_date_train_fold:last_date_train_fold
        ].shift(-1).stack()

    # Only keep dates of the train and validation sets for the features
    features_normalized_train_fold = features_normalized.reindex(label_fold.index)
    features_normalized_validate_fold = features_normalized.sort_index().loc[
        start_date_validate_fold:last_date_validate_fold]

    # One univariate model per feature; their predictions are summed
    for feature_name in features_normalized_train_fold.columns:


        # Split the data along the time axis
        ts_splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=5)

        # Create model; n_jobs=-1 lets scikit-learn run the cross-validation
        # on all available processors
        model = sklearn.linear_model.ElasticNetCV(
            l1_ratio=0.01,
            fit_intercept=True,
            cv=ts_splitter,
            alphas=np.logspace(-8, -5, 50),
            n_jobs=-1,
        )

        # Fit model (to_frame: the estimator is given a one-column table)
        model = model.fit(
            y=label_fold,
            X=features_normalized_train_fold[feature_name].to_frame(),
        )

        # Predict on the validation set
        predictions_feature = model.predict(
            features_normalized_validate_fold[feature_name].to_frame())
        predictions_feature = pd.Series(
            predictions_feature,
            index=features_normalized_validate_fold[feature_name].index
        ).unstack()

        # Aggregate with other predictions
        predictions = predictions.add(predictions_feature, fill_value=0)


# Training finished, print time used for it
t2 = time.time()
print(f"Total Training time is {t2-t1} seconds")

# Analyse our predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Linear Polymodel, Walk-Forward Cross-Validation, Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,

)

<h5>External Multiprocessing</h5>
<br> 

In [None]:
# Number of folds, i.e. number of independent tasks that can run in parallel
len(rebalancing_dates)

In [None]:
from functools import partial
from multiprocessing import Pool

In [None]:
# Start the stopwatch
t1 = time.time()

# Month-end retraining dates over the whole period; the first 2 are dropped
month_end_dates = pd.date_range(
    start_date_train,
    last_date_validate,
    freq="ME",
)
rebalancing_dates = month_end_dates[2:]

def train_predict_period(
    last_date_train_fold,
    returns,
    features_normalized,
):
    """
    Train one univariate model per feature on the fold ending at
    last_date_train_fold and predict over the window that follows it.

    :param last_date_train_fold: pd.Timestamp, last date of the training fold
    :param returns: pd.DataFrame containing returns of instruments
    :param features_normalized: pd.DataFrame of stacked normalized features,
                                one column per feature
    :return: pd.DataFrame of summed predictions, indexed like returns, whose
             columns carry str(last_date_train_fold) as an extra top level
    """

    # Fold boundaries: 30 * 12 days of training, then predictions from the
    # day after the training end until 31 days after it
    train_begin = last_date_train_fold - pd.Timedelta(days=30 * 12)
    validate_begin = last_date_train_fold + pd.Timedelta(days=1)
    validate_end = last_date_train_fold + pd.Timedelta(days=31 * 1)

    # Log the fold
    print(f"Train a model from date {train_begin} to date {last_date_train_fold}")
    print(f"Predict from date {validate_begin} to date {validate_end}")
    print("")

    # Label: next-row return inside the training window, in long format
    label_fold = returns.loc[train_begin:last_date_train_fold].shift(-1).stack()

    # Features aligned on the label rows / restricted to the validation window
    train_features = features_normalized.reindex(label_fold.index)
    validate_features = features_normalized.sort_index().loc[
        validate_begin:validate_end]

    # Accumulator covering the full index of returns, initialised at 0
    predictions_date = pd.DataFrame(
        0,
        index=returns.index,
        columns=returns.columns,
    )

    for feature_name in train_features.columns:

        # Elastic Net whose alpha is selected by time-ordered cross-validation
        estimator = sklearn.linear_model.ElasticNetCV(
            l1_ratio=0.01,
            fit_intercept=True,
            cv=sklearn.model_selection.TimeSeriesSplit(n_splits=5),
            alphas=np.logspace(-8, -5, 50),
            n_jobs=1,
        )
        estimator = estimator.fit(
            X=train_features[feature_name].to_frame(),
            y=label_fold,
        )

        # Predict on the validation window and unstack into a wide table
        validate_column = validate_features[feature_name]
        predictions_feature = pd.Series(
            estimator.predict(validate_column.to_frame()),
            index=validate_column.index,
        ).unstack()

        # Accumulate over features
        predictions_date = predictions_date.add(
            predictions_feature, fill_value=0)

    # Tag the columns with the fold date so that folds can be told apart
    return pd.concat(
        {str(last_date_train_fold) : predictions_date},
        axis=1
    )


# Fix all but one function parameters to iterate on the last one
partial_train_predict_period = partial(
    train_predict_period,
    returns=data["return"],
    features_normalized=features_normalized,
)

# Train using one core per date
# (a pool of 16 worker processes; each rebalancing date is one task)
with Pool(16) as pool:
    predictions = pool.map(
        partial_train_predict_period, # function to multiprocess
        rebalancing_dates, # values to iterate on
    )

# Reformat predictions: the folds are concatenated side by side, their
# columns carry (fold date, original column) as levels; grouping on level 1
# sums the folds for each original column
predictions = pd.concat(predictions, axis=1).T.groupby(level=1).sum().T


# Training finished, print time used for it
t2 = time.time()
print(f"Total Training time is {t2-t1} seconds")

# Analyse our predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Linear Polymodel, Walk-Forward Cross-Validation, Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,
)

<br>
<h4>Profiling computation time line by line</h4>
<br> 

In [None]:
from line_profiler import LineProfiler

In [None]:
# Wrap the function so that the time spent on each of its lines is recorded
lp = LineProfiler()
lp_wrapper = lp(train_predict_period)

# Run a single fold (the last rebalancing date) under the profiler
lp_wrapper(
    last_date_train_fold=rebalancing_dates[-1],
    returns=data["return"],
    features_normalized=features_normalized,
)

# Print the per-line timings
lp.print_stats()

In [None]:
# Start the stopwatch
t1 = time.time()

# Month-end retraining dates over the whole period; the first 2 are dropped
month_end_dates = pd.date_range(
    start_date_train,
    last_date_validate,
    freq="ME",
)
rebalancing_dates = month_end_dates[2:]

def train_predict_period(
    last_date_train_fold,
    returns,
    # features_normalized is NOT a parameter in this version: the name is
    # read from the enclosing (global) scope
    # NOTE(review): presumably to avoid sending it to every worker — confirm
    #features_normalized,
):
    """
    Train one univariate model per feature on the fold ending at
    last_date_train_fold and predict over the window that follows it.

    Reads features_normalized from the global scope.

    :param last_date_train_fold: pd.Timestamp, last date of the training fold
    :param returns: pd.DataFrame containing returns of instruments
    :return: pd.DataFrame of summed predictions over the validation window,
             whose columns carry str(last_date_train_fold) as an extra
             top level
    """

    # Define training and validation dates

    # Train the model over roughly the last 12 months (30 * 12 days)
    start_date_train_fold = last_date_train_fold - pd.Timedelta(days = 30 * 12)

    # The model cannot be used before the first day following the training
    # (no look-forward bias)
    start_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 1)

    # The trained model will be used for 1 month
    # NOTE(review): fixed 31-day windows after each month-end can overlap or
    # leave gaps between consecutive folds depending on the month length
    last_date_validate_fold = last_date_train_fold + pd.Timedelta(days = 31 * 1)

    # Log informations
    print(f"Train a model from date {start_date_train_fold} to date {last_date_train_fold}")
    print(f"Predict from date {start_date_validate_fold} to date {last_date_validate_fold}")
    print("")


    # Create label: next-row return inside the training window, in long format
    label_fold = returns.loc[start_date_train_fold:last_date_train_fold
        ].shift(-1).stack()

    # Only keep dates of the train and validation sets for the features
    features_normalized_train_fold = features_normalized.reindex(label_fold.index)
    features_normalized_validate_fold = features_normalized.sort_index().loc[
        start_date_validate_fold:last_date_validate_fold]

    # Accumulator limited to the rows of the validation window, initialised at 0
    predictions_date = pd.Series(
        0,
        index=features_normalized_validate_fold.index,
    ).unstack()

    # One univariate model per feature; their predictions are summed
    for feature_name in features_normalized_train_fold.columns:


        # Split the data along the time axis
        ts_splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=5)

        # Create model (alpha is selected by cross-validation within the fold)
        model = sklearn.linear_model.ElasticNetCV(
            l1_ratio=0.01,
            fit_intercept=True,
            cv=ts_splitter,
            alphas=np.logspace(-8, -5, 50),
            n_jobs=1,
        )

        # Fit model (to_frame: the estimator is given a one-column table)
        model = model.fit(
            y=label_fold,
            X=features_normalized_train_fold[feature_name].to_frame(),
        )

        # Predict on the validation set
        predictions_feature = model.predict(
            features_normalized_validate_fold[feature_name].to_frame())
        predictions_feature = pd.Series(
            predictions_feature,
            index=features_normalized_validate_fold[feature_name].index
        ).unstack()

        # Aggregate with other predictions
        predictions_date = predictions_date.add(
            predictions_feature, fill_value=0)

    # Output results, tagged with the fold date as top column level
    return pd.concat(
        {str(last_date_train_fold) : predictions_date},
        axis=1
    )


# Fix all but one function parameters to iterate on the last one
# (the features are not bound here: this version of train_predict_period
# reads them from the global scope)
partial_train_predict_period = partial(
    train_predict_period,
    returns=data["return"],
    # features_normalized=features_normalized,
)

# Train using one core per date
# (a pool of 16 worker processes; each rebalancing date is one task)
with Pool(16) as pool:
    predictions = pool.map(
        partial_train_predict_period, # function to multiprocess
        rebalancing_dates, # values to iterate on
    )

# Reformat predictions: the folds are concatenated side by side, their
# columns carry (fold date, original column) as levels; grouping on level 1
# sums the folds for each original column
predictions = pd.concat(predictions, axis=1).T.groupby(level=1).sum().T


# Training finished, print time used for it
t2 = time.time()
print(f"Total Training time is {t2-t1} seconds")

# Analyse our predictions
analyze_expected_returns(
    expected_returns=predictions.loc[start_date_validate:last_date_validate],
    returns=data["return"].loc[start_date_validate:last_date_validate],
    rfr_hourly=rfr_hourly,
    title=f"Linear Polymodel, Walk-Forward Cross-Validation, Validation Set",
    lags=[0,1,2,3,6,12],
    tc=tc,

)