# Start of modelling

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.linear_model import (LinearRegression, 
                                  HuberRegressor,
                                  ElasticNet)
from sklearn.metrics import (mean_squared_error, 
                             r2_score)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (TimeSeriesSplit, 
                                     ParameterGrid)
from sklearn.pipeline import make_pipeline
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import (GradientBoostingRegressor,
                              RandomForestRegressor as RF)
from group_lasso import GroupLasso
from datetime import datetime

### Loading data and changing it to be usable

In [2]:
# Read datafiles into dataframes.
# y = the excess returns. X has the predictors.
y = pd.read_csv('Dependent_y.csv', header=0, index_col=0)
X = pd.read_csv('Features_X.csv', header=0, index_col=0)


y.fillna(0, inplace=True) # Do we need this? Isn't it already done in Pre-Processing?

# Converting data to Dates as index
y.index = pd.to_datetime(y.index, format="%Y-%m").to_period('M')
X.index = pd.to_datetime(X.index, format="%Y-%m").to_period('M')

# Creating the weights of the stocks compared to the portfolio
weights = pd.read_csv('Stocks_weights.csv', header=0)
weights.index = weights['Date']
weights = weights.drop('Date', axis=1)
weights.index = pd.to_datetime(weights.index, format="%Y-%m").to_period('M')

## Functions

In [3]:

def R_oos(num, den):
    """
Calculates the Out Of Sample R-squared
Input: 
    - num: Numerator
    - den: Denomenator

    Output: Out of sample R-squared
    """
    R_oos_val = 1 - (np.sum(num)/np.sum(den))
    return R_oos_val



def val_fun(model, params: dict, X_trn, y_trn, X_vld, y_vld, max_iter=10, tol=1e-4):
    """
Validates a model to get the best parameters
Input: 
    - model: The model we are validating.
    - params: A dictionary of parameters.
    - X_trn: Predictors training set.
    - y_trn: Dependent variable training set.
    - X_vld: Predictors validation set.
    - y_vld:Dependent variable validation set.
    - max_iter: ...
    - tol: ...

    Output: Best parameters.
    """
    best_ros = None
    lst_params = list(ParameterGrid(params))
    no_improvement_count = 0
    for param in lst_params:
        if best_ros == None:
            mod = model().set_params(**param).fit(X_trn, y_trn)
            y_pred = mod.predict(X_vld)
            best_ros = R_oos(y_vld, y_pred)
            best_param = param
        else:
            mod = model().set_params(**param).fit(X_trn, y_trn)
            y_pred = mod.predict(X_vld)
            ros = R_oos(y_vld, y_pred)
            if ros > best_ros:
                best_ros = ros
                best_param = param
                no_improvement_count = 0
            else:
                no_improvement_count += 1
                if no_improvement_count >= max_iter:
                    break
            if abs(ros - best_ros) < tol:
                break
    return best_param


def Sharpe_gain(Sharpe_val, Roo2_val):
    """
Here: what this function does.
Input: 
    - Sharpe_val: 
    - Roo2_Val:

    Output: 
    """
    SR_star = np.sqrt(((Sharpe_val**2)+Roo2_val)/(1-Roo2_val))
    return SR_star - Sharpe_val

In [4]:
def val_fun_NN(model, params: dict, X_trn, y_trn, X_vld, y_vld, illustration=True):
    """
Validates a Neural Network to return the best mode.
Input: 
    - model: The model we are validating.
    - params: A dictionary of parameters.
    - X_trn: Predictors training set.
    - y_trn: Dependent variable training set.
    - X_vld: Predictors validation set.
    - y_vld: Dependent variable validation set.

    Output: The best Neural Network model.
    """
    best_ros = None
    lst_params = list(ParameterGrid(params))
    for param in lst_params:
        if best_ros is None:
            mod = model(n_layers=param['n_layers'], loss=param['loss'], l1=param['l1'], 
                            learning_rate=param['learning_rate'], batch_size=param['batch_size'], 
                            epochs=param['epochs'], random_state=param['random_state'], 
                            batch_norm=param['batch_norm'], patience=param['patience'], 
                            verbose=param['verbose'], monitor=param['monitor'])
            mod.fit(X_trn, y_trn, X_vld, y_vld)
            best_mod = mod
            y_pred = mod.predict(X_vld)
            best_ros = R_oos(y_vld, y_pred)
            best_param = param
            if illustration:
                print(f'Model with params: {param} finished.')
                print(f'with out-of-sample R squared on validation set: {best_ros*100:.5f}%')
                print('*'*60)
        else:
            mod = model(n_layers=param['n_layers'], loss=param['loss'], l1=param['l1'], 
                            learning_rate=param['learning_rate'], batch_size=param['batch_size'], 
                            epochs=param['epochs'], random_state=param['random_state'], 
                            batch_norm=param['batch_norm'], patience=param['patience'], 
                            verbose=param['verbose'], monitor=param['monitor'])
            mod.fit(X_trn, y_trn, X_vld, y_vld)
            y_pred = mod.predict(X_vld)
            ros = R_oos(y_vld, y_pred)
            if illustration:
                print(f'Model with params: {param} finished.')
                print(f'with out-of-sample R squared on validation set: {ros*100:.5f}%')
                print('*'*60)
            if ros > best_ros:
                best_ros = ros
                best_mod = mod
                best_param = param
    if illustration:
        print('\n'+'#'*60)
        print('Tuning process finished!!!')
        print(f'The best setting is: {best_param}')
        print(f'with R2oos {best_ros*100:.2f}% on validation set.')
        print('#'*60)
    return best_mod


## 1 - OLS(-3)

#### Characteristics of OLS-3
OLS-3 includes size = 'mvel1', Book-to-Market = 'bm', momentum = 'mom1m','mom6m','mom12m','mom36m'.

In [32]:
# Save the predictors needed for the OLS-3 in a seperate file
X_3pred = X[['mvel1', 'bm', 'mom1m', 'mom6m', 'mom12m', 'mom36m']]

### Regression with expanding window and with and without Huber loss

In [23]:
def expanding_regression_OLS(Dependent, Predictors, stock_weights, loss = 'OLS', initial_train_years = 18, validation_years = 12, test_years = 1):
    """
Function that runs OLS with expanding window.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio.
    - loss: Specify if you want to use 'OLS' loss, or 'Huber' loss
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
    """
    # Initialize lists to store values.
    r_port_difference_list = []
    r_port_actual_list =[]
    # List of 1957-2016
    years = Dependent.index.year.unique()

    # Loop for expanding window.
    for i in range(len(years) - initial_train_years - validation_years):
        start_year = years[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()

        # Specifies which loss function to use.    
        if loss == 'OLS':
            model = LinearRegression()
        elif loss == 'Huber':
            model = HuberRegressor(epsilon = 99.9) # Set the epsilon to 99.9%.
        else:
            raise ValueError("Invalid loss function. Use OLS or Huber.")
        
        # Training the model
        OLS3 = model.fit(X_train, y_train)

        # Predict returns at stock level
        r_stock_pred = OLS3.predict(X_test).reshape(-1)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)
        
    return Model_Roos

In [35]:
#OUT-OF-SAMPLE R^2 for OLS-3
OLS_3pred_Roos=expanding_regression_OLS(y,X_3pred,stock_weights=weights)
OLS_3pred_Roos

-0.001969218131502082

In [34]:
#OUT-OF-SAMPLE R^2 for OLS-3 with Huber Loss function
OLS_3pred_Roos_H = expanding_regression_OLS(y, X_3pred,stock_weights=weights, loss = 'Huber')
OLS_3pred_Roos_H

0.0004975755819653926

## 2 - Dimension Reduction: PCR and PLS

### 2.1 - PCR

In [36]:
def pcr(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs PCR with expanding window.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
    """
    # Lists to save outcomes.
    component_counts = []
    r_port_difference_list = []
    r_port_actual_list = []

    yrs = Dependent.index.year.unique() # List with all the years. (1957-2016)
    n_components_list = range(1, 7) # To determine the number of components that are tested.
    best_components = None # Initialize and later save the best amount of components.
    best_r2 = -np.inf  # Initialize with negative infinity to find the maximum R-squared

    # Expanding window.
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()

        # Testing the different components in PCA.
        for n_component in n_components_list:

            pca = PCA(n_components=n_component)
            X_train_pca = pca.fit_transform(X_train) 
            X_val_pca = pca.transform(X_val)

            # Fit Linear Regression on the training set
            model = LinearRegression()
            model.fit(X_train_pca, y_train)

            # Predict on the validation set
            y_val_pred = model.predict(X_val_pca)
            r2 = r2_score(y_val, y_val_pred)

            # Update best components if the current number of components yields a higher R-squared
            if r2 > best_r2:
                best_r2 = r2
                best_components = n_component

        # Save the best number of components to the list
        component_counts.append(best_components)

        # Use the best number of components to fit the final model on the combined training and validation sets
        best_pca = PCA(n_components=best_components)
        X_train_pca = best_pca.fit_transform(X_train)
        X_test_pca = best_pca.transform(X_test)
        
        # Best Model
        PCASP500 = LinearRegression()
        PCASP500.fit(X_train_pca, y_train)

        # Predict returns at the stock level
        r_stock_pred = PCASP500.predict(X_test_pca)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [None]:
# Get Out Of Sample R_squared
PCR_Roo2 = pcr(Dependent=y, Predictors=X, stock_weights=weights)
PCR_Roo2

### 2.2 - PLS

In [56]:
def walk_forward_pls(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs PLS with expanding window.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
    """ 
    # # Initalize to store r-squared for portfolio..
    r_port_difference_list = []
    r_port_actual_list =[]
    component_counts = [] # Initialize to save number of components.
    years = Predictors.index.year.unique()#List of years (1957-2016)
    n_components_list = range(1, 7) # To determine the number of components that are tested.
    best_components = None # Initialize and later save the best amount of components.
    best_r2 = -np.inf  # Initialize with negative infinity to find the maximum R-squared

    for i in range(len(years) - initial_train_years - validation_years): 
        start_year = years[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()

        # Testing the different components in PLS.
        for n_component in n_components_list:

            # Train the model once on the training set
            pls = PLSRegression(n_components=n_component)
            pls.fit(X_train, y_train) 

           # Predict on the validation set
            y_val_pred = pls.predict(X_val)
            r2 = r2_score(y_val, y_val_pred)

            # Update best components if the current number of components yields a higher R-squared
            if r2 > best_r2:
                best_r2 = r2
                best_components = n_component

        # Save the best number of components to the list
        component_counts.append(best_components)

        # Use the best number of components to fit the final model 
        best_pls = PLSRegression(n_components=best_components)
        best_pls.fit(X_train, y_train)

        # Evaluate the final model on the test set
        r_stock_pred = best_pls.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [57]:
# Runs PLS and saves the results
r_squared_scores_pls = walk_forward_pls(y, X, stock_weights=weights ,initial_train_years=18, validation_years=12, test_years=1)

In [58]:
print(r_squared_scores_pls)

-0.05519827043915937


## 3 - Elastic Net & Lasso & Ridge

Elastic Net -> l1_ratio=0.5
Ridge -> l1_ratio=0
Lasso -> l1_ratio=1

### E-net

In [77]:
def ENet(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Elastic Net with expanding window.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
    """
    # All the years(1957-2016)
    yrs = Dependent.index.year.unique()
    # Initalize to store r-squared for portfolio.
    r_port_difference_list = []
    r_port_actual_list = []
    # Tested tuning parameters
    tuning_par = {
        "alpha": np.linspace(1e-1, 1e-4, num=10),
        "l1_ratio": [0.5], "tol":[1e-2]
    } 
    
    # Now the model runs for every time of the 30 splits and for every possible combination of the tuning parameters.
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(ElasticNet, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        # Now we test the model
        ENet_SP500 = ElasticNet(alpha=best_par['alpha'], l1_ratio=best_par['l1_ratio']).fit(X_train, y_train)

        # Calculate R_squared        
        r_stock_pred = ENet_SP500.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [79]:
ENet = ENet(Dependent=y,Predictors=X,stock_weights=weights) 
ENet

0.25721887452371794

### Lasso

In [80]:
def Lasso(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Lasso Regression with expanding window.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    # All the years(1957-2016)
    yrs = Dependent.index.year.unique()
    # Initalize to store r-squared for portfolio.
    r_port_difference_list = []
    r_port_actual_list = []
    # Tested tuning parameters
    tuning_par = {
        "alpha": np.linspace(1e-1, 1e-4, num=10),
        "l1_ratio": [1], "tol":[1e-2]
    } 
    
    # Now the model runs for every time of the 30 splits and for every possible combination of the tuning parameters.
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(ElasticNet, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        # Now we test the model
        LAS_SP500 = ElasticNet(alpha=best_par['alpha'], l1_ratio=best_par['l1_ratio']).fit(X_train, y_train)

        # Calculate R_squared        
        r_stock_pred = LAS_SP500.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [81]:
# Lasso results
Lasso_score = Lasso(Dependent=y,Predictors=X,stock_weights=weights) 
Lasso_score

0.25721887452371794

### Ridge

In [82]:
def Ridge(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Ridge Regression with expanding window.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    # All the years(1957-2016)
    yrs = Dependent.index.year.unique()
    # Initalize to store r-squared for portfolio.
    r_port_difference_list = []
    r_port_actual_list = []
    # Tested tuning parameters
    tuning_par = {
        "alpha": np.linspace(1e-1, 1e-4, num=10),
        "l1_ratio": [0], "tol":[1e-2]
    } 
    
    # Now the model runs for every time of the 30 splits and for every possible combination of the tuning parameters.
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(ElasticNet, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        # Now we test the model
        RID_SP500 = ElasticNet(alpha=best_par['alpha'], l1_ratio=best_par['l1_ratio']).fit(X_train, y_train)

        # Calculate R_squared        
        r_stock_pred = RID_SP500.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [83]:
# Results Ridge regression
Ridge_score = Ridge(Dependent=y,Predictors=X,stock_weights=weights) 
Ridge_score

0.25721887452371794

### Implementation of Huber-Loss

Huber-Loss-function

In [103]:
def huber_loss(y_val, y_pred, delta):
    """
Function that ...
Input: 
    - y_val: ...
    - y_pred: ...
    - delta: ...

    Output: ...
"""
    error = y_val - y_pred
    is_small_error = np.abs(error) <= delta
    squared_loss = 0.5 * (error ** 2)
    linear_loss = delta * (np.abs(error) - 0.5 * delta)
    return np.where(is_small_error, squared_loss, linear_loss)

In [104]:
def val_fun_with_huber(model, params: dict, X_trn, y_trn, X_vld, y_vld):
    """
Function that ...
Input: 
    - model: ...
    - params: ...
    - X_trn: ...
    - y_trn: ...
    - X_vld ...
    - y_vld ...

    Output: ...
"""
    best_ros = None
    lst_params = list(ParameterGrid(params))
    for param in lst_params:
        if best_ros == None:
            mod = model().set_params(**param).fit(X_trn, y_trn)
            y_pred = mod.predict(X_vld)
            smallest_loss = huber_loss(y_vld, y_pred, delta=99.9)
            best_param = param
        else:
            mod = model().set_params(**param).fit(X_trn, y_trn)
            y_pred = mod.predict(X_vld)
            loss = huber_loss(y_vld, y_pred, delta=99.9)
            if loss < smallest_loss:
                smallest_loss = loss
                best_param = param
    return best_param

### E-Net with Huber

In [105]:
def ENet_with_huber(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Elastic Net using Huber Loss function.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    # All the years(1957-2016)
    yrs = Dependent.index.year.unique()
    # Initalize to store r-squared for portfolio.
    r_port_difference_list = []
    r_port_actual_list = []
    # Tested tuning parameters
    tuning_par = {
        "alpha": np.linspace(1e-1, 1e-4, num=10),
        "l1_ratio": [0.5], "tol":[1e-2]
    } 
    
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years
        end_validation_year = end_train_year + validation_years
        end_test_year = end_validation_year + test_years

        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        best_par = val_fun_with_huber(ElasticNet, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        ENetH_SP500 = ElasticNet(alpha=best_par['alpha'], l1_ratio=best_par['l1_ratio']).fit(X_train, y_train)

        # Calculate R_squared        
        r_stock_pred = ENetH_SP500.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [106]:
# Results Elastic Net with Huber loss function
ENet_huber_scores = ENet_with_huber(Dependent=y, Predictors=X, stock_weights=weights)
ENet_huber_scores

0.2450819024981108

## Lasso + H

In [107]:

def Lasso_with_huber(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Lasso Regression with Huber Loss function.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    # All the years(1957-2016)
    yrs = Dependent.index.year.unique()
    # Initalize to store r-squared for portfolio.
    r_port_difference_list = []
    r_port_actual_list = []
    # Tested tuning parameters
    tuning_par = {
        "alpha": np.linspace(1e-1, 1e-4, num=10),
        "l1_ratio": [1], "tol":[1e-2]
    } 
    
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years
        end_validation_year = end_train_year + validation_years
        end_test_year = end_validation_year + test_years

        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        best_par = val_fun_with_huber(ElasticNet, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        LASH_SP500 = ElasticNet(alpha=best_par['alpha'], l1_ratio=best_par['l1_ratio']).fit(X_train, y_train)

        # Calculate R_squared        
        r_stock_pred = LASH_SP500.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [108]:
# Results Lasso Regression with Huber loss function
Lasso_with_huber_scores = Lasso_with_huber(Dependent=y, Predictors=X, stock_weights=weights)
Lasso_with_huber_scores

0.24486347729415947

## Ridge + H

In [109]:
def Ridge_with_huber(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Ridge Regression with Huber Loss function.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    # All the years(1957-2016)
    yrs = Dependent.index.year.unique()
    # Initalize to store r-squared for portfolio.
    r_port_difference_list = []
    r_port_actual_list = []
    # Tested tuning parameters
    tuning_par = {
        "alpha": np.linspace(1e-1, 1e-4, num=10),
        "l1_ratio": [0], "tol":[1e-2]
    } 
    
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years
        end_validation_year = end_train_year + validation_years
        end_test_year = end_validation_year + test_years

        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        best_par = val_fun_with_huber(ElasticNet, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        RIDH_SP500 = ElasticNet(alpha=best_par['alpha'], l1_ratio=best_par['l1_ratio']).fit(X_train, y_train)

        # Calculate R_squared        
        r_stock_pred = RIDH_SP500.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [110]:
# Results Ridge Regression with Huber loss function
Ridge_with_huber_scores = Ridge_with_huber(Dependent=y, Predictors=X, stock_weights=weights)
Ridge_with_huber_scores

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

0.24203593905499088

## 4 - GLM


In [14]:
# 
def flatten(l):
    return [item for sublist in l for item in sublist]

In [71]:
def GLM(y, X,stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Ridge Regression with Huber Loss function.
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    yrs = X.index.year.unique()
    r_port_difference_list = []
    r_port_actual_list = []
    Roos_list = []
    tuning_par = {
    #'knots': [3],
    'group_reg':[1e-4,1e-1],
    'l1_reg': [1e-4,0],
    'groups': [],
    'random_state': [12308]
    }

    #GETTING THE SPLINES
    spline_data = pd.DataFrame(np.ones((X.shape[0],1)),index=X.index,columns=['const'])
    for i in X.columns:
        i_dat = X.loc[:,i]
        i_sqr = i_dat**2
        i_cut, bins = pd.cut(i_dat, 3, right=True, ordered=True, retbins=True)
        i_dum = pd.get_dummies(i_cut)
        for j in np.arange(3):
            i_dum.iloc[:,j] = i_dum.iloc[:,j]*((i_dat-bins[j])**2)
        i_dum.columns = [f"{i}_{k}" for k in np.arange(1,3+1)]
        spline_data = pd.concat((spline_data,i_dat,i_dum),axis=1)


    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = spline_data[(X.index.year < end_train_year)]
        X_test = spline_data[(X.index.year >= end_validation_year) & (X.index.year < end_test_year)]
        X_val = spline_data[(X.index.year >= end_train_year) & (X.index.year < end_validation_year)]
        y_train = y[(y.index.year < end_train_year)].values.ravel()
        y_test = y[(y.index.year >= end_validation_year) & (y.index.year < end_test_year)].values.ravel()
        y_val = y[(y.index.year >= end_train_year) & (y.index.year < end_validation_year)].values.ravel()
        
        groups = [0]+flatten([list(np.repeat(i,3+1))[:] for i in np.arange(1,X.shape[1]+1)])
        tuning_par['groups'] = groups
        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(GroupLasso, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        GL=GroupLasso(groups=best_par['groups'], group_reg=best_par['group_reg'], l1_reg=best_par['l1_reg'], fit_intercept=False, random_state=best_par['random_state'],supress_warning=True).fit(X_train,y_train)
        #GL=GroupLasso(groups=best_par.groups,group_reg=best_par.lmd,l1_reg=best_par.l1_reg,fit_intercept=False,random_state=best_par.random_state)
        
        # Predict returns at the stock level
        r_stock_pred = GL.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [72]:
# Results GLM
GLM_Roo2 = GLM(y=y, X=X, stock_weights=weights)   
GLM_Roo2

1957-01-01    0.315774
1957-01-01    0.344997
1957-01-01    0.328586
1957-01-01    0.361710
1957-01-01    0.328800
                ...   
2016-12-01    0.329053
2016-12-01    0.000000
2016-12-01    0.318052
2016-12-01    0.350582
2016-12-01    0.363564
Length: 359357, dtype: float64' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  i_dum.iloc[:,j] = i_dum.iloc[:,j]*((i_dat-bins[j])**2)
1957-01-01    0.000000
1957-01-01    0.000000
1957-01-01    0.000000
1957-01-01    0.000000
1957-01-01    0.000000
                ...   
2016-12-01    0.000000
2016-12-01    0.002201
2016-12-01    0.000000
2016-12-01    0.000000
2016-12-01    0.000000
Length: 359357, dtype: float64' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  i_dum.iloc[:,j] = i_dum.iloc[:,j]*((i_dat-bins[j])**2)
1957-01-01    0.0
1957-01-01    0.0
1957-01-01    0.0
1957-01-01    0.0
1957-01-01    0.0
             ... 
2016-12-01    0.0
2016-12-01    0.0


0.2548427126697421

In [75]:
print('GLM Roos^2 for a small subsample of predictons:',GLM_Roo2)

GLM Roos^2 for a small subsample of predictons: 0.2548427126697421


## 5 - Random Forest

#### Selecting Subsamples

In [91]:
rf_pred = ['mom1m', 'dy']
X_red_rf = X[rf_pred]
X_red_rf

Unnamed: 0_level_0,mom1m,dy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1957-01-01,-0.440062,0.000000
1957-01-01,-0.414635,0.000000
1957-01-01,-0.428776,0.000000
1957-01-01,-0.400577,0.000000
1957-01-01,-0.428589,0.000000
...,...,...
2016-12-01,-0.428368,-0.995178
2016-12-01,-0.286416,-0.911305
2016-12-01,-0.438039,-0.777349
2016-12-01,-0.409901,-0.798266


#### Random Forest Function

In [None]:
def Random_F(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Random Forest
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    yrs = Dependent.index.year.unique()
    r_port_difference_list = []
    r_port_actual_list = []
    tuning_par = {
    'n_estimators': [300],
    'max_depth': [3,6],
    'max_features': [30,50,100],
    'random_state': [12308]
    }
    
    # Now the model runs for every time of the 30 splits and for every possible combination of the tuning parameters.
    # In this case is 30*60 = 1800, but only the best R2 for every split are stored. 
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(RF, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        # Now we test the model
        RF_SP500 = RF(n_estimators=best_par['n_estimators'], max_depth=best_par['max_depth'], max_features=best_par['max_features'], 
               random_state=best_par['random_state']).fit(X_train, y_train)
        
        # Predict returns at the stock level
        r_stock_pred = RF_SP500.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [None]:
# Results Random Forest
RF_Roo2 = Random_F(Dependent=y,Predictors=X, stock_weights=weights)   
RF_Roo2
#(Running time with 2 predictors: 14 minutes)

## 6 - Gradient Boosted Regression Trees

In [None]:
def GBRT(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Gradient Boosted Regression Tree
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    yrs = Predictors.index.year.unique()
    r_port_difference_list = []
    r_port_actual_list = []
    tuning_par = {
    'n_estimators': range(1, 150),
    'max_depth': range(1,2),
    'learning_rate': [0.01, 0.1]
    }
    
    # Now the model runs for every time of the 30 splits and for every possible combination of the tuning parameters.
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()

        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(GradientBoostingRegressor, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        # Now we test the model
        GBRT_SP500 = GradientBoostingRegressor(n_estimators=best_par['n_estimators'], max_depth=best_par['max_depth'], learning_rate=best_par['learning_rate']).fit(X_train, y_train)

        r_stock_pred = GBRT_SP500.predict(X_test)
   
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [None]:
# run function and print resulting R2 value
GBRT_R2 = GBRT(Dependent=y,Predictors=X, stock_weights=weights)   
print(GBRT_R2)
print(np.mean(GBRT_R2)) 

## Additional Method: XGBOOST

In [None]:
from xgboost import XGBRegressor

In [None]:
def XGBoost(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs XGBoost
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    yrs = Dependent.index.year.unique()
    r_port_difference_list = []
    r_port_actual_list = []
    tuning_par = {
    'n_estimators': [500,600,800,1000],
    'max_depth': [1,2],
    'random_state': [12308],
    #'learning_rate': [.01]
    }
    
    # Now the model runs for every time of the 30 splits and for every possible combination of the tuning parameters.
    # In this case is 30*60 = 1800, but only the best R2 for every split are stored. 
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(XGBRegressor, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        # Now we test the model
        XGB = XGBRegressor(n_estimators=best_par['n_estimators'], max_depth=best_par['max_depth']).fit(X_train, y_train)
        
        # Predict returns at the stock level
        r_stock_pred = XGB.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [None]:
# Results XGBoost
XGB_Roo2 = XGBoost(Dependent=y,stock_weights=weights,Predictors=X)   
XGB_Roo2

In [None]:
np.mean(XGB_Roo2)

## Additional Method: BART Model

pip install ISLP

In [None]:
from ISLP.bart import BART

In [None]:
def BARTrees(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that runs Bayesian Addetive Regression Tree
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    yrs = Dependent.index.year.unique()
    r_port_difference_list = []
    r_port_actual_list = []
    tuning_par = {
    'num_trees': [100,200,300],
    'burnin': [50,150,200],
    'max_stages': [500,1000,2000]
    }
    
    # Now the model runs for every time of the 30 splits and for every possible combination of the tuning parameters.
    # In this case is 30*60 = 1800, but only the best R2 for every split are stored. 
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(BART, params=tuning_par, X_trn=np.asarray(X_train), y_trn=y_train, X_vld=np.asarray(X_val), y_vld=y_val)
        
        # Now we test the model
        BART_SP500 = BART(num_trees=best_par['num_trees'], burnin=best_par['burnin'], max_stages=best_par['max_stages']).fit(X_train, y_train)
        
        # Predict returns at the stock level
        r_stock_pred = BART_SP500.predict(np.asarray(X_test))
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [None]:
# Results BART
BART_Roo2 = BARTrees(Dependent=y,Predictors=X, stock_weights=weights)   
BART_Roo2

In [None]:
np.mean(BART_Roo2)

## Additional Method: Bagging

In [3]:
def Bagging(Dependent, Predictors, stock_weights, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that adds Bagging to the Random Forest
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    yrs = Dependent.index.year.unique()
    r_port_difference_list = []
    r_port_actual_list = []
    tuning_par = {
    'n_estimators': [300],
    'max_depth': [3,6],
    'random_state': [12308]
    }
    
    # Now the model runs for every time of the 30 splits and for every possible combination of the tuning parameters.
    # In this case is 30*60 = 1800, but only the best R2 for every split are stored. 
    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
        
        # This part runs the tuning to find the best combination of the tuning parameters for every split
        best_par = val_fun(RF, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
        
        # Now we test the model
        BAG_SP500 = RF(n_estimators=best_par['n_estimators'], max_depth=best_par['max_depth'], max_features=X_train.shape[1], 
               random_state=best_par['random_state']).fit(X_train, y_train)
        
        # Predict returns at the stock level
        r_stock_pred = BAG_SP500.predict(X_test)
        
        # Gets weights from current testing year
        weights_test = stock_weights.loc[str(end_validation_year)]
        # Initialize dataframe to store predicted and actual returns
        r_portfolio = pd.DataFrame(index=weights_test.index, columns=['return_test', 'return_pred'])
        
        # Calculate monthly return predicted and actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the results in a DataFrame
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Store numerator and denominator to calculate out of sample R-Squared
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
    
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)

    return Model_Roos

In [None]:
# Results Bagging
Bagg_Roo2 = Bagging(Dependent=y,Predictors=X,stock_weights=weights)
Bagg_Roo2

# Neural Network

In [None]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
import gc

In [None]:
def NN_function(Dependent, Predictors, stock_weights, num_layers, initial_train_years=18, validation_years=12, test_years=1):
    """
Function that adds Bagging to the Random Forest
Input: 
    - Dependent: Dependent variable data
    - Predictors: Independent variables data
    - stock_weights: The weights of a stock as percentage of portfolio
    - num_layers: The number of layers in the Neural Network
    - initial_train_years: Number of initial training years. (Default is 18)
    - validation_years: Number of years for the validation set. (Default is 12)
    - test_years: Number of years for the test set. (Default is 1)

    Output: Out of sample R-squared.
"""
    yrs = Dependent.index.year.unique()
    r_port_difference_list = []
    r_port_actual_list = []

    for i in range(len(yrs) - initial_train_years - validation_years):
        start_year = yrs[i]
        end_train_year = start_year + initial_train_years  # 18 years of training and increasing with 1 year every iteration
        end_validation_year = end_train_year + validation_years  # 12 years of validation
        end_test_year = end_validation_year + test_years  # 1 year of test

        # Creating training, validation and test sets.
        X_train = Predictors[(Predictors.index.year < end_train_year)]
        X_test = Predictors[(Predictors.index.year >= end_validation_year) & (Predictors.index.year < end_test_year)]
        X_val = Predictors[(Predictors.index.year >= end_train_year) & (Predictors.index.year < end_validation_year)]
        y_train = Dependent[(Dependent.index.year < end_train_year)].values.ravel()
        y_test = Dependent[(Dependent.index.year >= end_validation_year) & (Dependent.index.year < end_test_year)].values.ravel()
        y_val = Dependent[(Dependent.index.year >= end_train_year) & (Dependent.index.year < end_validation_year)].values.ravel()
         
        tuning_par = {
        'n_layers': [num_layers],
        'loss': ['mse'],
        'l1': [1e-5, 1e-3],
        'learning_rate': [.001, .01],
        'batch_size': [10000],
        'epochs': [100],
        'batch_norm': [True],
        'random_state': [1],
        'patience': [5],
        'verbose': [0],
        'monitor': ['val_loss']}
        # NN class
        class NN(nn.Module):
            def __init__(
                self, n_layers=1, loss='mse', l1=1e-5, l2=0, learning_rate=.01, batch_norm=True, patience=5,
                epochs=100, batch_size=10000, verbose=1, random_state=1, monitor='val_loss', base_neurons=5
            ):
                super(NN, self).__init__()
                self.n_layers = n_layers
                self.l1 = l1
                self.l2 = l2
                self.learning_rate = learning_rate
                self.batch_norm = batch_norm
                self.patience = patience
                self.epochs = epochs
                self.batch_size = batch_size
                self.verbose = verbose
                self.monitor = monitor
                self.base_neurons = base_neurons
                self.random_state = random_state

                # Initialize model layers
                self.layers = nn.ModuleList()
                input_size, output_size = None, 1

                for i in range(self.n_layers, 0, -1):
                    if self.n_layers > self.base_neurons:
                        in_features = input_size if input_size is not None else X_train.shape[1]
                        out_features = 2 ** i
                        self.layers.append(nn.Linear(in_features, out_features))
                        self.layers.append(nn.ReLU())
                    else:
                        in_features = input_size if input_size is not None else X_train.shape[1]
                        out_features = 2 ** (self.base_neurons - (self.n_layers - i + 1))
                        self.layers.append(nn.Linear(in_features, out_features))
                        self.layers.append(nn.ReLU())
                    input_size = out_features
                    if self.batch_norm:
                        self.layers.append(nn.BatchNorm1d(out_features))

                self.layers.append(nn.Linear(input_size, output_size))

                # Loss function
                self.criterion = nn.MSELoss()

                # Optimizer
                self.optimizer = Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.l1 + self.l2)

            def forward(self, x):
                for layer in self.layers:
                    x = layer(x)
                return x

            def fit(self, X_train, y_train, X_val, y_val):
                torch.manual_seed(self.random_state)
                np.random.seed(self.random_state)
                random.seed(self.random_state)

                X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
                y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
                X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
                y_val_tensor = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)


                train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
                train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

                early_stop_counter = 0
                best_loss = float('inf')

                for epoch in range(self.epochs):
                    self.train()
                    for inputs, targets in train_loader:
                        self.optimizer.zero_grad()
                        outputs = self(inputs)
                        loss = self.criterion(outputs, targets)
                        loss.backward()
                        self.optimizer.step()

                    # Validation loss
                    self.eval()
                    with torch.no_grad():
                        outputs = self(X_val_tensor)
                        val_loss = self.criterion(outputs, y_val_tensor)

                    if val_loss < best_loss:
                        best_loss = val_loss
                        early_stop_counter = 0
                    else:
                        early_stop_counter += 1

                    if early_stop_counter >= self.patience:
                        print("Early stopping.")
                        break

                    if self.verbose and epoch % self.verbose == 0:
                        print(f"Epoch {epoch + 1}/{self.epochs}, Validation Loss: {val_loss:.6f}")

                return self

            def predict(self, X):
                self.eval()
                with torch.no_grad():
                    X_tensor = torch.tensor(X.values, dtype=torch.float32)
                    return self(X_tensor).numpy()

        #Setting best NN model
        best_NN = val_fun_NN(NN, params=tuning_par, X_trn=X_train, y_trn=y_train, X_vld=X_val, y_vld=y_val)
    
        # Predict returns at the stock level
        r_stock_pred = best_NN.predict(X_test).reshape(-1)

        weights_test = stock_weights.loc[str(end_validation_year)]
        # Portofolio test
        dates = weights_test.index
        
        r_portfolio = pd.DataFrame(index=dates, columns=['return_test', 'return_pred'])
        
        # Calculate portfolio return actual 
        for month in range(1, 13):
            start_index = (month - 1) * weights_test.shape[0] // 12  
            end_index = month * weights_test.shape[0] // 12
            month_weights = weights_test.iloc[start_index:end_index]
            month_y_test = y_test[start_index:end_index]
            month_y_pred = r_stock_pred[start_index:end_index]
            # Store the result directly in the DataFrame  
            r_portfolio.loc[f'{end_validation_year}-{month:02d}', ['return_test', 'return_pred']] = np.sum(month_weights['weight'] * month_y_test), np.sum(month_weights['weight'] * month_y_pred) 

        # Appending values to lists
        r_port_difference_list.extend(((r_portfolio['return_test']-r_portfolio['return_pred'])**2).tolist())
        #print(r_port_difference_list)
        r_port_actual_list.extend(((r_portfolio['return_test'])**2).tolist())
        #print(r_port_actual_list)
    # Calculate Roos
    Model_Roos = R_oos(r_port_difference_list, r_port_actual_list)
        
    return Model_Roos

In [None]:
# Results NN1-Regression-[32(relu)-1(linear)]

NN_1_ROO2 = NN_function(Dependent=y, Predictors=X, stock_weights=weights, num_layers=1, initial_train_years=18, validation_years=12, test_years=1)
NN_1_ROO2

In [None]:
gc.collect()

In [None]:
# Results NN2-Regression-[32(relu)-16(relu)-1(linear)]
NN_2_ROO2 = NN_function(Dependent=y, Predictors=X, stock_weights=weights, num_layers=2, initial_train_years=18, validation_years=12, test_years=1)
NN_2_ROO2

In [None]:
gc.collect()

In [None]:
# Results NN3-Regression-[32(relu)-16(relu)-8(relu)-1(linear)]
NN_3_ROO2 = NN_function(Dependent=y, Predictors=X, stock_weights=weights, num_layers=3, initial_train_years=18, validation_years=12, test_years=1)
NN_3_ROO2

In [None]:
gc.collect()

In [None]:
# Results NN4-Regression-[32(relu)-16(relu)-8(relu)-4(relu)-1(linear)]
NN_4_ROO2 = NN_function(Dependent=y, Predictors=X, stock_weights=weights, num_layers=4, initial_train_years=18, validation_years=12, test_years=1)
NN_4_ROO2

In [None]:
gc.collect()

In [None]:

# NN5-Regression-[32(relu)-16(relu)-8(relu)-4(relu)-2(relu)-1(linear)]
NN_5_ROO2 = NN_function(Dependent=y, Predictors=X, stock_weights=weights, num_layers=5, initial_train_years=18, validation_years=12, test_years=1)
NN_5_ROO2

In [None]:
gc.collect()