### Data pre-processing

In [8]:
# from fiam
import datetime
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA

print(datetime.datetime.now())

# turn off pandas Setting with Copy Warning
pd.set_option("mode.chained_assignment", None)

# set working directory
work_dir = "C:/Users/akobe/OneDrive/Asset-Management-FIAM/McGill-FIAM Asset Management Hackathon/data/"

# read sample data
file_path = os.path.join(
    work_dir, "hackathon_sample_v2.csv"
)  # replace with the correct file name
raw = pd.read_csv(
    file_path, parse_dates=["date"], low_memory=False
)  # the date is the first day of the return month (t+1)

# read list of predictors for stocks
file_path = os.path.join(
    work_dir, "factor_char_list.csv"
)  # replace with the correct file name
stock_vars = list(pd.read_csv(file_path)["variable"].values)

# define the left hand side variable
ret_var = "stock_exret" #possibly change?
new_set = raw[
    raw[ret_var].notna()
].copy()  # create a copy of the data and make sure the left hand side is not missing

# transform each variable in each month to the same scale
monthly = new_set.groupby("date")
data = pd.DataFrame()
for date, monthly_raw in monthly:
    group = monthly_raw.copy()
    # rank transform each variable to [-1, 1]
    for var in stock_vars:
        var_median = group[var].median(skipna=True)
        group[var] = group[var].fillna(
            var_median
        )  # fill missing values with the cross-sectional median of each month

        group[var] = group[var].rank(method="dense") - 1
        group_max = group[var].max()
        if group_max > 0:
            group[var] = (group[var] / group_max) * 2 - 1
        else:
            group[var] = 0  # in case of all missing values
            print("Warning:", date, var, "set to zero.")

    # add the adjusted values
    data = data._append(
        group, ignore_index=True
    )  # append may not work with certain versions of pandas, use concat instead if needed

# initialize the starting date, counter, and output data
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = 0
pred_out = pd.DataFrame()

2024-09-21 00:43:57.462155


In [2]:
def select_features(X, y, n_features=50):
    """Select top features based on F-statistic"""
    selector = SelectKBest(f_regression, k=n_features)
    selector.fit(X, y)
    selected_features = X.columns[selector.get_support()].tolist()
    return selected_features

In [3]:
#  from fiam - do not run takes long to execute
"""
#  estimation with expanding window
while (starting + pd.DateOffset(years=11 + counter)) <= pd.to_datetime(
    "20240101", format="%Y%m%d"
):
    cutoff = [
        starting,
        starting
        + pd.DateOffset(
            years=8 + counter
        ),  # use 8 years and expanding as the training set
        starting
        + pd.DateOffset(
            years=10 + counter
        ),  # use the next 2 years as the validation set
        starting + pd.DateOffset(years=11 + counter),
    ]  # use the next year as the out-of-sample testing set

    # cut the sample into training, validation, and testing sets
    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    selected_features = select_features(train[stock_vars], train[ret_var])

    # Optional: if your data has additional binary or categorical variables,
    # you can further standardize them here
    scaler = StandardScaler().fit(train[stock_vars])
    train[stock_vars] = scaler.transform(train[stock_vars])
    validate[stock_vars] = scaler.transform(validate[stock_vars])
    test[stock_vars] = scaler.transform(test[stock_vars])

    # get Xs and Ys
    X_train = train[stock_vars].values
    Y_train = train[ret_var].values
    X_val = validate[stock_vars].values
    Y_val = validate[ret_var].values
    X_test = test[stock_vars].values
    Y_test = test[ret_var].values

    # de-mean Y (because the regressions are fitted without an intercept)
    # if you want to include an intercept (or bias in neural networks, etc), you can skip this step
    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    # prepare output data
    reg_pred = test[
        ["year", "month", "date", "permno", ret_var]
    ]  # minimum identifications for each stock

    # Linear Regression
    # no validation is needed for OLS
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    x_pred = reg.predict(X_test) + Y_mean
    reg_pred["ols"] = x_pred

    # Lasso
    lambdas = np.arange(
        -4, 4.1, 0.1
    )  # search for the best lambda in the range of 10^-4 to 10^4, range can be adjusted
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(lambdas):
        reg = Lasso(alpha=(10**i), max_iter=1000000, fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)

    # select the best lambda based on the validation set
    best_lambda = lambdas[np.argmin(val_mse)]
    reg = Lasso(alpha=(10**best_lambda), max_iter=1000000, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    x_pred = reg.predict(X_test) + Y_mean  # predict the out-of-sample testing set
    reg_pred["lasso"] = x_pred

    # Ridge
    # same format as above
    lambdas = np.arange(-1, 8.1, 0.1)
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(lambdas):
        reg = Ridge(alpha=((10**i) * 0.5), fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)

    best_lambda = lambdas[np.argmin(val_mse)]
    reg = Ridge(alpha=((10**best_lambda) * 0.5), fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    x_pred = reg.predict(X_test) + Y_mean
    reg_pred["ridge"] = x_pred

    # Elastic Net
    # same format as above
    lambdas = np.arange(-4, 4.1, 0.1)
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(lambdas):
        reg = ElasticNet(alpha=(10**i), max_iter=1000000, fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)

    best_lambda = lambdas[np.argmin(val_mse)]
    reg = ElasticNet(alpha=(10**best_lambda), max_iter=1000000, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    x_pred = reg.predict(X_test) + Y_mean
    reg_pred["en"] = x_pred

    # add to the output data
    pred_out = pred_out._append(reg_pred, ignore_index=True)

    # go to the next year
    counter += 1

# output the predicted value to csv
out_path = os.path.join(work_dir, "output.csv")
print(out_path)
pred_out.to_csv(out_path, index=False)

# print the OOS R2
yreal = pred_out[ret_var].values
for model_name in ["ols", "lasso", "ridge", "en"]:
    ypred = pred_out[model_name].values
    r2 = 1 - np.sum(np.square((yreal - ypred))) / np.sum(np.square(yreal))
    print(model_name, r2)

# for timing purpose
print(datetime.datetime.now())
"""

### Pipeline and Training

The following block provides initial tests for predicting the stock excess return - minimizing MSE

In [3]:
from sklearn.model_selection import TimeSeriesSplit
from joblib import Parallel, delayed
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# feature scaling function
def scale_features(train, validate, test, stock_vars):
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train[stock_vars])
    validate_scaled = scaler.transform(validate[stock_vars])
    test_scaled = scaler.transform(test[stock_vars])
    return train_scaled, validate_scaled, test_scaled

# 3. model training with parallel processing
def train_model(model, X_train, Y_train, X_val, Y_val, Y_mean):
    model.fit(X_train, Y_train)
    val_mse = mean_squared_error(Y_val, model.predict(X_val) + Y_mean)
    return model, val_mse

def parallel_model_training(X_train, Y_train, X_val, Y_val, Y_mean):
    models = {
        'ols': LinearRegression(fit_intercept=False),
        'lasso': Lasso(max_iter=10000, fit_intercept=False),
        'ridge': Ridge(fit_intercept=False),
        'en': ElasticNet(max_iter=10000, fit_intercept=False)
    }
    
    results = Parallel(n_jobs=-1)(
        delayed(train_model)(model, X_train, Y_train, X_val, Y_val, Y_mean)
        for model in models.values()
    )
    
    return dict(zip(models.keys(), results))

# ltsm - in progress
class StockLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(StockLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # Add extra dimension if input is unbatched
        if len(x.shape) == 2:
            x = x.unsqueeze(0)
        
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out.squeeze(0)  # Remove extra dimension for unbatched input

class StockDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


def train_lstm(X_train, Y_train, X_val, Y_val, epochs=100, batch_size=32):
    input_dim = X_train.shape[1]
    model = StockLSTM(input_dim, hidden_dim=50, num_layers=2, output_dim=1)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())
    
    train_dataset = StockDataset(X_train, Y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    for epoch in range(epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs.squeeze(), batch_y)
            loss.backward()
            optimizer.step()
    
    model.eval()
    with torch.no_grad():
        val_predictions = model(torch.FloatTensor(X_val)).squeeze().numpy()
    
    return model, mean_squared_error(Y_val, val_predictions)

# Time series split default 5
tscv = TimeSeriesSplit(n_splits=5)
    
for train_index, test_index in tscv.split(data):
    train_val = data.iloc[train_index]
    test = data.iloc[test_index]
        
    # Further split train_val into train and validate
    train_size = int(0.8 * len(train_val))
    train = train_val[:train_size]
    validate = train_val[train_size:]
        
    # Scale features
    X_train, X_val, X_test = scale_features(train, validate, test, stock_vars)
    Y_train, Y_val, Y_test = train[ret_var].values, validate[ret_var].values, test[ret_var].values
        
        # De-mean Y
    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean
        
    # Train models in parallel
    model_results = parallel_model_training(X_train, Y_train_dm, X_val, Y_val, Y_mean)
        
    # Train LSTM
        # lstm_model, lstm_val_mse = train_lstm(X_train, Y_train, X_val, Y_val)
        
        # Make predictions and evaluate
    for model_name, (model, val_mse) in model_results.items():
        test_pred = model.predict(X_test) + Y_mean
        test_mse = mean_squared_error(Y_test, test_pred)
        print("---")
        print(f"{model_name} - Validation MSE: {val_mse}, Test MSE: {test_mse}")
        
        # LSTM prediction and evaluation
         # lstm_model.eval()
        #  with torch.no_grad():
         #     lstm_test_pred = lstm_model(torch.FloatTensor(X_test)).squeeze().numpy()
      #    lstm_test_mse = mean_squared_error(Y_test, lstm_test_pred)
       #   print(f"LSTM - Validation MSE: {lstm_val_mse}, Test MSE: {lstm_test_mse}")


---
ols - Validation MSE: 0.015196703399922797, Test MSE: 0.0070769780541416576
---
lasso - Validation MSE: 0.014119349956661777, Test MSE: 0.0064109213676483605
---
ridge - Validation MSE: 0.015196103962458237, Test MSE: 0.007076800698866414
---
en - Validation MSE: 0.014119349956661777, Test MSE: 0.0064109213676483605
---
ols - Validation MSE: 0.005725876461242681, Test MSE: 0.014633755384064666
---
lasso - Validation MSE: 0.00547020960108268, Test MSE: 0.01435129579188557
---
ridge - Validation MSE: 0.005725836485825353, Test MSE: 0.014633691176228791
---
en - Validation MSE: 0.00547020960108268, Test MSE: 0.01435129579188557
---
ols - Validation MSE: 0.01009783362922321, Test MSE: 0.0068532274721569055
---
lasso - Validation MSE: 0.009911206346441886, Test MSE: 0.00667082672997179
---
ridge - Validation MSE: 0.010097808296425901, Test MSE: 0.006853205026655427
---
en - Validation MSE: 0.009911206346441886, Test MSE: 0.00667082672997179
---
ols - Validation MSE: 0.006804843525877954

Attempting to combine the first part with stock selection

In [4]:
# Function to predict on next month and select top 100 stocks
def predict_next_month(models, X_test, Y_mean, stock_ids, top_n=100):
    predictions = {}
    
    # Get predictions from each model
    for model_name, (model, _) in models.items():
        predictions[model_name] = model.predict(X_test) + Y_mean
    
    # OLS to begin
    # Select OLS model for final selection to start - using other model selection methods - khai suggested looking into information coefficient 
    final_predictions = predictions['ols']
    
    # Create a DataFrame with predictions and stock ids
    pred_df = pd.DataFrame({
        'ticker': stock_ids,
        'predicted_return': final_predictions
    })
    
    # Sort by predicted return and select top N stocks
    top_stocks = pred_df.sort_values(by='predicted_return', ascending=False).head(top_n)
    
    return top_stocks

# Main execution
#def main():
# Prepare data

tscv = TimeSeriesSplit(n_splits=5)
    
for train_index, test_index in tscv.split(data):
    train_val = data.iloc[train_index]
    test = data.iloc[test_index]
        
    # Further split train_val into train and validate
    train_size = int(0.8 * len(train_val))
    train = train_val[:train_size]
    validate = train_val[train_size:]
        
    # Scale features
    X_train, X_val, X_test = scale_features(train, validate, test, stock_vars)
    Y_train, Y_val, Y_test = train[ret_var].values, validate[ret_var].values, test[ret_var].values
        
    # De-mean Y
    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean
        
    # Train models in parallel
    model_results = parallel_model_training(X_train, Y_train_dm, X_val, Y_val, Y_mean)
        
    # Train LSTM 
    # lstm_model, lstm_val_mse = train_lstm(X_train, Y_train, X_val, Y_val)
        
    # Make predictions for the next month and select top 100 stocks
    stock_ids = test['stock_ticker'].values  # using ticker for ID
    top_100_stocks = predict_next_month(model_results, X_test, Y_mean, stock_ids, top_n=100)
    
    # Add top 100 stocks to portfolio (further logic can be implemented here)
    print(top_100_stocks)

# Additional steps:
# 1. Calculate weights (cequal-weighted to start - can use predictions later)
# 2. Add these stocks to portfolio and track performance
# 3. Iterate the process and evaluate performance at each step


      ticker  predicted_return
10049    MNY          0.101783
184      WDC          0.095579
28347    RCI          0.092300
26530    RCI          0.087547
4959     WDC          0.085358
...      ...               ...
45325   LLTC          0.061181
33839    SFG          0.061044
40986   SUNW          0.061025
30350    CNO          0.060977
8860      AW          0.060784

[100 rows x 2 columns]
      ticker  predicted_return
30647    AMD          0.074281
41454   BRCD          0.072448
27433     KG          0.072171
29766    AMD          0.069326
30604     MU          0.066990
...      ...               ...
11554   QLGC          0.050886
35097    CCE          0.050817
17861   ENDP          0.050806
246     SEPR          0.050769
39915   FDML          0.050765

[100 rows x 2 columns]
      ticker  predicted_return
27040    GNW          0.059678
12075    CAR          0.051638
23888    AMD          0.050113
22470    GNW          0.048970
13604   TMUS          0.047965
...      ...          

Adding portfolio evaluaiton and better stock selection - still simple using nlargest

In [5]:
# Function to select top 100 stocks based on predicted returns for the next month
def select_top_stocks(predictions, stock_ids, top_n=100):
    stock_returns_df = pd.DataFrame({
        'stock_ticker': stock_ids,
        'predicted_return': predictions
    })
    stock_returns_df = stock_returns_df.drop_duplicates(subset=['stock_ticker'])  # Ensure no duplicates
    top_stocks = stock_returns_df.nlargest(top_n, 'predicted_return')
    return top_stocks

# Portfolio performance evaluation with equal weights
def evaluate_portfolio(stock_data, selected_stocks, ret_var):
    equal_weights = np.ones(len(selected_stocks)) / len(selected_stocks)  # Equal weight for each stock
    selected_stock_data = stock_data[stock_data['permno'].isin(selected_stocks['stock_ticker'])]
    
    portfolio_return = (selected_stock_data.groupby('date')[ret_var].mean()).mean()  # Average return of the portfolio
    print(f"Portfolio Return (equal-weighted): {portfolio_return}")
    return portfolio_return

# Main execution
tscv = TimeSeriesSplit(n_splits=5)
    
for train_index, test_index in tscv.split(data):
    train_val = data.iloc[train_index]
    test = data.iloc[test_index]
    
    # Ensure that test set contains data for one full month
    test = test[(test['date'].dt.month == (test['date'].dt.month.min() + 1))]  # Next month
    
    # Further split train_val into train and validate
    train_size = int(0.8 * len(train_val))
    train = train_val[:train_size]
    validate = train_val[train_size:]
    
    # Scale features
    X_train, X_val, X_test = scale_features(train, validate, test, stock_vars)
    Y_train, Y_val, Y_test = train[ret_var].values, validate[ret_var].values, test[ret_var].values
    
    # De-mean Y
    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean
    
    # Train models in parallel
    model_results = parallel_model_training(X_train, Y_train_dm, X_val, Y_val, Y_mean)
    
    for model_name, (model, val_mse) in model_results.items():
        test_pred = model.predict(X_test) + Y_mean
        test_mse = mean_squared_error(Y_test, test_pred)
        print(f"{model_name} - Validation MSE: {val_mse}, Test MSE: {test_mse}")
        
        # Select top 100 stocks for the next month
        top_100_stocks2 = select_top_stocks(test_pred, test['permno'], top_n=100)
        
        # Evaluate the portfolio with equal weights
        evaluate_portfolio(test, top_100_stocks2, ret_var)
        
    # LSTM model 
    # lstm_model, lstm_val_mse = train_lstm(X_train, Y_train, X_val, Y_val)
    # with torch.no_grad():
    #    lstm_test_pred = lstm_model(torch.FloatTensor(X_test)).squeeze().numpy()
    #    lstm_test_mse = mean_squared_error(Y_test, lstm_test_pred)
    
    

ols - Validation MSE: 0.015196703399922797, Test MSE: 0.005724219654758669
Portfolio Return (equal-weighted): 0.021320510831690034
lasso - Validation MSE: 0.014119349956661777, Test MSE: 0.00533553671789525
Portfolio Return (equal-weighted): 0.005594892693205861
ridge - Validation MSE: 0.015196103962458237, Test MSE: 0.005724074469773026
Portfolio Return (equal-weighted): 0.021320510831690034
en - Validation MSE: 0.014119349956661777, Test MSE: 0.00533553671789525
Portfolio Return (equal-weighted): 0.005594892693205861
ols - Validation MSE: 0.005725876461242681, Test MSE: 0.01301053509817297
Portfolio Return (equal-weighted): 0.00879326007626934
lasso - Validation MSE: 0.00547020960108268, Test MSE: 0.0128663732618627
Portfolio Return (equal-weighted): -0.001599930963518663
ridge - Validation MSE: 0.005725836485825353, Test MSE: 0.01301048912892264
Portfolio Return (equal-weighted): 0.00879326007626934
en - Validation MSE: 0.00547020960108268, Test MSE: 0.0128663732618627
Portfolio Ret

### Adding next month stock feature by shifting values 

In [9]:
# some preprocessing
data['next_month_stockExRet'] = data.groupby('stock_ticker')['stock_exret'].pct_change().shift(-1)
data = data.drop(columns=['shrcd', 'ret_eom'])
data = data[~np.isinf(data['next_month_stockExRet'])]
data = data.dropna()

  data['next_month_stockExRet'] = data.groupby('stock_ticker')['stock_exret'].pct_change().shift(-1)


In [12]:
import pandas as pd
from sklearn.linear_model import Ridge  # or any other model

# Prepare the training data (up to 2009-12-31) - test OOS is from 2010 to 2023
train_data = data[data['date'] <= '2009-12-31']  # All data until the end of 2009
test_data = data[data['date'] >= '2010-01-01']   # Predict for 2010 and beyond

# Drop unnecessary columns for training features and target
X_train = train_data.drop(columns=['next_month_stockExRet', 'stock_ticker', 'stock_exret', 'date', 'size_port', 'cusip', 'comp_name'])
y_train = train_data['next_month_stockExRet']

# Train the model
model = Ridge()  # random starting model
model.fit(X_train, y_train)

# Predict for the next 12 months - 2010
results = []
for month in range(1, 13):
    # Extract the test data for the current month
    X_test = test_data[test_data['date'].dt.month == month].drop(columns=['next_month_stockExRet', 'stock_ticker', 'stock_exret', 'date', 'size_port', 'cusip', 'comp_name'])
    y_test = test_data[test_data['date'].dt.month == month]['next_month_stockExRet']
    
    # Ensure there's test data for this month
    if X_test.empty:
        print(f"No test data available for month {month}")
        continue
    
    # Predict the next month's returns
    y_pred = model.predict(X_test)
    
    # Get the stock tickers for the test set
    stock_tickers = test_data[test_data['date'].dt.month == month]['stock_ticker'].values
    
    # Create a DataFrame with stock tickers and predicted returns
    stock_predictions = pd.DataFrame({'stock_ticker': stock_tickers, 'predicted_return': y_pred})
    
    # Rank stocks based on predicted returns and select the top 100
    top_100_stocks = stock_predictions.nlargest(100, 'predicted_return')
    
    # Calculate the average actual return of the top 100 stocks
    actual_returns = test_data[
        (test_data['stock_ticker'].isin(top_100_stocks['stock_ticker'])) & 
        (test_data['date'].dt.month == month)
    ]['next_month_stockExRet'].mean()
    
    # Store the results for this month, including both stock tickers and predicted returns
    results.append({
        'month': f'2023-{month:02d}',
        'top_100_stocks': top_100_stocks[['stock_ticker', 'predicted_return']].to_dict('records'),  # List of top stocks and their predicted returns
        'average_actual_return': actual_returns
    })
    
# Create a DataFrame to summarize the results
results_df = pd.DataFrame(results)

# Display the final results
print(results_df)


      month                                     top_100_stocks  \
0   2023-01  [{'stock_ticker': 'FOSL', 'predicted_return': ...   
1   2023-02  [{'stock_ticker': 'FOSL', 'predicted_return': ...   
2   2023-03  [{'stock_ticker': 'FOSL', 'predicted_return': ...   
3   2023-04  [{'stock_ticker': 'FOSL', 'predicted_return': ...   
4   2023-05  [{'stock_ticker': 'COLM', 'predicted_return': ...   
5   2023-06  [{'stock_ticker': 'FOSL', 'predicted_return': ...   
6   2023-07  [{'stock_ticker': 'BBIO', 'predicted_return': ...   
7   2023-08  [{'stock_ticker': 'IRTC', 'predicted_return': ...   
8   2023-09  [{'stock_ticker': 'MARA', 'predicted_return': ...   
9   2023-10  [{'stock_ticker': 'CERE', 'predicted_return': ...   
10  2023-11  [{'stock_ticker': 'MARA', 'predicted_return': ...   
11  2023-12  [{'stock_ticker': 'HMSY', 'predicted_return': ...   

    average_actual_return  
0               -2.358061  
1               -1.273336  
2               -0.053791  
3                1.420487  
4

something seems off with 12th month predictions.

In [13]:
results_df

Unnamed: 0,month,top_100_stocks,average_actual_return
0,2023-01,"[{'stock_ticker': 'FOSL', 'predicted_return': ...",-2.358061
1,2023-02,"[{'stock_ticker': 'FOSL', 'predicted_return': ...",-1.273336
2,2023-03,"[{'stock_ticker': 'FOSL', 'predicted_return': ...",-0.053791
3,2023-04,"[{'stock_ticker': 'FOSL', 'predicted_return': ...",1.420487
4,2023-05,"[{'stock_ticker': 'COLM', 'predicted_return': ...",0.482415
5,2023-06,"[{'stock_ticker': 'FOSL', 'predicted_return': ...",-4.366208
6,2023-07,"[{'stock_ticker': 'BBIO', 'predicted_return': ...",-2.051492
7,2023-08,"[{'stock_ticker': 'IRTC', 'predicted_return': ...",-1.108195
8,2023-09,"[{'stock_ticker': 'MARA', 'predicted_return': ...",-10.847763
9,2023-10,"[{'stock_ticker': 'CERE', 'predicted_return': ...",-0.75954


note this is initial tests and possible that this approach is not correct. 