# Import all used packages

In [1]:
import pandas as pd
import numpy as np
import tensorflow.compat.v1.keras.backend as K
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import layers
from tensorflow.keras.layers import LSTM, SimpleRNN
from tensorflow.keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow .keras.optimizers import Adam
import keras
from sklearn.metrics import accuracy_score
from keras import optimizers
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import pickle
import datetime

import warnings
#from pandas.tseries.offsets import DateOffset
warnings.filterwarnings('ignore')

# Load in data in respective periods
To keep our machines from running out of memory, we only load the data needed for the period being analysed. Each cell below assigns the same variables, so run only the cell for the period of interest.

### 2005-2011

In [2]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Prepared features/targets for the 2005-2011 period plus the matching portfolio frames.
X_train = _load_pickle("save_X.p")
y_train = _load_pickle("save_y.p")
X_test = _load_pickle("save_X_test.p")
y_test = _load_pickle("save_y_test.p")
dates_df_test_pf = _load_pickle("dates_df_test_pf.p")

### 2012-2018

In [2]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Prepared features/targets for the 2012-2018 period plus the matching portfolio frames.
X_train = _load_pickle("save_X_12_18.p")
y_train = _load_pickle("save_y_12_18.p")
X_test = _load_pickle("save_X_test_12_18.p")
y_test = _load_pickle("save_y_test_12_18.p")
dates_df_test_pf = _load_pickle("dates_df_test_pf_12_18.p")

### 2019-2022

In [None]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Prepared features/targets for the 2019-2022 period plus the matching portfolio frames.
X_train = _load_pickle("save_X_19_22.p")
y_train = _load_pickle("save_y_19_22.p")
X_test = _load_pickle("save_X_test_19_22.p")
y_test = _load_pickle("save_y_test_19_22.p")
dates_df_test_pf = _load_pickle("dates_df_test_pf_19_22.p")

### 1990-2004

In [None]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Prepared training features/targets for the 1990-2004 period (no test set is loaded here).
X_train = _load_pickle("save_X_1990.p")
y_train = _load_pickle("save_y_1990.p")

# Load in NASDAQ 100 data
Load in Nasdaq 100 data that is used in the portfolio section

In [14]:
# Average monthly NASDAQ 100 return for each period we consider. This is used for the
# rebalancing threshold.
Nasdaq100_index = pd.read_csv("NASDAQ100_index.csv")

# (start, end) per period: start is inclusive, end is exclusive. DATE is compared as a
# string, which orders correctly as long as it is stored as YYYY-MM-DD.
_NASDAQ_PERIODS = [
    ("2005-01-31", "2012-02-01"),
    ("2012-01-31", "2019-02-01"),
    ("2019-01-31", "2023-02-01"),
]

Nasdaq_avg_ret = []
for period_start, period_end in _NASDAQ_PERIODS:
    in_period = (Nasdaq100_index["DATE"] >= period_start) & (Nasdaq100_index["DATE"] < period_end)
    raw_levels = Nasdaq100_index.loc[in_period, "NASDAQ100"].tolist()
    # Missing observations are stored as "." and the remaining values are strings,
    # so drop the placeholders and convert to float in one pass.
    plot_nas = [float(level) for level in raw_levels if level != '.']
    # Simple return between consecutive observations.
    ret_nas = [current / previous - 1 for previous, current in zip(plot_nas, plot_nas[1:])]
    # Sum of the returns divided by the number of 21-observation blocks (rounded up),
    # i.e. an average return per ~21 trading days.
    Nasdaq_avg_ret.append(sum(ret_nas)/np.ceil(len(ret_nas)/21))


Nasdaq_avg_ret

[0.008280178107935323, 0.013346305965169048, 0.014801658109719847]

In [2]:
# Assemble the modelling dataset: base panel + VIX + monthly closing price + market cap.
df = pd.read_csv("sorted_dates.csv") # Load in unfinished dataset

df_vix = pd.read_csv("VIXCLS.csv") # Load in VIX (we decided to add it late)
# Rename DATE -> datadate and parse both date columns so VIX can be merged on datadate
df_vix["datadate"] = df_vix["DATE"]
df_vix["datadate"] = pd.to_datetime(df_vix["datadate"])
df["datadate"] = pd.to_datetime(df["datadate"])
df_vix = df_vix.drop(["DATE"],axis = 1)
# merge_asof (default direction) attaches the latest VIX row at or before each datadate.
# It requires both frames to be sorted on the key - presumably sorted_dates.csv already is; TODO confirm
df = pd.merge_asof(df,df_vix, on ="datadate")

df_close = pd.read_csv("Closing_price_month.csv") # Load in closing price monthly (we noticed we might need it)
df_close["datadate"] = pd.to_datetime(df_close["datadate"])
df_close = df_close[["tic","datadate","prccm"]]

df = df.reset_index(drop=True)

df = df.fillna(0) # Fill NaN with 0 (applies to every column present at this point)
# Remove a bunch of unused columns from dataset (names that are not present are simply ignored)
df = df.loc[:, ~df.columns.isin(["return quarterly","tic.1","ggroup.1", 'ggroup', "gsector","gsector.1","naics.1","naics","sic.1","sic"])]
df = df.sort_values(["tic","datadate"]) # Sort dataset on tickers and datadate, to get in correct order
# Inner merge (pandas default): rows without a matching (tic, datadate) closing price are dropped
df = pd.merge(df, df_close, on=['tic', 'datadate']) # Merge closing price to dataset
# Remove two firms with wildly inconsistent price and return data
df = df[df["tic"]!="CRGE"]
df = df[df["tic"]!="HYMC"]

# NOTE: shift() is applied to the whole column, not per ticker, so the first/last row of
# every firm mixes in the neighbouring firm's price. Those rows are removed further down.
df["log_ret"] = np.log(df["prccm"]/df["prccm"].shift(1)) # Include log-return from "current" month as feature
df["target"] = np.log(df["prccm"].shift(-1)/df["prccm"]) # We want to "forecast" (predict next month's log-return)

# Load in monthly dataset since we need some information from here for the portfolio stuff
monthly = pd.read_csv("MONTHLY ALL RAW.csv")
monthly = monthly[["tic","datadate","prccm","cshom"]] # Only interested in ticker, datadate, closing price, and shares outstanding
monthly["mkt cap"] = monthly["prccm"] * monthly["cshom"] # Calculate market cap (used in market cap weighted portfolios)
monthly = monthly.drop(["prccm","cshom"],axis=1) # After market cap is calculated, we have no need for closing price and shares outstanding
# Parse the date column and inner-merge market cap onto the dataset
monthly["datadate"] = pd.to_datetime(monthly["datadate"]) 
df = pd.merge(df, monthly, on=['tic', 'datadate'])
df["mkt cap"] = df["mkt cap"].fillna(0) # A few market caps are NaN; they are set to 0 here (the rows are kept)

# log_ret and target were computed with a plain shift over the whole column, so the first
# row of every firm has a log_ret built from the previous firm's price and the last row of
# every firm has a target built from the next firm's price. Remove both rows for every firm.
df = df.reset_index(drop = True)
tic_column = df["tic"]
# The very first and very last rows of the frame compare against NaN and are therefore flagged too.
is_first_row_of_firm = tic_column.ne(tic_column.shift(1))
is_last_row_of_firm = tic_column.ne(tic_column.shift(-1))
df = df[~(is_first_row_of_firm | is_last_row_of_firm)]
df = df.dropna() # Remove NaN values
df = df.reset_index(drop = True)
df = df.sort_values(["tic","datadate"]) # Sort dataset on tickers and datadate, to get in correct order, again

n_features = 31 # Number of feature columns fed to the model (used as the RNN input width). We have 31

input_size = 12 # Number of consecutive monthly observations per firm that are put in the model. We choose 12 months
train_size = 0.8 # NOT USED!!! - old variable from when we trained on 80% and tested on 20% of data

df = df[df["datadate"]<"2023"] # We decided for simplicity to cut off the data at 2023

# The next lines are what separates the data in the aforementioned periods.
# Exactly one filter should be active; the others stay commented out.
#df = df[(df["datadate"]<"2012-01-01") & (df["datadate"]>"2004-02-01")] 
#df = df[(df["datadate"]<"2019-01-01") & (df["datadate"]>"2011-02-01")] 
#df = df[(df["datadate"]<"2023-01-01") & (df["datadate"]>"2018-02-01")] 

# Currently active: the 1990-2004 period (strictly after 1989-01-01, strictly before 2005-01-01)
df = df[(df["datadate"]<"2005") & (df["datadate"]>"1989")] 

# After examining descriptor correlation, these were deemed not important, and thus dropped
df = df.drop(["MIDREV","MIDREV excess","STOQ","ATO","TB3MS","rf"],axis = 1)



  0%|          | 0/1141398 [00:00<?, ?it/s]

Since we decided that we wanted to compare our portfolio results to the Nasdaq 100 index, it makes sense that we build our portfolio on the very same Nasdaq 100 constituents. Thus, we create a test set only containing the firms available in our dataset that were/are Nasdaq constituents in the correct time periods.

In [None]:
# Create the train / test split from the Nasdaq 100 constituent lists
from functools import reduce  # no longer needed by this cell; kept in case other cells rely on it
N100 = pd.ExcelFile("Ticker NASDAQ100 data BLOOMBERG.xlsx") # Workbook with one sheet of constituents per year
Nasdaq100 = {} # Maps year -> dataframe of that year's constituents
years = np.arange(2004,2024) # Sheets 2004 through 2023
Tickers = []
for year in years:
    constituents = pd.read_excel(N100, str(year))
    # Keep only the first whitespace-separated token of each entry (the bare ticker symbol)
    constituents["Ticker"] = constituents["Ticker"].str.split().str[0]
    Nasdaq100[year] = constituents
    Tickers = np.union1d(Tickers, constituents["Ticker"]) # End up with complete list of Nasdaq 100 constituents


unique_tics = np.unique(df["tic"]) # Determine the unique tickers in dataset
intersect = np.intersect1d(unique_tics,Tickers) # Tickers that are both in the dataset and a Nasdaq 100 constituent in any year
is_constituent = df["tic"].isin(intersect)
# Train on every ticker that was never a constituent, test on the constituents.
# Close price and market cap are not model inputs, so they are removed from both sets.
train = df[~is_constituent].drop(["prccm","mkt cap"],axis = 1)
test = df[is_constituent].drop(["prccm","mkt cap"],axis = 1)
# Companion of the test set for the portfolio section: only date, ticker, close price and market cap
test_pf = df.loc[is_constituent, ["datadate","tic","prccm","mkt cap"]]

In [None]:
# Scale both train and test set to [0, 1], one column at a time.
# columns[2:-1] skips the first two columns and the last one - presumably the
# identifier/date columns and the target; TODO confirm against the column order.
for col in train.columns[2:-1]:
    scaler = MinMaxScaler()
    # Fit on the training set only, so no information from the test set leaks into the scaling
    train[[col]] = scaler.fit_transform(train[[col]]) # Use fit_transform on training set
    # Test values outside the training range will fall outside [0, 1]
    test[[col]] = scaler.transform(test[[col]]) # Use transform on test set. ONLY NASDAQ COMPANIES

The next functions use the input_size parameter from earlier. We want to represent the data in "chunks" of _input_size_ (in this case 12) datapoints per firm. The first function therefore creates a list, _dates_df_, of dataframes for the train set, each containing a year's worth of data, with each subsequent list entry sliding one month ahead. The list index corresponds to the month (dates_df[0] contains the first month, and so on).

The bottom function does roughly the same but for the test set. A key difference is, that the resulting list is a level deeper since we need to keep track of specific tickers for each year. Thus the first index corresponds to the year and the next index is the month (dates_df_test[0][0] will be first month of first year, while dates_df_test[1][11] is the last month of the next year and so on)

In [None]:
def input_size_func(df, window=None):
    """Cut `df` into overlapping windows of `window` consecutive dates.

    Parameters
    ----------
    df : DataFrame with a "datadate" column.
    window : number of distinct dates per window. Defaults to the module-level
        `input_size` when omitted.

    Returns
    -------
    list of DataFrames. Entry k holds every row whose date lies between the
    k-th and the (k + window - 1)-th distinct date (both inclusive), with a
    fresh 0..n-1 index. The list is empty when there are fewer than `window`
    distinct dates.
    """
    if window is None:
        window = input_size
    if window < 1:
        raise ValueError("window must be at least 1")

    unique_dates = np.unique(df["datadate"]) # Sorted distinct dates
    dates_df = []
    # Start at window-1: earlier dates do not have a full window of history behind them.
    # No exception is swallowed here - a silently skipped window would shift every later index.
    for last in range(window - 1, len(unique_dates)):
        first_date = unique_dates[last - window + 1]
        last_date = unique_dates[last]
        in_window = (df["datadate"] >= first_date) & (df["datadate"] <= last_date)
        dates_df.append(df[in_window].reset_index(drop=True))
    return dates_df

def input_size_func_test(dictionary,test_set,years=None):
    """Build the windowed test data, one list of windows per year.

    Parameters
    ----------
    dictionary : mapping year -> DataFrame with a "Ticker" column (that year's constituents).
    test_set : DataFrame with "tic" and "datadate" columns.
    years : iterable of years to process. Defaults to 2019..2022, the value that
        used to be hard-coded; pass the years of the period being analysed.

    Returns
    -------
    list with one entry per year, each being the list returned by
    `input_size_func` for that year's constituents.
    """
    if years is None:
        years = np.arange(2019,2023)
    input_size_test = []
    for year in years:
        constituents = np.unique(dictionary[year]["Ticker"])
        temp = test_set[test_set["tic"].isin(constituents)] # Only this year's constituents
        # From February of the previous year up to (not including) the start of the next year
        starts_in_range = temp["datadate"]>=str(year-1)+"-02-01"
        ends_in_range = temp["datadate"]<str(year+1)
        temp_test = temp[starts_in_range & ends_in_range]
        input_size_test.append(input_size_func(temp_test)) # One list of sliding windows per year
    return input_size_test
        
# Sliding windows for the train set (flat list, one entry per month)
dates_df_train = input_size_func(train)    
# Sliding windows for the test set and its portfolio companion (nested: [year][month])
dates_df_test_dict = input_size_func_test(Nasdaq100,test)
dates_df_test_pf = input_size_func_test(Nasdaq100,test_pf)

We noticed rather late that the functions above do not guarantee that every firm contributes exactly _input_size_ (12) rows to a window. Some firms contribute fewer rows, which breaks later steps that rely on chunks of 12. As a quick fix, we created two functions that remove these inconsistencies.

In [None]:
def _keep_full_windows(frame, window):
    """Return `frame` without the rows that are not part of a complete chunk of `window` rows.

    Rows are grouped into runs of consecutive equal "tic" values. From a run of
    m rows the first window * (m // window) rows are kept and the rest dropped,
    so a firm with fewer than `window` rows disappears entirely. The result
    has a fresh 0..n-1 index.
    """
    if frame.empty:
        return frame
    tic = frame["tic"]
    run_id = tic.ne(tic.shift()).cumsum() # Increases by one every time the ticker changes
    runs = run_id.groupby(run_id)
    position_in_run = runs.cumcount()
    run_length = runs.transform("size")
    keep = position_in_run < (run_length // window) * window
    return frame.loc[keep.to_numpy()].reset_index(drop=True)


def remove_input_size_errors_train(dates_df_train, window=None):
    """Drop incomplete firm chunks from every window of the train set.

    `dates_df_train` is a list of DataFrames with a "tic" column, sorted so that
    each firm's rows are consecutive. The list is updated in place and also
    returned. `window` defaults to the module-level `input_size`.

    This replaces the earlier drop-one-row-and-rescan loop with a single pass
    per frame; the rows that are kept are the same.
    """
    if window is None:
        window = input_size
    if window < 1:
        raise ValueError("window must be at least 1")
    for position, frame in enumerate(dates_df_train):
        dates_df_train[position] = _keep_full_windows(frame, window)
    return dates_df_train


def remove_input_size_errors_test(dates_df_test, window=None):
    """Drop incomplete firm chunks from every window of the nested test set.

    `dates_df_test` is a list (one entry per year) of lists (one entry per
    month) of DataFrames. Every frame that is present is cleaned; unlike the
    earlier version this does not assume each year holds exactly 12 frames.
    The nested lists are updated in place and the outer list is returned.
    `window` defaults to the module-level `input_size`.
    """
    if window is None:
        window = input_size
    if window < 1:
        raise ValueError("window must be at least 1")
    for monthly_frames in dates_df_test:
        for month, frame in enumerate(monthly_frames):
            monthly_frames[month] = _keep_full_windows(frame, window)
    return dates_df_test

# Remove incomplete chunks from the train windows and from both nested test structures
dates_df_train = remove_input_size_errors_train(dates_df_train)
dates_df_test_dict = remove_input_size_errors_test(dates_df_test_dict)
dates_df_test_pf = remove_input_size_errors_test(dates_df_test_pf)

The next functions split the datasets into features and target while keeping individual firms separated. We want the data arranged so that 12 months of features correspond to the last month's target. Example: we save the features for one firm from Jan to Dec, and save the target for Dec.

Again, bottom function is a level deeper.

In [None]:
def split(dictionary, window=None):
    """Split every window frame into per-firm feature blocks and targets.

    Parameters
    ----------
    dictionary : sequence (or mapping with keys 0..n-1) of DataFrames with a
        "tic" column, in which each firm's rows are consecutive. Columns
        2..-2 are taken as features and the last column as the target.
    window : number of rows per feature block. Defaults to the module-level
        `input_size`.

    Returns
    -------
    (X, y) : two dicts keyed by position. X[i] is a list with one DataFrame per
        firm (the last `window` rows of that firm's run), y[i] the list of
        targets taken from the final row of each run.
    """
    if window is None:
        window = input_size
    X = {}
    y = {}
    for i in range(len(dictionary)):
        frame = dictionary[i]
        tic = frame["tic"].to_numpy()
        if len(tic) == 0:
            X[i], y[i] = [], []
            continue
        # A firm's run ends wherever the next ticker differs, and at the last row of the frame
        run_ends = np.flatnonzero(tic[1:] != tic[:-1]).tolist() + [len(tic) - 1]
        # The start is clamped at 0 so a short first run cannot wrap around to the end of the frame
        X[i] = [frame.iloc[max(end - window + 1, 0):end + 1, 2:-1] for end in run_ends]
        y[i] = [frame.iloc[end, -1] for end in run_ends]
    return X,y

def split_test(test_dict):
    """Apply `split` to every year of the nested test set.

    Returns two dicts keyed by year position: the features and the targets
    that `split` produced for that year.
    """
    per_year = [split(test_dict[position]) for position in range(len(test_dict))]
    X_temp = {position: features for position, (features, _) in enumerate(per_year)}
    y_temp = {position: targets for position, (_, targets) in enumerate(per_year)}
    return X_temp,y_temp

# Features/targets for the train windows. The test split is disabled for this run;
# re-enable the line below when a test set exists for the selected period.
X_train, y_train = split(dates_df_train)
# X_test, y_test = split_test(dates_df_test_dict)

In [None]:
# Persist the prepared training data; the context managers make sure the files are
# flushed and closed. Remember to change the file names when preparing another period.
with open("save_X_1990.p", "wb") as fh:
    pickle.dump(X_train, fh)
with open("save_y_1990.p", "wb") as fh:
    pickle.dump(y_train, fh)
# pickle.dump( X_test, open( "save_X_test_19_22.p", "wb" ) )
# pickle.dump( y_test, open( "save_y_test_19_22.p", "wb" ) )
# pickle.dump( dates_df_test_pf, open( "dates_df_test_pf_19_22.p", "wb" ) )

# Run code from here 
We are now done with data manipulation, and ready to run the model

In [3]:
# If the notebook is not run from the beginning but pickled data is loaded instead,
# run this cell to assign the variables the model code reads.
input_size = 12 # Time steps (months) per sample
n_features = 31 # Features per time step

In [None]:
# Create RNN model that we use in project
def create_model_rnn(neurons,learning_rate,drop_out,hidden):
    """Build and compile the SimpleRNN regression model used in the project.

    Parameters
    ----------
    neurons : units in every recurrent layer.
    learning_rate : Adam learning rate.
    drop_out : dropout rate applied after the first recurrent layer.
    hidden : number of extra sequence-returning recurrent layers between the
        first and the last recurrent layer. Earlier only 1, 2 or 3 had an
        effect and any other value silently added none; now any non-negative
        count is honoured.

    Returns
    -------
    (model, callbacks_list) : the compiled model (MSE loss) and the callbacks
        to pass to `fit`.
    """
    # Stop when the training loss has not improved for 50 epochs
    early = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=50)
    #checkpoint = tf.keras.callbacks.ModelCheckpoint("weights.best.hdf5",monitor='val_loss', verbose=0, save_best_only=True, mode='min')
    callbacks_list = [early] #,checkpoint

    model = Sequential()
    # Only the first layer needs input_shape: (time steps, features)
    model.add(SimpleRNN(units = neurons, activation = 'relu', input_shape=(input_size, n_features), return_sequences=True))
    model.add(Dropout(rate=drop_out))

    # Variable number of hidden recurrent layers, used in the grid search
    for _ in range(int(hidden)):
        model.add(SimpleRNN(units = neurons, activation = 'relu', return_sequences=True))

    # Last recurrent layer returns only its final state, which feeds the output layer
    model.add(SimpleRNN(units = neurons, activation = 'relu', return_sequences=False ))
    model.add(Dense(units = 1)) #Linear output layer
    # Use the tf.keras optimizer so it matches the tf.keras model; `learning_rate`
    # replaces the deprecated `lr` keyword.
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=1.)
    model.compile(optimizer = opt, loss = "mse")
    return model,callbacks_list

In [None]:
# Hyperparameter grid: every combination of the list entries below is trained
neurons = [50]
epochs = [50]
learning_rate = [0.0005,0.001] 
drop_out = [0.05,0.01]
batch_size = [32,64]
hidden = [1,2]
# NOTE(review): val_size and loss are not read by create_model_rnn / find_optimal_model
# as written - those hard-code a 0.2 validation split and "mse". Keep them in sync by hand.
val_size = 0.2
loss = "mse"

Below, we do the grid search. We run a model for every combination of hyperparameters given above and sort the different models on their validation mse.

The bottom function makes it possible to run the big grid on the first month only and then reuse that month's optimal hyperparameters for the following months (the next 2 in our runs).

In [None]:
def find_optimal_model(X_train,y_train,starting_point,end_point,epochs,neurons,learning_rate,batch_size,hidden,drop_out):
    """Grid-search the RNN for every month index in [starting_point, end_point].

    For each month, one model is trained per combination of the hyperparameter
    lists. The last 20% of the samples is used as the validation split.

    Parameters
    ----------
    X_train, y_train : per-month features and targets, indexed by month.
    starting_point, end_point : first and last month index (both inclusive).
    epochs, neurons, learning_rate, batch_size, hidden, drop_out : lists of
        candidate values.

    Returns
    -------
    (optimal, starting_point) : `optimal` is a list with one DataFrame per
        month, each row holding the train MSE (computed on all samples), the
        validation MSE, the hyperparameters, the fitted model and the loss
        history.
    """
    import itertools

    # Show Keras progress only when exactly one model is trained. The previous
    # check chained `==` with `&`, which binds tighter than `==`, so it was
    # effectively always False.
    grids = (neurons, epochs, learning_rate, drop_out, batch_size, hidden)
    single_model = all(len(grid) == 1 for grid in grids) and starting_point == end_point
    verbose = 1 if single_model else 0

    optimal = []
    for i in tqdm_notebook(range(starting_point,end_point+1)):
        print(i)
        print("---")
        # Convert once per month instead of once per use
        features = np.array(X_train[i])
        targets = np.array(y_train[i])
        # Keras takes the validation split from the end of the data, so the same tail is scored here
        val_start = int(len(features)*0.8)
        temp = []
        combinations = itertools.product(neurons, epochs, learning_rate, drop_out, batch_size, hidden)
        for n_units, n_epochs, rate, dropout, batch, n_hidden in combinations:
            print("Neurons:",n_units,". Epochs:",n_epochs,". Learning rate:",rate,". Dropout:",dropout,". Batch size:",batch, ". Hidden layers:", n_hidden)
            # Create model using function above
            model,callbacks_list = create_model_rnn(n_units,rate,dropout,n_hidden)
            # Fit model on training data, save loss history
            hist = model.fit(features,targets,epochs=n_epochs, verbose=verbose,callbacks=callbacks_list,validation_split=0.2,batch_size = batch)
            # Predictions on all samples and on the validation tail
            pred_train = model.predict(features)
            pred_val = model.predict(features[val_start:])
            temp.append([mean_squared_error(targets,pred_train),mean_squared_error(targets[val_start:],pred_val),n_units,n_epochs,rate,dropout,batch,n_hidden,model,hist.history])
        df_temp = pd.DataFrame(data = temp,columns = ("train mse","val mse","neurons","epochs","learning_rate","dropout","batch size","hidden layers","model_save","loss_hist"))
        optimal.append(df_temp)
    return optimal, starting_point

def run_multiple(X_train,y_train,starting_point,end_point,epochs,neurons,learning_rate,batch_size,hidden,drop_out):
    """Run the full grid on the first month, then reuse its best setting for the rest.

    The grid search is run for `starting_point` only. The first row with the
    lowest validation MSE supplies the hyperparameters that are then used,
    as single-element grids, for months starting_point+1 .. end_point.

    Returns (list of per-month result frames, starting_point, end_point).
    """
    first_month, index_start = find_optimal_model(
        X_train, y_train, starting_point, starting_point,
        epochs, neurons, learning_rate, batch_size, hidden, drop_out,
    )
    grid_results = first_month[0]
    lowest_val_mse = grid_results["val mse"].min()
    temp_hyper = grid_results[grid_results["val mse"] == lowest_val_mse]
    best = temp_hyper.iloc[0]
    later_months, _ = find_optimal_model(
        X_train, y_train, starting_point + 1, end_point,
        [best["epochs"]], [best["neurons"]], [best["learning_rate"]],
        [best["batch size"]], [best["hidden layers"]], [best["dropout"]],
    )
    return first_month + later_months, index_start, end_point

In [None]:
# Run above functions to find models, specify start and end index (month positions in X_train / y_train)
start = 78
ending = 83 
# First batch: months start..start+2 (grid on `start`, best setting reused for the next two)
optimal1,index_start,index_end = run_multiple(X_train,y_train,start,start+2,epochs,neurons,learning_rate,batch_size,hidden,drop_out)
# Second batch: months index_end+1..ending. Note that index_start is overwritten here; `slut` is not used afterwards
optimal2,index_start,slut = run_multiple(X_train,y_train,index_end+1,ending,epochs,neurons,learning_rate,batch_size,hidden,drop_out)
optimal = optimal1+optimal2

In [None]:
# Find every optimal model and save in dataframe along with hyperparameters used
def choose_optimal(optimal,index_start):
    """Collect the best row(s) of every month's grid-search result.

    Parameters
    ----------
    optimal : list of DataFrames with a "val mse" column, one per month.
    index_start : month index of the first entry; only used for the progress
        print of (year, month), which now rolls over into the next year.

    Returns
    -------
    DataFrame with, for each month in order, the row(s) with the lowest
    validation MSE (ties are all kept), re-indexed from 0. Empty when
    `optimal` is empty.
    """
    best_rows = []
    for offset, grid_results in enumerate(optimal):
        lowest_val_mse = grid_results["val mse"].min()
        best_rows.append(grid_results[grid_results["val mse"] == lowest_val_mse])
        year, month = divmod(index_start + offset, 12)
        print(year,month)
    if not best_rows:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    return pd.concat(best_rows).reset_index(drop=True)

# Best model per month for the months that were just searched (first month index: start)
optimal_df = choose_optimal(optimal,start)

In [None]:
optimal_df

# Pickle data here when done running

In [None]:
# The "model_save" column is dropped before pickling; everything else is stored.
# The context manager makes sure the file is flushed and closed.
with open("optimal_df_1997_2-2.p", "wb") as fh: # Remember to change name
    pickle.dump(optimal_df.drop(["model_save"],axis=1), fh)

# Load in and gather saved data

In [2]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Result frames saved per half-year run (suffix 1-2 = first part, 2-2 = second part)
y1997_ = _load_pickle("optimal_df_1997_2-2.p")
y1998 = _load_pickle("optimal_df_1998_1-2.p")
y1998_ = _load_pickle("optimal_df_1998_2-2.p")
y1999 = _load_pickle("optimal_df_1999_1-2.p")
y1999_ = _load_pickle("optimal_df_1999_2-2.p")
y2000 = _load_pickle("optimal_df_2000_1-2.p")
y2000_ = _load_pickle("optimal_df_2000_2-2.p")
y2001 = _load_pickle("optimal_df_2001_1-2.p")
y2001_ = _load_pickle("optimal_df_2001_2-2.p")
y2002 = _load_pickle("optimal_df_2002_1-2.p")
y2002_ = _load_pickle("optimal_df_2002_2-2.p")
y2003 = _load_pickle("optimal_df_2003_1-2.p")
y2003_ = _load_pickle("optimal_df_2003_2-2.p")
y2004 = _load_pickle("optimal_df_2004_1-2.p")
y2004_ = _load_pickle("optimal_df_2004_2-2.p")


# Stack them in chronological order and store the combined frame
optimal_df_97_04 = pd.concat([y1997_,y1998,y1998_,y1999,y1999_,y2000,y2000_,y2001,y2001_,y2002,y2002_,y2003,y2003_,y2004,y2004_]).reset_index(drop=True)
with open("optimal_df_97-04.p", "wb") as fh:
    pickle.dump(optimal_df_97_04, fh)

In [3]:
# Previously combined results for 1991-1997.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open("optimal_df_91-97.p", "rb") as fh:
    optimal_df_91_97 = pickle.load(fh)

In [4]:
# Combine both pre-2005 result frames and store them.
# NOTE(review): the 1997-2004 frame is placed before the 1991-1997 frame and the index is
# not reset, so rows are not in chronological order and index labels repeat - confirm this
# is what downstream code expects.
optimal_df_pre = pd.concat([optimal_df_97_04,optimal_df_91_97])
with open("optimal_df_91-04.p", "wb") as fh:
    pickle.dump(optimal_df_pre, fh)

In [3]:
# Stored results loaded from "2005_FINAL.p".
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open("2005_FINAL.p", "rb") as fh:
    optimal_df = pickle.load(fh)
#pickle.dump( optimal_df, open( "optimal_df_05-11.p", "wb" ) )

In [4]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Result frames saved per half-year run. These file names carry no ".p" extension,
# unlike the other periods - presumably that is how they were saved; TODO confirm.
y2012 = _load_pickle("optimal_df_2012_1-2")
y2012_ = _load_pickle("optimal_df_2012_2-2")
y2013 = _load_pickle("optimal_df_2013_1-2")
y2013_ = _load_pickle("optimal_df_2013_2-2")
y2014 = _load_pickle("optimal_df_2014_1-2")
y2014_ = _load_pickle("optimal_df_2014_2-2")
y2015 = _load_pickle("optimal_df_2015_1-2")
y2015_ = _load_pickle("optimal_df_2015_2-2")
y2016 = _load_pickle("optimal_df_2016_1-2")
y2016_ = _load_pickle("optimal_df_2016_2-2")
y2017 = _load_pickle("optimal_df_2017_1-2")
y2017_ = _load_pickle("optimal_df_2017_2-2")
y2018 = _load_pickle("optimal_df_2018_1-2")
y2018_ = _load_pickle("optimal_df_2018_2-2")


# Stack them in chronological order
optimal_df = pd.concat([y2012,y2012_,y2013,y2013_,y2014,y2014_,y2015,y2015_,y2016,y2016_,y2017,y2017_,y2018,y2018_]).reset_index(drop=True)
#pickle.dump( optimal_df, open( "optimal_df_12-18.p", "wb" ) )

In [7]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Result frames saved per half-year run (file names carry no ".p" extension)
y2019 = _load_pickle("optimal_df_2019_1-2")
y2019_ = _load_pickle("optimal_df_2019_2-2")
y2020 = _load_pickle("optimal_df_2020_1-2")
y2020_ = _load_pickle("optimal_df_2020_2-2")
y2021 = _load_pickle("optimal_df_2021_1-2")
y2021_ = _load_pickle("optimal_df_2021_2-2")
y2022 = _load_pickle("optimal_df_2022_1-2")
y2022_ = _load_pickle("optimal_df_2022_2-2")


# Stack them in chronological order and store the combined frame
optimal_df = pd.concat([y2019,y2019_,y2020,y2020_,y2021,y2021_,y2022,y2022_]).reset_index(drop=True)
with open("optimal_df_19-22.p", "wb") as fh:
    pickle.dump(optimal_df, fh)

# Run portfolio

We are now ready for the portfolio stuff. 

First function is what goes through every month and manages portfolios.

In [8]:
def optimal_hyper(optimal_df,X_test,y_test,dates_df_test_pf, frequency, nasdaq_period=2):
    """Walk through every month of `optimal_df` and manage the portfolios.

    Parameters
    ----------
    optimal_df : DataFrame with one row per month; the "pred" entry of each
        row is passed to `select_stocks`.
    X_test : not used by this function (kept for the existing call signature).
    y_test : true log-returns, indexed as y_test[year][month].
    dates_df_test_pf : portfolio frames, indexed as [year][month].
    frequency : rebalancing is allowed in every month whose index is a
        multiple of `frequency` (1 = every month, 3 = every third month, ...).
    nasdaq_period : index into `Nasdaq_avg_ret` selecting the average index
        return that is passed to `make_pf`. Defaults to 2, the value that used
        to be hard-coded; pass the index of the period being analysed.

    Returns
    -------
    (df, portfolio_stocks, returns_list, save_i) : monthly returns of the four
        portfolios as a DataFrame, the per-month stock tables, the same returns
        as a list of lists, and the month indices where rebalancing was allowed.
    """
    df = pd.DataFrame(columns = ("ls_ret_eq","ls_ret_mc","bh_ret_eq","bh_ret_mc")) # Returns from all portfolios each month

    bh_ret_eq,bh_ret_mc = 0,0 # No return from buy/hold in first period
    bh_value_eq,bh_value_mc = 100,100 # Start with $100
    bh_pf_eq,bh_pf_mc = pd.DataFrame(),pd.DataFrame() # No portfolios in first period
    portfolio_stocks = {} # Stock table per month
    returns_list = [] # Returns from every portfolio for every month
    save_i = [] # Indexes of the months where rebalancing is allowed

    for i in range(len(optimal_df)):
        if i % frequency == 0:
            rebal = 1
            save_i.append(i)
        else:
            # Not a rebalancing month: pass a large sentinel return and rebal = 0 to make_pf
            bh_ret_eq = 999
            bh_ret_mc = 999
            rebal = 0

        temp = optimal_df.iloc[i] # Current month's predictions etc.

        # Position i maps to year i // 12 and month i % 12 of the nested test structures
        year, month = divmod(i, 12)

        # Tickers, predictions, true returns, and market cap for this month
        portfolio_stocks[i] = select_stocks(temp["pred"],np.array(y_test[year][month]),dates_df_test_pf,year,month)
        # make_pf returns the long/short returns plus the stocks, returns and values of both buy/hold portfolios
        ls_ret_eq, ls_ret_mc, bh_pf_eq, bh_pf_mc, bh_ret_eq, bh_ret_mc, bh_value_eq, bh_value_mc = make_pf(portfolio_stocks[i],bh_ret_eq,bh_ret_mc,bh_value_eq,bh_value_mc,bh_pf_eq,bh_pf_mc,Nasdaq_avg_ret[nasdaq_period],rebal )

        df.loc[len(df)] = [ls_ret_eq,ls_ret_mc,bh_ret_eq,bh_ret_mc] # Save returns in df
        returns_list.append([ls_ret_eq,ls_ret_mc,bh_ret_eq,bh_ret_mc]) # Save returns in list

    df = df.reset_index(drop=True)
    return df, portfolio_stocks, returns_list, save_i

In [9]:
def select_stocks(pred_ret,true_ret,dates_df_test_pf,year,month):
    """Collect ticker, predicted return, true return and market cap for one month.

    Parameters
    ----------
    pred_ret : array-like
        Predicted log-returns; assigned positionally to the selected rows.
    true_ret : array-like
        Realised log-returns; assigned positionally to the selected rows.
    dates_df_test_pf : nested container of DataFrames
        ``dates_df_test_pf[year][month]`` must be a DataFrame with the
        columns ``"tic"`` and ``"mkt cap"``.
    year, month : int
        Keys/positions used to pick the month's frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``"tic"``, ``"pred"``, ``"true"``, ``"market cap"``, sorted by
        ``"pred"`` from highest to lowest. The index labels are those of the
        selected source rows.

    Notes
    -----
    Reads the module-level ``input_size``: starting at row 2, every
    ``input_size``-th row is taken (presumably one row per firm, since each
    firm seems to occupy ``input_size`` consecutive rows - verify against the
    code that builds ``dates_df_test_pf``).
    """
    month_rows = dates_df_test_pf[year][month]
    # One strided positional slice picks rows 2, 2+input_size, ... in a single
    # step. This replaces a row-by-row ``DataFrame.append`` loop, which was
    # quadratic and no longer exists in pandas >= 2.0.
    tic_cap = month_rows.iloc[2::input_size]

    df = pd.DataFrame(columns = ("tic","pred","true","market cap"))
    df["tic"] = tic_cap["tic"]
    df["pred"] = np.exp(pred_ret)-1 # Transform log-returns to simple returns
    df["true"] = np.exp(true_ret)-1 # Transform log-returns to simple returns
    df["market cap"] = tic_cap["mkt cap"]
    # Highest predicted return first
    return df.sort_values("pred",ascending=False)

In [10]:
def _long_short_returns(df):
    """Return (equal-weighted, market-cap-weighted) long/short return for one month."""
    n_stocks = len(df)
    # Keep the first and the last 10% of the rows (df is expected to be sorted
    # by predicted return, best first): the first rows are the long leg, the
    # last rows the short leg.
    ls_pf = df.drop(df.index[int(n_stocks*0.1):int(n_stocks*0.9)]).reset_index(drop=True)

    # Zero-net investment: long and short the same number of stocks, so drop
    # the middle row when the count is odd.
    if len(ls_pf) % 2 != 0:
        ls_pf = ls_pf.drop(len(ls_pf)//2).reset_index(drop=True)

    half = len(ls_pf)//2
    is_long = np.arange(len(ls_pf)) < half
    side = np.where(is_long, 1.0, -1.0)  # +1 for long rows, -1 for short rows

    # The weighted columns are built in one assignment each. Writing through
    # ``ls_pf[col].iloc[...] = ...`` is chained assignment, which does not
    # modify ls_pf when pandas Copy-on-Write is active.

    # Equal weight: each position gets 1/(number of positions per leg).
    ls_pf["equal"] = ls_pf["true"]*1/(len(ls_pf)/2)*side
    ls_ret_eq = np.sum(ls_pf["equal"])

    # Market cap weight: each position gets mkt_cap/(total mkt_cap of its leg).
    cap = ls_pf["market cap"]
    leg_cap = np.where(is_long, sum(cap.iloc[:half]), sum(cap.iloc[half:]))
    ls_pf["mkt_cap"] = ls_pf["true"]*side*cap/leg_cap
    ls_ret_mc = np.sum(ls_pf["mkt_cap"])

    return ls_ret_eq, ls_ret_mc


def _buy_hold_step(df, prev_ret, value, holdings, threshold, rebal, value_col, cap_weighted):
    """Advance one buy/hold portfolio by one month.

    Returns ``(holdings, month_return, value)``. ``value_col`` is the name of
    the column that receives each position's end-of-month value;
    ``cap_weighted`` selects market-cap weights when the portfolio is rebuilt.
    """
    if prev_ret >= threshold:
        # Previous return was good enough (or rebalancing is blocked by the
        # caller passing a very large prev_ret): keep the current holdings.
        if holdings.empty:
            return holdings, 0, value  # Holding cash: this month's return is 0
        if len(df[df["pred"]>0])<1 and rebal == 1:
            # Every prediction is non-positive and we are allowed to act:
            # sit this month out in cash (holdings are kept as they are).
            return holdings, 0, value
        # Keep only stocks that still exist this month and attach this month's
        # true returns. Both sides are sorted by ticker so the positional
        # list assignment lines up.
        tics = np.unique(df["tic"])
        holdings = holdings[holdings["tic"].isin(tics)].sort_values("tic")
        holdings["true"] = df[df["tic"].isin(holdings["tic"].unique())].sort_values("tic")["true"].tolist()
        # NOTE(review): this branch weights the holdings equally for BOTH
        # portfolios, including the market-cap one - the behaviour of the
        # original code is kept. Confirm whether market-cap weights were
        # intended here for the market-cap portfolio.
        holdings[value_col] = (holdings["true"]+1)*value/len(holdings)
    elif prev_ret < threshold:
        # Previous return fell below the threshold: rebalance by buying every
        # stock with a positive prediction.
        holdings = df[df["pred"]>0].copy()
        if holdings.empty:
            return holdings, 0, value  # No positive prediction: go to cash
        if cap_weighted:
            holdings[value_col] = (holdings["true"]+1)*value*holdings["market cap"]/sum(holdings["market cap"])
        else:
            holdings[value_col] = (holdings["true"]+1)*value/len(holdings)
    else:
        # Neither comparison holds (e.g. prev_ret is NaN): leave everything untouched.
        return holdings, prev_ret, value

    new_value = sum(holdings[value_col])      # Updated total portfolio value
    month_ret = (new_value/value)-1           # Total return for the month
    return holdings, month_ret, new_value


def make_pf(df,bh_ret_eq,bh_ret_mc,bh_value_eq,bh_value_mc,bh_pf_eq,bh_pf_mc,Nasdaq_avg_ret,rebal):
    """Compute one month of long/short and buy/hold portfolio returns.

    Parameters
    ----------
    df : pandas.DataFrame
        This month's stocks with columns ``"tic"``, ``"pred"``, ``"true"`` and
        ``"market cap"``, sorted by ``"pred"`` from highest to lowest.
    bh_ret_eq, bh_ret_mc : float
        Previous month's return of the equal / market-cap buy/hold portfolio.
        A value at or above ``Nasdaq_avg_ret`` keeps the holdings, a lower
        value triggers a rebalance.
    bh_value_eq, bh_value_mc : float
        Current value of the equal / market-cap buy/hold portfolio.
    bh_pf_eq, bh_pf_mc : pandas.DataFrame
        Current holdings of the two buy/hold portfolios (empty = cash).
    Nasdaq_avg_ret : float
        Return threshold that decides between holding and rebalancing.
    rebal : int
        1 when rebalancing is allowed this month; only then may a held
        portfolio skip a month with no positive predictions.

    Returns
    -------
    tuple
        ``(ls_ret_eq, ls_ret_mc, bh_pf_eq, bh_pf_mc, bh_ret_eq, bh_ret_mc,
        bh_value_eq, bh_value_mc)`` - the long/short returns followed by the
        updated buy/hold holdings, returns and values.
    """
    # Long/short: long the top 10% and short the bottom 10% of the predictions
    ls_ret_eq, ls_ret_mc = _long_short_returns(df)

    # Buy/hold, equal weight
    bh_pf_eq, bh_ret_eq, bh_value_eq = _buy_hold_step(
        df, bh_ret_eq, bh_value_eq, bh_pf_eq, Nasdaq_avg_ret, rebal,
        value_col="equal", cap_weighted=False)

    # Buy/hold, market cap weight
    bh_pf_mc, bh_ret_mc, bh_value_mc = _buy_hold_step(
        df, bh_ret_mc, bh_value_mc, bh_pf_mc, Nasdaq_avg_ret, rebal,
        value_col="mkt_cap_pf", cap_weighted=True)

    # Returns from all portfolios, stocks in buy/hold and value of buy/hold
    return ls_ret_eq, ls_ret_mc, bh_pf_eq, bh_pf_mc, bh_ret_eq, bh_ret_mc, bh_value_eq, bh_value_mc

In [15]:
# Run the back-test on the loaded test data; the last argument (1) is the
# rebalancing frequency handed to optimal_hyper.
optimal_new, portfolio_stocks, returns_list, save_i = optimal_hyper(
    optimal_df,
    X_test,
    y_test,
    dates_df_test_pf,
    1,
)

In [13]:
# Print the index of every month in which the first 15 rows of that month's
# stock table all have a positive true return.
testing = []
for i in range(len(portfolio_stocks)):
    first_rows_true = portfolio_stocks[i]["true"].head(15)
    if (first_rows_true > 0).all():
        print(i)

21
50


In [12]:
# Display the first 60 rows of the stock table stored under index 50
portfolio_stocks[50].iloc[:60]

Unnamed: 0,tic,pred,true,market cap
794,WYNN,0.374522,0.964447,9097696000.0
722,STX,0.291945,0.357737,9272062000.0
302,FLEX,0.276603,0.342561,7861784000.0
494,LOGI,0.25053,0.29572,4809501000.0
710,STLD,0.222925,0.413167,7379971000.0
38,ADSK,0.222637,0.186199,7571682000.0
98,BB,0.222331,0.612155,65980460000.0
446,ISRG,0.197137,0.507236,10426860000.0
86,AMZN,0.188372,0.096405,30628470000.0
650,QRTEA,0.181774,0.827586,8329880000.0


# Table for results

In [17]:
# Create function that can make table of returns, volatility and Sharpe ratio for period
def df_results(pred_period,optimal_new):
    """Build a summary table of return, volatility and Sharpe ratio for one period.

    Parameters
    ----------
    pred_period : int
        0 for 2005-2011, 1 for 2012-2018, 2 for 2019-2022.
    optimal_new : pandas.DataFrame
        Monthly portfolio returns. The first two columns are treated as the
        long/short portfolios (eq, mc) and the remaining columns as the
        buy/hold portfolios (eq, mc).

    Returns
    -------
    pandas.DataFrame
        Rows "Total return in period", "Average annual return",
        "Annual volatility" and "Sharpe ratio"; one column for the Nasdaq 100
        and one per portfolio.

    Raises
    ------
    ValueError
        If ``pred_period`` is not 0, 1 or 2.

    Notes
    -----
    Reads the risk-free rate from ``DGS10.csv`` in the working directory and
    the index levels from the module-level ``Nasdaq100_index``. Total returns
    are sums of simple per-step returns, as everywhere else in this notebook.
    """
    # (Nasdaq start, Nasdaq end, risk-free start, risk-free end) per period.
    # DATE columns are compared as strings, so the bounds are strings too.
    period_bounds = {
        0: ("2005-01-31", "2012-01-01", "2005", "2012"),
        1: ("2012-01-31", "2019-01-01", "2012", "2019"),
        2: ("2019-01-31", "2023-01-01", "2019", "2023"),
    }
    if pred_period not in period_bounds:
        # Fail early with a clear message instead of hitting an undefined
        # variable further down.
        raise ValueError(f"pred_period must be 0, 1 or 2, got {pred_period!r}")
    nas_start, nas_end, rf_start, rf_end = period_bounds[pred_period]

    # Average risk-free rate over the period
    rf = pd.read_csv("DGS10.csv")
    rf = rf[(rf["DATE"]>rf_start)&(rf["DATE"]<rf_end)]["DGS10"].mean()

    # Nasdaq levels for the period; "." marks a missing observation and is
    # skipped, the remaining strings are converted to float.
    in_period = (Nasdaq100_index["DATE"]>=nas_start) & (Nasdaq100_index["DATE"]<nas_end)
    plot_nas = [float(x) for x in Nasdaq100_index[in_period]["NASDAQ100"].tolist() if x != '.']

    n_years = len(optimal_new)/12  # optimal_new holds one row per month

    # Nasdaq statistics
    ret_nas = [cur/prev-1 for prev, cur in zip(plot_nas, plot_nas[1:])]
    Nasdaq_ret = np.cumsum(ret_nas)[-1] # Total Nasdaq return in period
    Nasdaq_avg_ret = Nasdaq_ret/n_years # Average yearly Nasdaq return
    vol = np.std(ret_nas)*np.sqrt(252) # Yearly volatility, annualised with 252 observations per year
    SR = (Nasdaq_avg_ret-rf)/vol # Nasdaq Sharpe ratio
    data = np.array([Nasdaq_ret,Nasdaq_avg_ret,vol,SR])
    df_tabular = pd.DataFrame(data=data,columns=["Nasdaq 100"],index=["Total return in period","Average annual return","Annual volatility","Sharpe ratio"])

    def _portfolio_stats(columns):
        # Total return, average annual return, annual volatility and Sharpe
        # ratio for each of the given monthly-return columns.
        totals = [np.cumsum(optimal_new[col]).tolist()[-1] for col in columns]
        averages = [total/n_years for total in totals]
        vols = [np.std(optimal_new[col])*np.sqrt(12) for col in columns]
        sharpes = (np.array(averages)-rf)/vols
        return totals, averages, vols, sharpes

    tot_ret_bh, avg_ann_ret_bh, ann_vol_bh, SR_bh = _portfolio_stats(optimal_new.columns[2:])
    tot_ret_ls, avg_ann_ret_ls, ann_vol_ls, SR_ls = _portfolio_stats(optimal_new.columns[:2])

    df_tabular["Buy/hold eq"] = [tot_ret_bh[0],avg_ann_ret_bh[0],ann_vol_bh[0],SR_bh[0]]
    df_tabular["Buy/hold mc"] = [tot_ret_bh[1],avg_ann_ret_bh[1],ann_vol_bh[1],SR_bh[1]]
    df_tabular["Long/short eq"] = [tot_ret_ls[0],avg_ann_ret_ls[0],ann_vol_ls[0],SR_ls[0]]
    df_tabular["Long/short mc"] = [tot_ret_ls[1],avg_ann_ret_ls[1],ann_vol_ls[1],SR_ls[1]]

    return df_tabular

# Summary table for prediction period 1
df_results(pred_period=1, optimal_new=optimal_new)
    

Unnamed: 0,Nasdaq 100,Buy/hold eq,Buy/hold mc,Long/short eq,Long/short mc
Total return in period,1.030968,1.398925,1.307196,2.050198,2.692237
Average annual return,0.147281,0.199846,0.186742,0.292885,0.384605
Annual volatility,0.160248,0.102875,0.111541,0.19817,0.311077
Sharpe ratio,0.778413,1.723497,1.472113,1.364202,1.163903


In [None]:
# For table with yearly long/short returns + Nasdaq

# Period bounds - change ONLY this pair to switch period. Both the Nasdaq
# slice and the list of years below are derived from it, so they cannot get
# out of sync.
period_start, period_end = "2005-01-31", "2012-01-01"
# period_start, period_end = "2012-01-31", "2019-01-01"
# period_start, period_end = "2019-01-31", "2023-01-01"

# Work on an explicit copy so the column assignments below really modify `a`
# (writing through a["NASDAQ100"].iloc[i] is chained assignment and is not
# guaranteed to update the frame).
a = Nasdaq100_index[(Nasdaq100_index["DATE"]>=period_start) & (Nasdaq100_index["DATE"]<period_end)].copy()

# "." marks a missing observation: carry the previous level forward and
# convert the remaining strings to float.
levels = a["NASDAQ100"].mask(a["NASDAQ100"] == ".")
a["NASDAQ100"] = pd.to_numeric(levels).ffill()
a["return"] = a["NASDAQ100"]/a["NASDAQ100"].shift(1)-1
a = a.dropna()

# Calendar years covered by the period (the end bound is exclusive)
years = np.arange(int(period_start[:4]), int(period_end[:4]))

# Nasdaq return per calendar year (sum of the per-step returns). DATE is
# compared as an ISO "YYYY-MM-DD" string.
total_sum = [
    a[(a["DATE"] >= f"{year}-01-01") & (a["DATE"] < f"{year + 1}-01-01")]["return"].sum()
    for year in years
]

# Long/short return per year: sum of each block of 12 monthly returns
ls_df = pd.DataFrame(index=years)
ls_eq_cum_ret = []
ls_mc_cum_ret = []
for first_month in range(0, len(optimal_new), 12):
    last_month = first_month + 11  # .loc slicing includes the end label
    ls_eq_cum_ret.append(optimal_new["ls_ret_eq"].loc[first_month:last_month].sum())
    ls_mc_cum_ret.append(optimal_new["ls_ret_mc"].loc[first_month:last_month].sum())
ls_df["Nasdaq 100"] = total_sum
ls_df["Long/short eq"] = ls_eq_cum_ret
ls_df["Long/short mc"] = ls_mc_cum_ret
ls_df.loc["Total"] = [sum(total_sum),sum(ls_eq_cum_ret),sum(ls_mc_cum_ret)]
ls_df

# Plots

In the following function, we plot the cumulative returns of the Nasdaq 100 index together with those of the chosen portfolio type (buy/hold or long/short) for three rebalancing frequencies (every 1, 3 and 6 months).

In [None]:
def compare_return_with_nasdaq(optimal_df,Nasdaq100_index,pred_period,pf_type):
    """Plot cumulative portfolio returns against the Nasdaq 100 index.

    The back-test is run three times, with rebalancing frequencies 1, 3 and
    6, and the equal-weighted and market-cap-weighted cumulative returns of
    each run are drawn on the current matplotlib figure together with the
    cumulative Nasdaq 100 return.

    Parameters
    ----------
    optimal_df : pandas.DataFrame
        Passed on unchanged to ``optimal_hyper``.
    Nasdaq100_index : pandas.DataFrame
        Index data with string columns ``"DATE"`` (``YYYY-MM-DD``) and
        ``"NASDAQ100"`` (``"."`` marks a missing observation).
    pred_period : int
        0 for 2005-2011, 1 for 2012-2018, 2 for 2019-2022.
    pf_type : str
        ``"ls"`` for the long/short portfolios, ``"bh"`` for buy/hold.

    Raises
    ------
    ValueError
        If ``pred_period`` or ``pf_type`` is not one of the values above. The
        check runs before the (expensive) back-tests.

    Notes
    -----
    Uses the module-level ``X_test``, ``y_test`` and ``dates_df_test_pf`` as
    inputs to ``optimal_hyper``. Nothing is returned; the function only draws.
    """
    # Per-period settings:
    #   range      - start/end of the month-end dates used as portfolio x-axis
    #   nas_start  - first Nasdaq date (inclusive)
    #   nas_end    - exclusive end for the index levels
    #   axis_end   - exclusive end for the Nasdaq x-axis dates
    #   recession  - (x-limits, y-low, y-high, ylim) of the shaded band, or
    #                None. The x-limits are plain day numbers; the original
    #                notes give them as 2007-12-01/2009-06-01 and
    #                2020-02-01/2020-04-01.
    period_settings = {
        0: {"range": ('2005-01-01', '2012-01-31'),
            "nas_start": "2005-01-31", "nas_end": "2012-02-01", "axis_end": "2012-01-31",
            "recession": ([13847,14395], -1.1, 5.1, (-0.35,2.15))},
        1: {"range": ('2012-01-01', '2019-01-31'),
            "nas_start": "2012-01-31", "nas_end": "2019-02-01", "axis_end": "2019-01-31",
            "recession": None},
        2: {"range": ('2019-01-01', '2023-02-01'),
            "nas_start": "2019-01-31", "nas_end": "2023-02-01", "axis_end": "2023-01-31",
            "recession": ([18292,18352], -0.25, 1.51, (-0.2,1.35))},
    }
    # Return columns (equal-weighted, market-cap-weighted) per portfolio type
    return_columns = {
        "ls": ("ls_ret_eq", "ls_ret_mc"),
        "bh": ("bh_ret_eq", "bh_ret_mc"),
    }
    # Line colours (equal, market cap) for the runs that show rebalance markers
    marker_colors = {3: ("forestgreen", "limegreen"), 6: ("tomato", "firebrick")}

    # Validate up front so a typo does not cost three back-test runs
    if pred_period not in period_settings:
        raise ValueError("WRONG INPUT PRED PERIOD")
    if pf_type not in return_columns:
        raise ValueError("WRONG INPUT PF TYPE")
    settings = period_settings[pred_period]
    eq_col, mc_col = return_columns[pf_type]

    # ---- Nasdaq 100 series: independent of the rebalancing frequency, so
    # ---- it is prepared once.
    index_dates = Nasdaq100_index["DATE"]
    from_start = index_dates >= settings["nas_start"]
    raw_levels = Nasdaq100_index[from_start & (index_dates < settings["nas_end"])]["NASDAQ100"].tolist()
    raw_dates = Nasdaq100_index[from_start & (index_dates < settings["axis_end"])]["DATE"].tolist()

    # Positions of missing observations ("."); they are removed from both the
    # levels and the x-axis dates so the two stay aligned.
    missing = {pos for pos, level in enumerate(raw_levels) if level == '.'}
    levels = [float(level) for level in raw_levels if level != '.']
    step_returns = [cur/prev-1 for prev, cur in zip(levels, levels[1:])]
    cum_ret_nas = np.cumsum(step_returns) # Cumulative (summed) Nasdaq returns
    X_axis = [
        datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
        for pos, date_str in enumerate(raw_dates)
        if pos not in missing
    ]

    # Month-end dates for the portfolio curves
    range_start, range_end = settings["range"]
    x_pf = np.array(pd.date_range(start=range_start, end=range_end, freq='M'))

    for k, rebalance in enumerate((1, 3, 6)):
        optimal_new, portfolio_stocks, returns_list, save_i = optimal_hyper(optimal_df,X_test,y_test,dates_df_test_pf,rebalance)

        # Cumulative returns start from 0, followed by the monthly returns
        plot_eq = np.cumsum([0] + optimal_new[eq_col].tolist())
        plot_mc = np.cumsum([0] + optimal_new[mc_col].tolist())

        # Recession band. It is drawn on every pass, exactly as before, so
        # the translucent band is layered once per rebalancing frequency.
        if settings["recession"] is not None:
            rec_x, rec_low, rec_high, y_limits = settings["recession"]
            plt.fill_between(rec_x, rec_low, rec_high, facecolor=(0,0,0,.05), edgecolor=(0,0,0,.2))
            plt.ylim(*y_limits)
        plt.style.use('default')

        # Do not mark rebalance points when there are many of them or when
        # looking at long/short
        if len(save_i)>40 or pf_type == "ls":
            marker_on = []
        else:
            marker_on = save_i

        if k == 0:
            # The index is drawn once, together with the first run
            plt.plot(X_axis,cum_ret_nas,label = "Nasdaq 100 index")
            plt.plot(x_pf,plot_eq,label = "Equal reb 1",color = "dimgrey")
            plt.plot(x_pf,plot_mc,"--",label = "Market cap reb 1",color = "black")
        else:
            eq_color, mc_color = marker_colors[rebalance]
            plt.plot(x_pf,plot_eq,"-o",markevery=marker_on,label = f"Equal reb {rebalance}",color = eq_color)
            plt.plot(x_pf,plot_mc,"--*",markevery=marker_on,label = f"Market cap reb {rebalance}",color = mc_color)

        plt.xlabel("Date",fontsize = 20)
        plt.ylabel("Cumulative returns",fontsize = 20)

        plt.grid()
        plt.legend(loc = 2, fontsize=15)
        
# Open a 15x10 figure and draw the buy/hold comparison for prediction period 2
plt.figure(figsize=(15, 10))
compare_return_with_nasdaq(
    optimal_df,
    Nasdaq100_index,
    pred_period=2,
    pf_type="bh",
)

In [None]:
# Prepare loss plots
# For every epoch, average the training and validation loss over every third
# entry of optimal_df_2019["loss_hist"].
N_EPOCHS = 100   # Number of epoch positions to look at
ENTRY_STEP = 3   # Only every third loss history is used

true_train = []
true_val = []
for j in range(N_EPOCHS):

    temp_1 = []
    temp_2 = []
    for i in range(0,len(optimal_df_2019),ENTRY_STEP):
        try:
            history = optimal_df_2019["loss_hist"][i]
            temp_1.append(history["loss"][j])
            temp_2.append(history["val_loss"][j])
        except (KeyError, IndexError, TypeError):
            # This entry has no value for epoch j (missing entry, shorter
            # history, or no history stored): leave it out of the average.
            # Only lookup failures are skipped; anything else (including
            # KeyboardInterrupt) is no longer swallowed.
            continue
    # np.mean of an empty list is nan, so epochs without any value become nan
    true_train.append(np.mean(temp_1))
    true_val.append(np.mean(temp_2))

plt.figure(figsize=(15,10))
# The first epoch is left out of the plot
plt.plot(true_train[1:],label = "training loss")
plt.plot(true_val[1:],label = "validation loss")
plt.ylim(0.035,0.062)
x_lab = [1,10,20,30,40,50]
x_ticks = [0,10,20,30,40,48]
plt.xticks(ticks=x_ticks, labels=x_lab)

plt.xlabel("Epochs",fontsize = 15)
plt.ylabel("Loss",fontsize = 15)
plt.grid()
plt.legend(loc=1,fontsize = 15)