 #   3.0 Selection and filtering of non-volatile stocks with Auto encoder

In [75]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

In [76]:
from numpy import array
from keras.models import Model
from keras.layers import Input
from keras.layers import LSTM
from keras.layers import Dense, Activation
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.utils import plot_model
from keras import regularizers, optimizers

from sklearn import preprocessing

In [77]:
# Load the close prices dataset
prices_data = pd.read_csv('datasets/df_close.csv')

In [78]:
prices_data.head()

Unnamed: 0,date,AAPL,AMZN,AVGO,COST,GOOG,GOOGL,HD,JPM,LLY,...,META,MSFT,NVDA,ORCL,PG,TSLA,UNH,V,WMT,XOM
0,2013-01-02 00:00:00+00:00,16.687344,12.8655,2.395675,80.408737,17.969599,18.054642,48.197495,32.313248,38.176548,...,27.915947,22.406956,0.293535,29.126825,49.248264,2.357333,45.537483,35.668819,18.031246,54.313057
1,2013-01-03 00:00:00+00:00,16.476713,12.924,2.408191,81.233017,17.980036,18.065126,48.060829,32.248131,38.338474,...,27.68664,22.106806,0.293766,28.807777,48.935963,2.318,43.40839,35.696354,17.916658,54.215107
2,2013-01-04 00:00:00+00:00,16.017763,12.9575,2.392731,80.971458,18.335327,18.422102,47.969711,32.819733,39.75729,...,28.673666,21.693052,0.303458,29.05966,49.035339,2.293333,43.491879,35.987892,17.984365,54.46611
3,2013-01-07 00:00:00+00:00,15.923532,13.423,2.379478,80.345329,18.255325,18.341721,47.711578,32.855892,39.703312,...,29.331686,21.652491,0.294689,28.908518,48.701778,2.289333,43.491879,36.244991,17.8125,53.835495
4,2013-01-08 00:00:00+00:00,15.966398,13.319,2.363281,80.194725,18.219299,18.305525,48.000088,32.921013,40.004036,...,28.972767,21.538912,0.288227,28.916927,48.623688,2.245333,42.915768,36.582462,17.861969,54.172237


In [79]:
data = prices_data.copy()

In [80]:
data = data.reset_index(drop=True).set_index(['date'])

In [81]:
data.head()

Unnamed: 0_level_0,AAPL,AMZN,AVGO,COST,GOOG,GOOGL,HD,JPM,LLY,MA,META,MSFT,NVDA,ORCL,PG,TSLA,UNH,V,WMT,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-01-02 00:00:00+00:00,16.687344,12.8655,2.395675,80.408737,17.969599,18.054642,48.197495,32.313248,38.176548,47.532619,27.915947,22.406956,0.293535,29.126825,49.248264,2.357333,45.537483,35.668819,18.031246,54.313057
2013-01-03 00:00:00+00:00,16.476713,12.924,2.408191,81.233017,17.980036,18.065126,48.060829,32.248131,38.338474,47.600609,27.68664,22.106806,0.293766,28.807777,48.935963,2.318,43.40839,35.696354,17.916658,54.215107
2013-01-04 00:00:00+00:00,16.017763,12.9575,2.392731,80.971458,18.335327,18.422102,47.969711,32.819733,39.75729,47.598743,28.673666,21.693052,0.303458,29.05966,49.035339,2.293333,43.491879,35.987892,17.984365,54.46611
2013-01-07 00:00:00+00:00,15.923532,13.423,2.379478,80.345329,18.255325,18.341721,47.711578,32.855892,39.703312,48.423645,29.331686,21.652491,0.294689,28.908518,48.701778,2.289333,43.491879,36.244991,17.8125,53.835495
2013-01-08 00:00:00+00:00,15.966398,13.319,2.363281,80.194725,18.219299,18.305525,48.000088,32.921013,40.004036,48.264256,28.972767,21.538912,0.288227,28.916927,48.623688,2.245333,42.915768,36.582462,17.861969,54.172237


In [82]:
def defineAutoencoder(num_stock, encoding_dim = 5, verbose=0):
    
    """
    Function for fitting an Autoencoder
    """

    # connect all layers
    input = Input(shape=(num_stock,))

    encoded = Dense(encoding_dim, kernel_regularizer=regularizers.l2(0.00001),name ='Encoder_Input')(input)

    decoded = Dense(num_stock, kernel_regularizer=regularizers.l2(0.00001), name ='Decoder_Input')(encoded)
    decoded = Activation("linear", name='Decoder_Activation_function')(decoded)

    # construct and compile AE model
    autoencoder = Model(inputs=input, outputs=decoded)
    adam = optimizers.Adam(learning_rate=0.0005)
    autoencoder.compile(optimizer=adam, loss='mean_squared_error')
    if verbose!= 0:
        autoencoder.summary()

    return autoencoder

In [83]:
def getReconstructionErrorsDF(df_pct_change, reconstructed_data):
    
    """
    Function for calculating the reconstruction Errors
    """
    array = []
    stocks_ranked = []
    num_columns = reconstructed_data.shape[1]
    for i in range(0, num_columns):
        diff = np.linalg.norm((df_pct_change.iloc[:, i] - reconstructed_data[:, i]))  # 2 norm difference
        array.append(float(diff))

    ranking = np.array(array).argsort()
    r = 1
    for stock_index in ranking:
        stocks_ranked.append([ r
                              ,stock_index
                              ,df_pct_change.iloc[:, stock_index].name
                              ,array[stock_index]
                              ])
        r = r + 1

    columns = ['ranking','stock_index', 'stock_name' ,'recreation_error']
    data = pd.DataFrame(stocks_ranked, columns=columns)
    data = data.set_index('stock_name')
    return data

In [84]:
col_names = data.columns.to_list()
print(col_names)

['AAPL', 'AMZN', 'AVGO', 'COST', 'GOOG', 'GOOGL', 'HD', 'JPM', 'LLY', 'MA', 'META', 'MSFT', 'NVDA', 'ORCL', 'PG', 'TSLA', 'UNH', 'V', 'WMT', 'XOM']


In [None]:
# 'aapl', 'amzn', 'msft', 'googl', 'tsla', 'fb', 'nvda', 'dis', 'nflx', 'intc',
#     'ba', 'v', 'ma', 'goog', 'csco', 'pypl', 'wmt', 'jpm', 'gs', 'sq'

In [85]:
df_pct_change = data.pct_change(1).astype(float)
df_pct_change = df_pct_change.replace([np.inf, -np.inf], np.nan)
df_pct_change = df_pct_change.fillna(method='ffill')

# the percentage change function will make the first two rows equal to nan
df_pct_change = df_pct_change.tail(len(df_pct_change) - 2)

  df_pct_change = df_pct_change.fillna(method='ffill')


In [86]:
df_pct_change.shape

(2766, 20)

In [87]:
# remove columns where there is no change over a longer time period
df_pct_change = df_pct_change[df_pct_change.columns[((df_pct_change == 0).mean() <= 0.05)]]

In [88]:
df_pct_change.head()

Unnamed: 0_level_0,AAPL,AMZN,AVGO,COST,GOOG,GOOGL,HD,JPM,LLY,MA,META,MSFT,NVDA,ORCL,PG,TSLA,UNH,V,WMT,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-01-04 00:00:00+00:00,-0.027854,0.002592,-0.00642,-0.00322,0.01976,0.01976,-0.001896,0.017725,0.037008,-3.9e-05,0.03565,-0.018716,0.032993,0.008744,0.002031,-0.010642,0.001923,0.008167,0.003779,0.00463
2013-01-07 00:00:00+00:00,-0.005883,0.035925,-0.005539,-0.007733,-0.004363,-0.004363,-0.005381,0.001102,-0.001358,0.01733,0.022949,-0.00187,-0.028897,-0.005201,-0.006802,-0.001744,0.0,0.007144,-0.009556,-0.011578
2013-01-08 00:00:00+00:00,0.002692,-0.007748,-0.006807,-0.001874,-0.001973,-0.001973,0.006047,0.001982,0.007574,-0.003292,-0.012237,-0.005246,-0.021926,0.000291,-0.001603,-0.01922,-0.013246,0.009311,0.002777,0.006255
2013-01-09 00:00:00+00:00,-0.015629,-0.000113,0.022119,0.000494,0.006573,0.006573,-0.000791,-0.000659,0.02101,0.028137,0.05265,0.00565,-0.022418,0.00058,0.005401,-0.001187,0.018871,0.015248,-0.000291,-0.003843
2013-01-10 00:00:00+00:00,0.012396,-0.003792,0.021639,-0.004841,0.004552,0.004552,0.007123,0.014955,0.005286,-0.008903,0.02321,-0.008989,0.001638,0.013059,0.005662,-0.00327,0.01413,-0.007911,-0.003063,0.010892


In [89]:
#3.5 Construct the Autoencoder
# define the input parameters
hidden_layers = 5
batch_size = 500
epochs = 500
stock_selection_number = 500
num_stock = df_pct_change.shape[1]
verbose = 1

In [90]:
df_pct_change = df_pct_change.dropna()  # Handle missing values
df_scaler = preprocessing.MinMaxScaler()
df_pct_change_normalised = df_scaler.fit_transform(df_pct_change.values)


In [91]:
print('-' * 20 + 'Step 1 : Returns vs. recreation error (recreation_error)')
print('-' * 25 + 'Transform dataset with MinMax Scaler')

--------------------Step 1 : Returns vs. recreation error (recreation_error)
-------------------------Transform dataset with MinMax Scaler


In [92]:
# Normalize the data
df_scaler = preprocessing.MinMaxScaler()
df_pct_change_normalised = df_scaler.fit_transform(df_pct_change)

In [93]:
# define autoencoder
print('-' * 25 + 'Define autoencoder model')
num_stock = len(df_pct_change.columns)
autoencoder = defineAutoencoder(num_stock=num_stock, encoding_dim=10, verbose=verbose)
#plot_model(autoencoder, to_file='img/model_autoencoder_1.png', show_shapes=True,
#           show_layer_names=True)

-------------------------Define autoencoder model


In [94]:
# train autoencoder
print('-' * 25 + 'Train autoencoder model')
autoencoder.fit(df_pct_change_normalised, df_pct_change_normalised, shuffle=False, epochs=epochs,
                batch_size=batch_size,
                verbose=verbose)

-------------------------Train autoencoder model
Epoch 1/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.3893
Epoch 2/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3504 
Epoch 3/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step - loss: 0.3167  
Epoch 4/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2873 
Epoch 5/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2618 
Epoch 6/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2396 
Epoch 7/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2200 
Epoch 8/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2026 
Epoch 9/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1869 
Epoch 10/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x2d6515d54d0>

In [95]:
# predict autoencoder
print('-' * 25 + 'Predict autoencoder model')
reconstruct = autoencoder.predict(df_pct_change_normalised)

-------------------------Predict autoencoder model
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [96]:
# Inverse transform dataset with MinMax Scaler
print('-' * 25 + 'Inverse transform dataset with MinMax Scaler')
reconstruct_real = df_scaler.inverse_transform(reconstruct)
df_reconstruct_real = pd.DataFrame(data=reconstruct_real, columns=df_pct_change.columns)

-------------------------Inverse transform dataset with MinMax Scaler


In [97]:
print(df_pct_change.shape)  # Shape of original data
print(reconstruct_real.shape)  # Shape of reconstructed data

(2766, 20)
(2766, 20)


In [98]:
print(df_pct_change.isnull().sum())

AAPL     0
AMZN     0
AVGO     0
COST     0
GOOG     0
GOOGL    0
HD       0
JPM      0
LLY      0
MA       0
META     0
MSFT     0
NVDA     0
ORCL     0
PG       0
TSLA     0
UNH      0
V        0
WMT      0
XOM      0
dtype: int64


In [99]:
print('-' * 25 + 'Calculate L2 norm as reconstruction loss metric')
df_recreation_error = getReconstructionErrorsDF(df_pct_change=df_pct_change,
                                                reconstructed_data=reconstruct_real)

-------------------------Calculate L2 norm as reconstruction loss metric


In [100]:
df_recreation_error

Unnamed: 0_level_0,ranking,stock_index,recreation_error
stock_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
COST,1,3,0.354805
PG,2,14,0.430957
HD,3,6,0.455876
GOOG,4,4,0.479384
GOOGL,5,5,0.488011
V,6,17,0.536285
MSFT,7,11,0.55863
WMT,8,18,0.560199
AAPL,9,0,0.563678
AMZN,10,1,0.596283


In [101]:
filtered_stocks = df_recreation_error.head(stock_selection_number).index

In [102]:
filtered_stocks

Index(['COST', 'PG', 'HD', 'GOOG', 'GOOGL', 'V', 'MSFT', 'WMT', 'AAPL', 'AMZN',
       'ORCL', 'MA', 'UNH', 'JPM', 'XOM', 'LLY', 'AVGO', 'TSLA', 'META',
       'NVDA'],
      dtype='object', name='stock_name')

In [103]:
# store the list of selected stocks
%store filtered_stocks

Stored 'filtered_stocks' (Index)
