# Using an LSTM-based model to predict stock returns

## Import packages

In [5]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import tensorflow as tf
import pandas as pd
import numpy as np
import os

## Define train, validation and test periods

In [6]:
# Chronological split boundaries as ISO date strings, one (start, end) pair
# for each of the training, validation and test periods.
train_start_date, train_end_date = "2000-01-01", "2014-12-31"
val_start_date, val_end_date = "2015-01-01", "2016-12-31"
test_start_date, test_end_date = "2017-01-01", "2018-12-31"

## Read and label data

In [28]:
# Load the price history; the first CSV column is parsed as the datetime index.
nke = pd.read_csv("nke.csv", parse_dates=True, index_col=0)

# Daily log return: log(close_t / close_{t-1}). The first row has no previous
# close, so its return is NaN.
nke["return"] = np.log(nke["close"] / nke["close"].shift(periods=1))

# Binary target: 1 for a strictly positive return, 0 otherwise (a zero return
# and the NaN first row both map to 0).
nke["label"] = (nke["return"] > 0).astype(int)

# Remove rows with missing values, which includes the first row (NaN return).
nke.dropna(inplace=True)

In [29]:
# Preview the first five labelled rows.
nke.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume,return,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-04,5.875,5.914063,5.671875,5.6875,9810400,-0.056089,0
2000-01-05,5.71875,6.046875,5.71875,6.015625,6542400,0.056089,1
2000-01-06,5.984375,5.984375,5.820313,5.984375,4891200,-0.005208,0
2000-01-07,5.960938,6.0,5.875,5.984375,3993600,0.0,0
2000-01-10,6.015625,6.117188,6.0,6.085938,3946400,0.016829,1


In [30]:
# Share of up (1) and down (0) days in the training period only. Label-based
# slicing includes its end label, so the slice stops at train_end_date; slicing
# up to val_start_date would pull in a row dated on the first validation day.
nke.loc[:train_end_date, "label"].value_counts(normalize=True)

1    0.513521
0    0.486479
Name: label, dtype: float64

## Feature engineering

In [31]:
# Standardise returns using statistics from the training period only, so that
# no validation or test information leaks into the feature scaling. Label-based
# slicing includes its end label, hence the slice ends at train_end_date rather
# than at val_start_date.
train_returns = nke.loc[:train_end_date, "return"]
return_mean = train_returns.mean()
return_std = train_returns.std()

# The same training-period mean and standard deviation are applied to every row.
nke["norm_return"] = (nke["return"] - return_mean) / return_std

In [32]:
# Preview the first five rows, now including the standardised return column.
nke.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume,return,label,norm_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-04,5.875,5.914063,5.671875,5.6875,9810400,-0.056089,0,-2.861218
2000-01-05,5.71875,6.046875,5.71875,6.015625,6542400,0.056089,1,2.805549
2000-01-06,5.984375,5.984375,5.820313,5.984375,4891200,-0.005208,0,-0.290936
2000-01-07,5.960938,6.0,5.875,5.984375,3993600,0.0,0,-0.027834
2000-01-10,6.015625,6.117188,6.0,6.085938,3946400,0.016829,1,0.822288


## Create generators

In [33]:
# Row position of the first date on or after each period start ("bfill" falls
# forward to the next available date when the start date itself has no row).
# Index.get_loc(..., method=...) is deprecated and removed in pandas 2.0;
# Index.get_indexer is the replacement that still accepts `method`.
val_start_idx = int(nke.index.get_indexer([pd.Timestamp(val_start_date)], method="bfill")[0])
test_start_idx = int(nke.index.get_indexer([pd.Timestamp(test_start_date)], method="bfill")[0])

# get_indexer returns -1 when no date qualifies (get_loc used to raise), so
# fail loudly here instead of passing a bogus boundary downstream.
assert 0 < val_start_idx < test_start_idx, "split dates are not covered by the data"

In [34]:
def _make_generator(**index_bounds):
    """Create a TimeseriesGenerator over the standardised returns and labels.

    All three splits share the same source arrays, window length (50) and
    batch size (64); only the `start_index` / `end_index` bounds passed in
    `index_bounds` differ.
    """
    return TimeseriesGenerator(nke[["norm_return"]].values, nke[["label"]].values,
                               length=50, batch_size=64, **index_bounds)


# Training uses rows before the validation start, validation uses rows before
# the test start, and the test split runs from the test start onwards.
# NOTE(review): no end_index is given for the test split and test_end_date is
# not used here; confirm the data does not extend past the intended test period.
train_generator = _make_generator(end_index=val_start_idx - 1)
val_generator = _make_generator(start_index=val_start_idx, end_index=test_start_idx - 1)
test_generator = _make_generator(start_index=test_start_idx)

## Build `model_fn`

In [35]:
def model_fn(params):
    """Build, compile and train one LSTM classifier for a hyperparameter set.

    The network is a single CuDNNLSTM layer over inputs of shape (50, 1),
    followed by dropout and a one-unit sigmoid output. It is trained on the
    module-level `train_generator` and validated on `val_generator`.

    Args:
        params: mapping with the keys "lstm_size", "dropout" and
            "learning_rate".

    Returns:
        A `(history, model)` tuple: the per-epoch metrics dict recorded during
        training and the trained Keras model.
    """
    layers = tf.keras.layers
    network = tf.keras.Sequential([
        layers.CuDNNLSTM(params["lstm_size"], input_shape=(50, 1)),
        layers.Dropout(params["dropout"]),
        layers.Dense(1, activation="sigmoid"),
    ])
    optimizer = tf.keras.optimizers.Adam(params["learning_rate"])
    network.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

    # Early stopping with a patience of 5 epochs; the best weights seen are
    # restored, so 200 is only an upper bound on the number of epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
    training = network.fit_generator(train_generator, validation_data=val_generator,
                                     callbacks=[early_stopping], epochs=200, verbose=0)
    return (training.history, network)

## Optimize hyperparameters

In [36]:
def random_search(model_fn, p, n, search_dir=None, seed=None):
    """Run a random hyperparameter search and keep the best model.

    For each of the `n` iterations one value per hyperparameter is drawn
    uniformly from the candidates in `p`, `model_fn` is called with the drawn
    values, and the metrics at the epoch with the lowest validation loss are
    recorded. The model with the highest "val_acc" at that epoch is saved to
    `<search_dir>/best_model.h5`; all recorded results are written to
    `<search_dir>/results.csv`.

    Args:
        model_fn: callable taking a params dict and returning
            `(history, model)`, where `history` maps metric names (including
            "val_loss" and "val_acc") to per-epoch sequences and `model` has a
            `save(path)` method.
        p: dict mapping each hyperparameter name to an indexable sequence of
            candidate values.
        n: number of search iterations; must be at least 1.
        search_dir: output directory. It is created if missing and reused if
            it already exists, so re-running the search overwrites the two
            output files. Defaults to "random_search" when None.
        seed: optional seed for the hyperparameter sampling only (it does not
            seed model training). When None, NumPy's global random state is
            used, as before.

    Returns:
        A `(results, best_model)` tuple: a DataFrame with one row per
        iteration (sampled hyperparameters, selected epoch count and metrics)
        and the best model reloaded from disk.

    Raises:
        ValueError: if `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    if search_dir is None:
        search_dir = "random_search"
    # exist_ok lets the search be re-run without first deleting the directory.
    os.makedirs(search_dir, exist_ok=True)
    best_model_path = os.path.join(search_dir, "best_model.h5")
    results_path = os.path.join(search_dir, "results.csv")

    # A dedicated generator makes the sampled configurations reproducible
    # without touching the global NumPy random state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    results = []
    best_val_acc = None
    for i in range(n):
        params = {k: v[rng.randint(len(v))] for k, v in p.items()}
        history, model = model_fn(params)

        # Report the metrics of the epoch with the lowest validation loss.
        epochs = np.argmin(history["val_loss"]) + 1
        result = {k: v[epochs - 1] for k, v in history.items()}
        params["epochs"] = epochs

        # The first model is always saved; later ones only on strict improvement.
        if i == 0 or result["val_acc"] > best_val_acc:
            best_val_acc = result["val_acc"]
            model.save(best_model_path)

        result = {**params, **result}
        results.append(result)
        # Release the graph/session state of this iteration's model.
        tf.keras.backend.clear_session()
        print(f"iteration {i + 1} – {', '.join(f'{k}:{v:.4g}' for k, v in result.items())}")

    best_model = tf.keras.models.load_model(best_model_path)
    results = pd.DataFrame(results)
    results.to_csv(results_path)
    return (results, best_model)

In [37]:
# Candidate values for each hyperparameter; the random search draws one value
# per key on every iteration.
p = dict(
    lstm_size=np.linspace(50, 200, num=16, dtype=int),  # 50, 60, ..., 200
    dropout=np.linspace(0, 0.4, num=9),  # 0.00, 0.05, ..., 0.40
    learning_rate=np.linspace(0.004, 0.01, num=13),  # 0.0040, 0.0045, ..., 0.0100
)

In [38]:
# Run 200 random-search iterations; outputs are written under "search".
results, best_model = random_search(model_fn, p, n=200, search_dir="search")

W0712 12:55:27.146746 17652 deprecation.py:506] From C:\Users\jack_\Anaconda3\envs\machinelearning\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0712 12:55:27.282870 17652 deprecation.py:323] From C:\Users\jack_\Anaconda3\envs\machinelearning\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


iteration 1 – lstm_size:100, dropout:0.15, learning_rate:0.0065, epochs:6, loss:0.6916, acc:0.5287, val_loss:0.6895, val_acc:0.5242
iteration 2 – lstm_size:60, dropout:0.3, learning_rate:0.0055, epochs:10, loss:0.6902, acc:0.5352, val_loss:0.6896, val_acc:0.5088
iteration 3 – lstm_size:90, dropout:0.1, learning_rate:0.01, epochs:5, loss:0.6909, acc:0.521, val_loss:0.6882, val_acc:0.5463
iteration 4 – lstm_size:180, dropout:0.4, learning_rate:0.007, epochs:9, loss:0.691, acc:0.5317, val_loss:0.6873, val_acc:0.5264
iteration 5 – lstm_size:90, dropout:0.3, learning_rate:0.005, epochs:10, loss:0.6907, acc:0.5239, val_loss:0.6892, val_acc:0.522
iteration 6 – lstm_size:60, dropout:0.15, learning_rate:0.009, epochs:6, loss:0.6908, acc:0.5215, val_loss:0.6915, val_acc:0.5176
iteration 7 – lstm_size:180, dropout:0.05, learning_rate:0.004, epochs:3, loss:0.691, acc:0.5314, val_loss:0.6904, val_acc:0.5176
iteration 8 – lstm_size:80, dropout:0.2, learning_rate:0.01, epochs:1, loss:0.6932, acc:0.51

iteration 64 – lstm_size:60, dropout:0.3, learning_rate:0.004, epochs:12, loss:0.6901, acc:0.5301, val_loss:0.6898, val_acc:0.5242
iteration 65 – lstm_size:50, dropout:0.05, learning_rate:0.0095, epochs:8, loss:0.6902, acc:0.5322, val_loss:0.6906, val_acc:0.5132
iteration 66 – lstm_size:180, dropout:0.2, learning_rate:0.0045, epochs:1, loss:0.6934, acc:0.5228, val_loss:0.6915, val_acc:0.5154
iteration 67 – lstm_size:80, dropout:0.3, learning_rate:0.009, epochs:1, loss:0.6945, acc:0.511, val_loss:0.689, val_acc:0.5242
iteration 68 – lstm_size:70, dropout:0.1, learning_rate:0.0045, epochs:3, loss:0.6917, acc:0.5266, val_loss:0.6903, val_acc:0.522
iteration 69 – lstm_size:180, dropout:0.35, learning_rate:0.0045, epochs:5, loss:0.6958, acc:0.5054, val_loss:0.6926, val_acc:0.489
iteration 70 – lstm_size:180, dropout:0.1, learning_rate:0.009, epochs:4, loss:0.696, acc:0.5177, val_loss:0.6905, val_acc:0.5044
iteration 71 – lstm_size:200, dropout:0.15, learning_rate:0.0075, epochs:3, loss:0.69

iteration 127 – lstm_size:200, dropout:0, learning_rate:0.01, epochs:3, loss:0.6919, acc:0.5282, val_loss:0.6893, val_acc:0.5242
iteration 128 – lstm_size:80, dropout:0.05, learning_rate:0.0095, epochs:6, loss:0.6912, acc:0.5253, val_loss:0.6896, val_acc:0.5176
iteration 129 – lstm_size:60, dropout:0.1, learning_rate:0.004, epochs:2, loss:0.6923, acc:0.5274, val_loss:0.6915, val_acc:0.5088
iteration 130 – lstm_size:70, dropout:0, learning_rate:0.0045, epochs:5, loss:0.6913, acc:0.5328, val_loss:0.6895, val_acc:0.5242
iteration 131 – lstm_size:90, dropout:0.35, learning_rate:0.01, epochs:3, loss:0.6912, acc:0.5185, val_loss:0.6918, val_acc:0.5022
iteration 132 – lstm_size:90, dropout:0.4, learning_rate:0.0055, epochs:7, loss:0.6913, acc:0.5236, val_loss:0.6876, val_acc:0.5088
iteration 133 – lstm_size:150, dropout:0.35, learning_rate:0.009, epochs:10, loss:0.692, acc:0.5312, val_loss:0.6886, val_acc:0.5374
iteration 134 – lstm_size:160, dropout:0.4, learning_rate:0.0075, epochs:2, loss:

iteration 190 – lstm_size:50, dropout:0.4, learning_rate:0.004, epochs:7, loss:0.6917, acc:0.5279, val_loss:0.6899, val_acc:0.5198
iteration 191 – lstm_size:170, dropout:0.1, learning_rate:0.0065, epochs:1, loss:0.6942, acc:0.5191, val_loss:0.6909, val_acc:0.5132
iteration 192 – lstm_size:150, dropout:0.35, learning_rate:0.009, epochs:1, loss:0.6966, acc:0.511, val_loss:0.688, val_acc:0.5308
iteration 193 – lstm_size:180, dropout:0, learning_rate:0.0095, epochs:5, loss:0.6917, acc:0.525, val_loss:0.6872, val_acc:0.5132
iteration 194 – lstm_size:150, dropout:0.05, learning_rate:0.0065, epochs:4, loss:0.6911, acc:0.5285, val_loss:0.688, val_acc:0.5066
iteration 195 – lstm_size:130, dropout:0.1, learning_rate:0.0065, epochs:1, loss:0.6938, acc:0.511, val_loss:0.6909, val_acc:0.5088
iteration 196 – lstm_size:130, dropout:0.3, learning_rate:0.006, epochs:12, loss:0.6909, acc:0.5253, val_loss:0.6906, val_acc:0.5132
iteration 197 – lstm_size:140, dropout:0.05, learning_rate:0.004, epochs:4, l

W0712 13:16:31.344389 17652 deprecation.py:506] From C:\Users\jack_\Anaconda3\envs\machinelearning\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0712 13:16:31.345390 17652 deprecation.py:506] From C:\Users\jack_\Anaconda3\envs\machinelearning\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0712 13:16:31.346391 17652 deprecation.py:506] From C:\Users\jack_\Anaconda3\envs\machinelearning\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling Zeros.__init__ (from ten

iteration 200 – lstm_size:60, dropout:0.15, learning_rate:0.0085, epochs:2, loss:0.6925, acc:0.5234, val_loss:0.6913, val_acc:0.5154


## Evaluate best model

In [42]:
# Rank the search iterations from highest to lowest validation accuracy.
results.sort_values(by="val_acc", ascending=False)

Unnamed: 0,acc,dropout,epochs,learning_rate,loss,lstm_size,val_acc,val_loss
125,0.536002,0.20,10,0.0100,0.691142,200,0.559471,0.684303
103,0.524449,0.25,6,0.0100,0.690926,140,0.552863,0.685534
182,0.525255,0.15,12,0.0090,0.690043,60,0.548458,0.681705
89,0.528479,0.40,3,0.0055,0.691596,200,0.548458,0.686424
54,0.534927,0.10,19,0.0045,0.686391,90,0.546256,0.687720
187,0.525255,0.05,6,0.0050,0.690988,180,0.546256,0.688144
2,0.520956,0.10,5,0.0100,0.690914,90,0.546256,0.688151
40,0.530897,0.15,6,0.0100,0.691228,110,0.546256,0.685604
102,0.529554,0.20,7,0.0090,0.690535,100,0.544053,0.686795
119,0.504299,0.40,1,0.0065,0.694999,170,0.544053,0.688652


In [40]:
# Score the selected model on the held-out test period.
best_model.evaluate_generator(generator=test_generator)

[0.6983279064297676, 0.5132743]