# Using an LSTM-based model to predict stock returns

## 1. Import packages

In [19]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import tensorflow as tf
import pandas as pd
import numpy as np
import os

## 2. Define training, validation and test periods

In [20]:
# Chronological, non-overlapping periods; every bound is an ISO "YYYY-MM-DD" string.
train_start_date, train_end_date = "2008-01-01", "2016-12-31"  # fitting
val_start_date, val_end_date = "2017-01-01", "2017-12-31"  # early stopping / model selection
test_start_date, test_end_date = "2018-01-01", "2018-12-31"  # final hold-out evaluation

## 3. Read and label the data

In [27]:
# Load the price history; the first CSV column is parsed into the (date) index.
ezj = pd.read_csv("ezj.csv", parse_dates=True, index_col=0)

# Simple row-over-row return: close_t / close_{t-1} - 1.
# The first row has no predecessor, so its return is NaN.
closing_price = ezj["close"]
ezj["return"] = closing_price.div(closing_price.shift(1)).sub(1)

In [28]:
# Show the first five rows to sanity-check the parsed columns and the new `return` column.
ezj.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume,return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-01-02,664.908997,677.455017,654.544983,657.27301,1721833.0,
2008-01-03,651.27301,654.0,617.455017,632.182007,2740650.0,-0.038174
2008-01-04,627.817993,632.72699,590.72699,596.72699,4711938.0,-0.056084
2008-01-07,596.182007,597.817993,560.72699,583.091003,4103622.0,-0.022851
2008-01-08,567.27301,573.27301,497.782013,502.091003,12687374.0,-0.138915


In [29]:
# Binary target: 1 when the return is strictly positive, 0 otherwise.
# A NaN return compares False, so it is labelled 0 as well.
ezj["label"] = (ezj["return"] > 0).astype(int)

## 4. Engineer features

In [30]:
# Standardise returns with the mean/std of the training period only, so that no
# validation or test observation can influence the feature scaling. Label-based
# date slices are inclusive at both ends, which is why the slice stops at
# `train_end_date` rather than at the first validation date.
train_returns = ezj["return"].loc[train_start_date:train_end_date]
ezj["std_return"] = (ezj["return"] - train_returns.mean()) / train_returns.std()

# Volume is z-scored against its own trailing 50-row window (current row
# included); rows without a full window come out as NaN.
volume_window = ezj["volume"].rolling(50)
ezj["std_volume"] = (ezj["volume"] - volume_window.mean()) / volume_window.std()

In [31]:
# Keep only complete rows: the first return and the rolling-window warm-up rows are NaN.
ezj = ezj.dropna()

## 5. Create generators

In [32]:
def _first_iloc_on_or_after(index, date):
    """Return the integer position of the first entry of ``index`` on or after ``date``.

    ``Index.get_loc(..., method="bfill")`` used to perform this lookup, but its
    ``method`` argument was deprecated in pandas 1.4 and removed in pandas 2.0;
    ``Index.get_indexer`` is the supported replacement.

    Args:
        index: sorted date index to search.
        date: date string (or timestamp) to locate.

    Returns:
        The matching position as a plain ``int``.

    Raises:
        KeyError: if every entry of ``index`` is earlier than ``date``
            (``get_loc`` signalled the same condition with ``KeyError``).
    """
    position = index.get_indexer([pd.Timestamp(date)], method="bfill")[0]
    if position == -1:  # get_indexer marks "no match" with -1
        raise KeyError(date)
    return int(position)


# A split date need not be present in the index, so snap each one to the next available row.
val_start_iloc = _first_iloc_on_or_after(ezj.index, val_start_date)
test_start_iloc = _first_iloc_on_or_after(ezj.index, test_start_date)

In [33]:
# Inputs shared by all three generators: the two standardised feature columns
# and the binary label, both as 2-D arrays.
feature_matrix = ezj[["std_return", "std_volume"]].values
label_matrix = ezj[["label"]].values

# Each sample is a window of 30 consecutive rows; samples are served 64 per batch.
window_options = dict(length=30, batch_size=64)

# One generator per split, delimited by the row positions of the split dates.
# NOTE(review): TimeseriesGenerator appears to emit its first target `length`
# rows after `start_index`, so the first 30 rows of the validation and test
# periods would only serve as input history - confirm this is intended.
train_generator = TimeseriesGenerator(feature_matrix, label_matrix,
                                      end_index=val_start_iloc - 1, **window_options)
val_generator = TimeseriesGenerator(feature_matrix, label_matrix,
                                    start_index=val_start_iloc,
                                    end_index=test_start_iloc - 1, **window_options)
test_generator = TimeseriesGenerator(feature_matrix, label_matrix,
                                     start_index=test_start_iloc, **window_options)

## 6. Create `model_fn`

In [38]:
def model_fn(params):
    """Build, compile and train one LSTM classifier for a single hyper-parameter draw.

    The network is LSTM -> Dropout -> Dense(1, sigmoid), trained with Adam on
    binary cross-entropy. It is fitted on the notebook-level ``train_generator``
    and validated on ``val_generator``.

    Args:
        params: mapping with the keys ``"lstm_size"`` (number of LSTM units),
            ``"dropout"`` (dropout rate after the LSTM) and ``"learning_rate"``
            (Adam learning rate).

    Returns:
        A ``(history, model)`` tuple. ``history`` is the Keras history dict
        (metric name -> list of per-epoch values) with the keys ``"loss"``,
        ``"acc"``, ``"val_loss"`` and ``"val_acc"``; ``model`` is the trained
        ``tf.keras`` model.
    """
    # TF 1.x ships the GPU-only CuDNNLSTM layer. TF 2.x removed it from
    # tf.keras.layers because the regular LSTM layer selects the cuDNN kernel
    # on its own, so use whichever the installed version provides.
    lstm_layer = getattr(tf.keras.layers, "CuDNNLSTM", tf.keras.layers.LSTM)

    model = tf.keras.Sequential()
    # Input windows are 30 time steps of 2 features.
    model.add(lstm_layer(params["lstm_size"], input_shape=(30, 2)))
    model.add(tf.keras.layers.Dropout(params["dropout"]))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
    # Request the metric as "acc" rather than "accuracy": TF 1.x reports it as
    # "acc"/"val_acc" either way, whereas TF 2.x names the history keys after
    # the string passed here. "accuracy" would produce "val_accuracy" there and
    # break both the early-stopping monitor below and callers reading "val_acc".
    model.compile(optimizer=tf.keras.optimizers.Adam(params["learning_rate"]),
                  loss="binary_crossentropy", metrics=["acc"])

    # Stop once validation accuracy has not improved for 5 epochs and roll the
    # weights back to the best epoch seen.
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="val_acc", mode="max", patience=5,
                                                  restore_best_weights=True)]
    history = model.fit_generator(train_generator, validation_data=val_generator,
                                  callbacks=callbacks, epochs=100, verbose=0).history
    return (history, model)

## 7. Create `random_search`

In [40]:
def random_search(model_fn, search_space, n_iter, search_dir, seed=None):
    """Randomly sample hyper-parameters, train one model per draw and keep the best.

    Args:
        model_fn: callable taking a ``params`` dict and returning
            ``(history, model)``, where ``history`` maps metric names to
            per-epoch lists (it must contain ``"val_acc"``) and ``model``
            offers ``save(path)``.
        search_space: mapping from parameter name to an indexable sequence of
            candidate values; each parameter is drawn independently and
            uniformly.
        n_iter: number of random draws to evaluate (at least 1).
        search_dir: directory receiving ``best_model.h5`` and ``results.csv``.
            It is created when missing and reused when it already exists.
        seed: optional integer making the draws reproducible. With ``None``
            the draws come from NumPy's global random state.

    Returns:
        A ``(results, best_model)`` tuple: ``results`` is a DataFrame with one
        row per iteration (sampled parameters, best epoch count, and every
        metric at that epoch); ``best_model`` is the model with the highest
        validation accuracy, reloaded from disk.

    Raises:
        ValueError: if ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")

    # exist_ok: re-running the search must not crash on the directory that a
    # previous run left behind.
    os.makedirs(search_dir, exist_ok=True)
    best_model_path = os.path.join(search_dir, "best_model.h5")
    results_path = os.path.join(search_dir, "results.csv")

    rng = np.random if seed is None else np.random.RandomState(seed)

    results = []
    best_val_acc = None
    for i in range(n_iter):
        params = {k: v[rng.randint(len(v))] for k, v in search_space.items()}
        history, model = model_fn(params)

        # Report every metric at the epoch with the highest validation accuracy
        # (argmax returns the first such epoch when there are ties).
        best_epoch = int(np.argmax(history["val_acc"]))
        result = {k: v[best_epoch] for k, v in history.items()}
        params["epochs"] = best_epoch + 1

        # The first model is always saved; later ones only on strict improvement.
        if best_val_acc is None or result["val_acc"] > best_val_acc:
            best_val_acc = result["val_acc"]
            model.save(best_model_path)

        result = {**params, **result}
        results.append(result)
        # Release the finished model's graph/state before building the next one.
        tf.keras.backend.clear_session()
        print(f"iteration {i + 1} – {', '.join(f'{k}:{v:.4g}' for k, v in result.items())}")

    # Persist the table before reloading the model, so a failed load cannot
    # discard the search results.
    results = pd.DataFrame(results)
    results.to_csv(results_path)
    best_model = tf.keras.models.load_model(best_model_path)
    return (results, best_model)

## 8. Run random search

In [39]:
# Discrete candidate grid per hyper-parameter; the search draws one value from each.
search_space = dict(
    lstm_size=np.arange(50, 201, 10),                         # 50, 60, ..., 200 units
    dropout=np.linspace(start=0, stop=0.4, num=9),            # 0 to 0.4 in steps of 0.05
    learning_rate=np.linspace(start=0.004, stop=0.01, num=13),  # 0.004 to 0.01 in steps of 0.0005
)

In [42]:
# Evaluate 200 random draws; artefacts (best model and results table) land in ./search.
results, best_model = random_search(model_fn, search_space, n_iter=200, search_dir="search")

iteration 1 – lstm_size:140, dropout:0.2, learning_rate:0.0055, epochs:3, loss:0.6913, acc:0.5271, val_loss:0.6921, val_acc:0.5225
iteration 2 – lstm_size:80, dropout:0.15, learning_rate:0.0045, epochs:9, loss:0.6868, acc:0.5367, val_loss:0.6936, val_acc:0.5631
iteration 3 – lstm_size:60, dropout:0.2, learning_rate:0.0045, epochs:3, loss:0.691, acc:0.5285, val_loss:0.6944, val_acc:0.527
iteration 4 – lstm_size:130, dropout:0.2, learning_rate:0.0065, epochs:3, loss:0.6921, acc:0.5308, val_loss:0.6922, val_acc:0.545
iteration 5 – lstm_size:160, dropout:0.25, learning_rate:0.0095, epochs:8, loss:0.6871, acc:0.5426, val_loss:0.6911, val_acc:0.527
iteration 6 – lstm_size:80, dropout:0, learning_rate:0.009, epochs:4, loss:0.6899, acc:0.5298, val_loss:0.6894, val_acc:0.5495
iteration 7 – lstm_size:120, dropout:0.1, learning_rate:0.0075, epochs:7, loss:0.6882, acc:0.5308, val_loss:0.6926, val_acc:0.5315
iteration 8 – lstm_size:130, dropout:0.4, learning_rate:0.005, epochs:2, loss:0.691, acc:0.

iteration 64 – lstm_size:190, dropout:0.25, learning_rate:0.0045, epochs:1, loss:0.6963, acc:0.528, val_loss:0.6921, val_acc:0.5135
iteration 65 – lstm_size:180, dropout:0.2, learning_rate:0.007, epochs:1, loss:0.6977, acc:0.5166, val_loss:0.6973, val_acc:0.5135
iteration 66 – lstm_size:140, dropout:0.1, learning_rate:0.004, epochs:4, loss:0.6912, acc:0.5326, val_loss:0.694, val_acc:0.518
iteration 67 – lstm_size:80, dropout:0.2, learning_rate:0.0065, epochs:3, loss:0.6917, acc:0.513, val_loss:0.6925, val_acc:0.518
iteration 68 – lstm_size:80, dropout:0.1, learning_rate:0.009, epochs:6, loss:0.6893, acc:0.5394, val_loss:0.6905, val_acc:0.5315
iteration 69 – lstm_size:90, dropout:0.35, learning_rate:0.005, epochs:3, loss:0.6912, acc:0.5226, val_loss:0.6927, val_acc:0.5405
iteration 70 – lstm_size:70, dropout:0.25, learning_rate:0.005, epochs:2, loss:0.6914, acc:0.5358, val_loss:0.6903, val_acc:0.5541
iteration 71 – lstm_size:140, dropout:0.2, learning_rate:0.004, epochs:6, loss:0.6898, 

iteration 127 – lstm_size:190, dropout:0.25, learning_rate:0.0075, epochs:18, loss:0.6575, acc:0.5859, val_loss:0.7417, val_acc:0.545
iteration 128 – lstm_size:150, dropout:0.25, learning_rate:0.008, epochs:10, loss:0.6842, acc:0.549, val_loss:0.695, val_acc:0.5315
iteration 129 – lstm_size:130, dropout:0.35, learning_rate:0.007, epochs:3, loss:0.6913, acc:0.5244, val_loss:0.6912, val_acc:0.5541
iteration 130 – lstm_size:60, dropout:0.3, learning_rate:0.01, epochs:1, loss:0.6953, acc:0.503, val_loss:0.6933, val_acc:0.5
iteration 131 – lstm_size:190, dropout:0.25, learning_rate:0.006, epochs:2, loss:0.6928, acc:0.5335, val_loss:0.6928, val_acc:0.5405
iteration 132 – lstm_size:160, dropout:0.4, learning_rate:0.0045, epochs:2, loss:0.6934, acc:0.5198, val_loss:0.6911, val_acc:0.545
iteration 133 – lstm_size:130, dropout:0.1, learning_rate:0.0085, epochs:2, loss:0.6925, acc:0.5153, val_loss:0.6909, val_acc:0.509
iteration 134 – lstm_size:50, dropout:0.2, learning_rate:0.005, epochs:1, loss

iteration 190 – lstm_size:200, dropout:0.2, learning_rate:0.006, epochs:3, loss:0.6903, acc:0.5317, val_loss:0.6892, val_acc:0.527
iteration 191 – lstm_size:170, dropout:0, learning_rate:0.0075, epochs:3, loss:0.6913, acc:0.5317, val_loss:0.6925, val_acc:0.5225
iteration 192 – lstm_size:130, dropout:0.05, learning_rate:0.009, epochs:2, loss:0.6923, acc:0.5198, val_loss:0.6913, val_acc:0.5315
iteration 193 – lstm_size:50, dropout:0, learning_rate:0.0055, epochs:4, loss:0.689, acc:0.538, val_loss:0.6912, val_acc:0.536
iteration 194 – lstm_size:120, dropout:0.05, learning_rate:0.01, epochs:1, loss:0.6942, acc:0.5358, val_loss:0.6914, val_acc:0.5495
iteration 195 – lstm_size:150, dropout:0.3, learning_rate:0.0095, epochs:1, loss:0.6978, acc:0.5116, val_loss:0.6923, val_acc:0.5045
iteration 196 – lstm_size:60, dropout:0.05, learning_rate:0.0085, epochs:1, loss:0.6944, acc:0.5025, val_loss:0.6929, val_acc:0.5631
iteration 197 – lstm_size:170, dropout:0.35, learning_rate:0.007, epochs:1, loss

In [46]:
# Five draws with the highest validation accuracy, best first.
results.sort_values(by="val_acc", ascending=False).head(n=5)

Unnamed: 0,acc,dropout,epochs,learning_rate,loss,lstm_size,val_acc,val_loss
90,0.512528,0.2,3,0.009,0.690524,190,0.581081,0.689752
100,0.509339,0.35,2,0.008,0.692828,80,0.576577,0.689374
101,0.518451,0.2,2,0.008,0.692348,100,0.576577,0.689398
114,0.536219,0.35,6,0.0085,0.69076,70,0.567568,0.692731
140,0.526196,0.0,2,0.007,0.691741,70,0.567568,0.691383


## 9. Evaluate final model

In [45]:
# Score the selected model once on the held-out test windows.
# The result lists the compiled loss followed by the compiled metric (accuracy).
best_model.evaluate_generator(generator=test_generator)

[0.6878005266189575, 0.5515695]