# Using an LSTM-based model to predict stock returns

## Import packages

In [22]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import tensorflow as tf
import pandas as pd
import numpy as np
import os

## Define training, validation and test periods

In [23]:
train_start_date = "2008-01-01"
train_end_date = "2016-12-31"
val_start_date = "2017-01-01"
val_end_date = "2017-12-31"
test_start_date = "2018-01-01"
test_end_date = "2018-12-31"

In [24]:
'''We'll use the data in the training period to train models, the data in the 
validation period to optimize hyperparameters and the data in the test period 
to evaluate our final model.'''

"We'll use the data in the training period to train models, the data in the \nvalidation period to optimize hyperparameters and the data in the test period \nto evaluate our final model."

## Read and label the data

In [25]:
ezj = pd.read_csv("ezj.csv", index_col=0, parse_dates=True)
ezj["return"] = ezj["close"] / ezj["close"].shift() - 1

In [26]:
ezj.head()

Unnamed: 0_level_0,open,high,low,close,volume,return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-01-02,664.908997,677.455017,654.544983,657.27301,1721833.0,
2008-01-03,651.27301,654.0,617.455017,632.182007,2740650.0,-0.038174
2008-01-04,627.817993,632.72699,590.72699,596.72699,4711938.0,-0.056084
2008-01-07,596.182007,597.817993,560.72699,583.091003,4103622.0,-0.022851
2008-01-08,567.27301,573.27301,497.782013,502.091003,12687374.0,-0.138915


In [27]:
'''We'll assign a label of 1 to dates on which EasyJet's stock return is 
positive, and a label of 0 to dates on which it is not.'''

"We'll assign a label of 1 to dates on which EasyJet's stock return is \npositive, and a label of 0 to dates on which it is not."

In [28]:
ezj["label"] = np.where(ezj["return"] > 0, 1, 0)

## Engineer features

In [29]:
'''To predict the labels, we'll use returns and volumes from the past 30 
trading days. To reduce the chances of getting stuck in local optima, we'll 
standardize the returns using statistics computed over the training period, 
and the volumes using a sliding window approach.'''

"To predict the labels, we'll use returns and volumes from the past 30 \ntrading days. To reduce the chances of getting stuck in local optima, we'll \nstandardize the returns using statistics computed over the training period, \nand the volumes using a sliding window approach."

In [30]:
ezj["std_return"] = (ezj["return"] - ezj["return"][:val_start_date].mean()) / ezj["return"][:val_start_date].std()
ezj["std_volume"] = (ezj["volume"] - ezj["volume"].rolling(50).mean()) / ezj["volume"].rolling(50).std()

In [31]:
ezj.dropna(inplace=True)

## Create generators

In [32]:
'''Before creating generators for the train, validation and test sets, we 
need the integer locations corresponding to the start of the validation and 
test periods.'''

'Before creating generators for the train, validation and test sets, we \nneed the integer locations corresponding to the start of the validation and \ntest periods.'

In [33]:
val_start_iloc = ezj.index.get_loc(val_start_date, method="bfill")
test_start_iloc = ezj.index.get_loc(test_start_date, method="bfill")

In [34]:
'''We'll use TimeseriesGenerator to create the generators, and pass 
length=30 so that data from the past 30 trading days is used to make 
predictions.'''

"We'll use TimeseriesGenerator to create the generators, and pass \nlength=30 so that data from the past 30 trading days is used to make \npredictions."

In [35]:
train_generator = TimeseriesGenerator(ezj[["std_return", "std_volume"]].values, ezj[["label"]].values,
                                      length=30, batch_size=64, end_index=val_start_iloc-1)
val_generator = TimeseriesGenerator(ezj[["std_return", "std_volume"]].values, ezj[["label"]].values,
                                    length=30, batch_size=64, start_index=val_start_iloc,
                                    end_index=test_start_iloc-1)
test_generator = TimeseriesGenerator(ezj[["std_return", "std_volume"]].values, ezj[["label"]].values,
                                     length=30, batch_size=64, start_index=test_start_iloc)

## Create `model_fn`

In [36]:
'''model_fn trains an LSTM-based model for a maximum of 100 epochs, stopping 
early if validation accuracy does not improve for 5 epochs. If you don't have 
a GPU, make sure to swap CuDNNLSTM for LSTM.'''

"model_fn trains an LSTM-based model for a maximum of 100 epochs, stopping \nearly if validation accuracy does not improve for 5 epochs. If you don't have \na GPU, make sure to swap CuDNNLSTM for LSTM."

In [37]:
def model_fn(params):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.LSTM(params["lstm_size"], input_shape=(30, 2)))
    model.add(tf.keras.layers.Dropout(params["dropout"]))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
    model.compile(optimizer=tf.keras.optimizers.Adam(params["learning_rate"]),
                  loss="binary_crossentropy", metrics=["accuracy"])

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="val_acc", patience=5,
                                                  restore_best_weights=True)]
    history = model.fit_generator(train_generator, validation_data=val_generator,
                                  callbacks=callbacks, epochs=100, verbose=0).history
    return (history, model)

## Create `random_search`

In [38]:
'''We'll use random_search to optimize hyperparameters, which runs a random 
search and saves the results and best model in search_dir.'''

"We'll use random_search to optimize hyperparameters, which runs a random \nsearch and saves the results and best model in search_dir."

In [39]:
def random_search(model_fn, search_space, n_iter, search_dir):
    results = []
    os.mkdir(search_dir)
    best_model_path = os.path.join(search_dir, "best_model.h5")
    results_path = os.path.join(search_dir, "results.csv")
    for i in range(n_iter):
        params = {k: v[np.random.randint(len(v))] for k, v in search_space.items()}
        history, model = model_fn(params)
        epochs = np.argmax(history["val_acc"]) + 1
        result = {k: v[epochs - 1] for k, v in history.items()}
        params["epochs"] = epochs
        if i == 0:
            best_val_acc = result["val_acc"]
            model.save(best_model_path)
        if result["val_acc"] > best_val_acc:
            best_val_acc = result["val_acc"]
            model.save(best_model_path)
        result = {**params, **result}
        results.append(result)
        tf.keras.backend.clear_session()
        print(f"iteration {i + 1} – {', '.join(f'{k}:{v:.4g}' for k, v in result.items())}")
    best_model = tf.keras.models.load_model(best_model_path)
    results = pd.DataFrame(results)
    results.to_csv(results_path)
    return (results, best_model)

## Run random search

In [40]:
'''We'll run the random search for 200 iterations. It should take somewhere 
between 10 and 90 minutes to complete, depending on your hardware.'''

"We'll run the random search for 200 iterations. It should take somewhere \nbetween 10 and 90 minutes to complete, depending on your hardware."

In [41]:
search_space = {"lstm_size": np.linspace(50, 200, 16, dtype=int),
                "dropout": np.linspace(0, 0.4, 9),
                "learning_rate": np.linspace(0.004, 0.01, 13)}

In [None]:
results, best_model = random_search(model_fn, search_space, 200, "search")

Instructions for updating:
Please use Model.fit, which supports generators.
  ...
    to  
  ['...']
  ...
    to  
  ['...']


In [None]:
results.sort_values("val_acc", ascending=False).head()

## Evaluate final model

In [None]:
'''All that's left is to evaluate our final model over the test period.'''

In [None]:
best_model.evaluate_generator(test_generator)