# Imports

In [None]:
import numpy as np
import pandas as pd
from time import time

# Transform inputs to pandas dataframe

## Stock Index

In [None]:
### SPY daily price & volume history
#### Source: https://uk.finance.yahoo.com/quote/SPY/history
# Read only the listed columns, parse "Date" as datetimes and use it as the index.
benchmark_df = pd.read_csv(
    './inputFeatures/stockIndex/SPY.csv',
    usecols=["Date", "Open", "High", "Low", "SPYClose", "Volume"],
    parse_dates=["Date"],
    index_col=["Date"],
)

# Preprocessing Functions

In [None]:
from sklearn import preprocessing
from collections import deque
from time import time

def get_last_days(number_of_days, x):
    """Return a copy of ``x`` keeping only the trailing entries along axis 1.

    Everything in front of the last ``number_of_days`` positions of axis 1 is
    removed; all other axes are left untouched.
    """
    trailing = np.asarray(x)[:, -number_of_days:]
    # Copy so the result never shares memory with the caller's array.
    return trailing.copy()

# Number of consecutive rows bundled into each input sequence by preprocess_df.
SEQ_LEN = 63
# Presumably how many rows ahead the label looks (matches the one-row
# look-ahead used when the 'future' column is built) -- TODO confirm.
FUTURE_PERIOD_PREDICT = 1

def classify(current, future):
    """Label a move: 1 when ``future`` is strictly greater than ``current``, else 0.

    Both arguments are coerced with ``float`` before being compared.
    """
    went_up = float(current) < float(future)
    return 1 if went_up else 0

def preprocess_df(df):
    """Turn a labelled price frame into class-balanced, shuffled sequences.

    Steps:
      1. Drop the ``future`` helper column.
      2. For each of ``SPYClose``/``Open``/``High``/``Low``/``Volume`` present,
         build a ``<col>DayChange`` column holding the row-over-row percentage
         change. Every change column except ``SPYCloseDayChange`` is then
         standardised, clipped to [-3, 3] and divided by 3.
      3. Slide a ``SEQ_LEN``-row window over the result, pairing each full
         window with the ``target`` value of its last row.
      4. Undersample so labels 0 and 1 are equally frequent, then shuffle.

    Args:
        df: DataFrame with ``future`` and ``target`` columns plus any of the
            price/volume columns named above. Any other column is ignored.

    Returns:
        ``(X, y)`` as NumPy arrays: ``X`` stacks the windows and ``y`` holds
        the matching labels.

    Raises:
        KeyError: if ``df`` lacks a ``future`` or ``target`` column.
    """
    # NOTE(review): other feature names listed here by the original author;
    # none of them is handled by this function:
    # 'RepoRate', 'ReverseRepoRate', 'Repo', 'ReverseRepo', 'USDGBP', 'USDEUR', 'USDJPY', 'M1Supply', 'EmploymentRate', 'InflationRate', 'GDP', 'PCR', 'UMCSENT', 'Confidence', 'EFFR'
    change_columns = ('SPYClose', 'Open', 'High', 'Low', 'Volume')
    unscaled_columns = ('SPYClose',)

    df = df.drop(columns=['future'])

    output_df = pd.DataFrame(index=df.index)

    for col in df.columns:
        if col in change_columns:
            column_name = f'{col}DayChange'
            # Forward-fill explicitly: passing fill_method='ffill' to
            # pct_change is deprecated in recent pandas releases.
            day_change = df[col].ffill().pct_change(fill_method=None).rename(column_name)
            output_df = pd.concat([output_df, day_change], axis=1)
            output_df.dropna(inplace=True)

            if col not in unscaled_columns:
                # Scaling lives inside this branch so it can only ever touch
                # the column created just above (never a stale column name).
                output_df[column_name] = preprocessing.scale(output_df[column_name].values)
                output_df[column_name] = output_df[column_name].clip(-3, 3) / 3
        elif col == 'target':
            output_df[col] = df[col]

    output_df.dropna(inplace=True)

    # The windowing below reads the label from the last column, so make that
    # layout explicit instead of relying on the caller's column order.
    feature_columns = [c for c in output_df.columns if c != 'target']
    output_df = output_df[feature_columns + ['target']]

    sequential_data = []  # list of [window, label] pairs
    prev_days = deque(maxlen=SEQ_LEN)  # rolling window of the latest feature rows

    for row in output_df.values:
        prev_days.append(list(row[:-1]))  # features only, label excluded
        if len(prev_days) == SEQ_LEN:  # emit only once the window is full
            sequential_data.append([np.array(prev_days), row[-1]])

    np.random.shuffle(sequential_data)

    ups = [sample for sample in sequential_data if sample[1] == 1]
    downs = [sample for sample in sequential_data if sample[1] == 0]
    np.random.shuffle(ups)
    np.random.shuffle(downs)

    # Truncate both classes to the size of the smaller one so the returned
    # set holds the same number of up and down samples.
    lower = min(len(ups), len(downs))
    sequential_data = ups[:lower] + downs[:lower]

    np.random.shuffle(sequential_data)

    X = [sequence for sequence, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(X), np.array(y)

# Apply preprocessing, arrange data (training, validation)

In [None]:
# Add column holding the closing price FUTURE_PERIOD_PREDICT rows ahead (the next day's close)
benchmark_df['future'] = benchmark_df['SPYClose'].shift(-FUTURE_PERIOD_PREDICT)
# Add column to signify if that future closing price is up (1) or down (0) using classify
# function defined above
benchmark_df['target'] = list(map(classify, benchmark_df['SPYClose'], benchmark_df['future']))
# Rows without a future price (NaN after the shift) were labelled 0 by classify
# because a comparison with NaN is False; drop them rather than keep a made-up label.
benchmark_df = benchmark_df.dropna(subset=['future'])

times = sorted(benchmark_df.index.values)
last_20pct = times[-int(0.2*len(times))]  # first timestamp of the last 20% of the times

## Split in sample / out of sample
validation_df = benchmark_df[(benchmark_df.index >= last_20pct)]  # validation data: index within the last 20%
training_df = benchmark_df[(benchmark_df.index < last_20pct)]  # training data: everything before the last 20%

train_x, train_y = preprocess_df(training_df)
validation_x, validation_y = preprocess_df(validation_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {np.count_nonzero(train_y == 0)}, buys: {np.count_nonzero(train_y == 1)}")
print(f"VALIDATION Dont buys: {np.count_nonzero(validation_y == 0)}, buys: {np.count_nonzero(validation_y == 1)}")

# Tensorflow Imports

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, BatchNormalization, Dense, Conv1D, MaxPooling1D, Flatten, Concatenate
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

## Benchmark Iteration

In [None]:
# Trailing windows cut from the full-length sequences, in this order:
# 2 days, 1 week, 2 weeks, 1 month, 2 months.
# train_x / validation_x themselves serve as the full "1 quarter" inputs.
(
    train_x_two_days,
    train_x_week,
    train_x_two_weeks,
    train_x_month,
    train_x_two_months,
) = [get_last_days(days, train_x) for days in (2, 5, 10, 21, 42)]

(
    validation_x_two_days,
    validation_x_week,
    validation_x_two_weeks,
    validation_x_month,
    validation_x_two_months,
) = [get_last_days(days, validation_x) for days in (2, 5, 10, 21, 42)]

def benchmark_model(name, time_at_start, train_inputs, validation_inputs):
    """Build, compile and fit the Conv1D + LSTM benchmark classifier.

    Labels are taken from the module-level ``train_y`` / ``validation_y``.
    TensorBoard logs are written under ``benchmark_logs-<time_at_start>/<name>``
    and checkpoints under ``benchmark_models-<time_at_start>/``.

    Args:
        name: Run label used in the log directory and checkpoint file names.
        time_at_start: Value embedded in the log/checkpoint directory names.
        train_inputs: Training inputs; ``shape[1:]`` defines the model input shape.
        validation_inputs: Validation inputs passed to ``model.fit``.

    Returns:
        The object returned by ``model.fit`` (the training history).
    """
    BATCH_SIZE = 32
    EPOCHS = 50

    model = Sequential()
    model.add(Input(shape=(train_inputs.shape[1:])))

    model.add(Conv1D(32, 2, padding='same'))
    model.add(Conv1D(32, 2, padding='same'))

    model.add(LSTM(16))
    model.add(Dropout(0.4))
    model.add(BatchNormalization())

    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.4))

    model.add(Dense(2, activation='softmax'))

    # The `decay` keyword is rejected by current Keras optimizers. An
    # InverseTimeDecay schedule with decay_steps=1 applies the same
    # lr / (1 + decay * step) rule through the supported API.
    learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=0.001,
        decay_steps=1,
        decay_rate=1e-6,
    )
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(loss='sparse_categorical_crossentropy',
                    optimizer=opt,
                    metrics=['accuracy'])

    tensorboard = TensorBoard(log_dir=f'benchmark_logs-{time_at_start}/{name}')

    # NOTE(review): '.hd5' is not one of the usual Keras suffixes
    # ('.keras', '.h5'); confirm the intended save format before changing it.
    checkpoint_filepath = f"benchmark_models-{time_at_start}/" + name + "-{epoch:02d}-{val_accuracy:.3f}.hd5"
    checkpoint = ModelCheckpoint(filepath=checkpoint_filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

    early_stopping = EarlyStopping(monitor='val_accuracy', baseline=0.5, patience=12)

    history = model.fit(
        train_inputs, train_y,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(validation_inputs, validation_y),
        callbacks=[tensorboard, checkpoint, early_stopping]
    )

    # Hand the training history back instead of discarding it; existing
    # callers that ignore the return value are unaffected.
    return history


def benchmark():
    """Train one benchmark model per look-back window, shortest window first.

    All runs share one start timestamp, which benchmark_model embeds in its
    log/checkpoint directory names; each run's name is the window label
    followed by that run's own launch timestamp.
    """
    time_at_start = int(time())

    # (label, training inputs, validation inputs) for every window size.
    datasets = (
        ('two_days', train_x_two_days, validation_x_two_days),
        ('week', train_x_week, validation_x_week),
        ('two_weeks', train_x_two_weeks, validation_x_two_weeks),
        ('month', train_x_month, validation_x_month),
        ('two_months', train_x_two_months, validation_x_two_months),
        ('quarter', train_x, validation_x),
    )

    for training_set, training_set_to_use, validation_set_to_use in datasets:
        name = f"{training_set}-{int(time())}"
        benchmark_model(name, time_at_start, training_set_to_use, validation_set_to_use)

In [None]:
# Run the whole sweep: benchmark() trains one model per look-back window.
benchmark()