In [69]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [70]:
# The first CSV column is an unnamed row counter (pandas shows it as "Unnamed: 0"),
# presumably the index written by an earlier `DataFrame.to_csv`. Read it as the index
# rather than as data so it is not scaled and handed to the model as a predictor,
# then rebuild a gap-free 0..n-1 index for the positional windowing done later.
data_df = pd.read_csv("data/processed_dataset/dataset_p2.csv", index_col=0).reset_index(drop=True)
data_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Sentiment Score,LogReturn,RSI14,MACD,Intermediate Momentum
0,19.889999,20.139999,19.5,19.559999,19.559999,668100,0.4134,0.027233,44.122541,-1.735004,-4.33
1,19.950001,20.190001,19.65,20.1,20.1,578900,0.4134,0.021167,47.909296,-1.7423,-3.73
2,20.4,20.809999,20.049999,20.530001,20.530001,418600,0.4134,0.041504,50.770293,-1.71051,-1.429998
3,21.040001,21.49,20.76,21.4,21.4,1214500,0.4134,0.026742,56.032012,-1.606212,2.199999
4,21.190001,22.030001,21.129999,21.98,21.98,1969300,0.4134,-0.074108,59.165457,-1.461798,5.869999


In [71]:
def normalize(df, scalar):
    """Return a NaN-free, rescaled copy of ``df``.

    Rows containing any missing value are dropped, the index is rebuilt as
    0..n-1, and all columns are rescaled by ``scalar`` in a single fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric columns. It is left unmodified.
    scalar : object
        Scaler exposing ``fit_transform`` for a 2-D array (the notebook passes
        a ``MinMaxScaler``). It is fitted once on every column together, so
        afterwards it holds the parameters of all columns instead of only the
        last one and can be used to invert the scaling.

    Returns
    -------
    pandas.DataFrame
        New frame with the same column labels as ``df`` and a 0..n-1 index.
    """
    # dropna() returns a new frame, so the caller's data stays intact; resetting
    # the index removes the label gaps left by the dropped rows.
    cleaned = df.dropna().reset_index(drop=True)

    # Scalers that work feature-by-feature (such as MinMaxScaler) give the same
    # values here as a column-by-column loop would, but a single fit keeps the
    # fitted state for every column.
    scaled = scalar.fit_transform(cleaned.to_numpy())

    return pd.DataFrame(scaled, columns=cleaned.columns, index=cleaned.index)


def batch_dataset(predictors, target, start, end, window):
    """Cut a time series into overlapping look-back windows.

    For every row position ``i`` in ``[start + window, end)`` one sample is
    produced: the ``window`` predictor rows immediately before ``i`` and the
    target value(s) at row ``i``.

    Rows are addressed by position, not by index label, so the result does not
    depend on the frames carrying a gap-free 0..n-1 index.

    Parameters
    ----------
    predictors : pandas.DataFrame or array-like
        Feature rows in chronological order.
    target : pandas.DataFrame or array-like
        Target rows aligned with ``predictors`` (same length).
    start, end : int
        Half-open range of row positions belonging to this partition.
    window : int
        Number of preceding rows that form one sample.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_samples, window, n_features)``; ``n_samples`` is 0 when no
        full window fits into the range.
    y : numpy.ndarray
        Flattened target values, one per sample for a single target column.

    Raises
    ------
    ValueError
        If ``window`` is negative or the inputs differ in length.
    IndexError
        If ``start`` or ``end`` lies outside the data.
    """
    features = np.asarray(predictors)
    labels = np.asarray(target)

    if window < 0:
        raise ValueError(f"window must be non-negative, got {window}")
    if len(features) != len(labels):
        raise ValueError(
            f"predictors and target differ in length: {len(features)} vs {len(labels)}"
        )
    if start < 0 or end > len(features):
        raise IndexError(
            f"row range [{start}, {end}) is outside the data (length {len(features)})"
        )

    # The first `window` rows of the range only serve as history for the first sample.
    first = start + window
    positions = range(first, end)

    if not positions:
        # Keep the 3-D layout so callers can still read X.shape[1:] safely.
        empty_X = np.empty((0, window) + features.shape[1:], dtype=features.dtype)
        return empty_X, labels[:0].ravel()

    # Slice the array once per sample instead of building a DataFrame per window.
    X = np.stack([features[i - window:i] for i in positions])
    y = labels[first:end].ravel()
    return X, y

def preprocess_dataset(df, target, test_size, window, scalar):
    """Scale ``df`` and turn it into windowed train/test arrays.

    The frame is normalised, separated into predictor columns and the target
    column, split chronologically (the last ``test_size`` fraction of rows is
    the test partition) and each partition is windowed with ``batch_dataset``.

    Note that scaling is applied to the whole frame before the split, so the
    scaling parameters are influenced by the test rows as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; the caller's frame is not modified.
    target : str
        Name of the column to predict. Every other column is a predictor.
    test_size : float
        Fraction of rows assigned to the test partition.
    window : int
        Look-back length passed to ``batch_dataset``.
    scalar : object
        Scaler handed to ``normalize``.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    ValueError
        If the training partition is too short to hold a single window.
    """
    if target not in df.columns:
        # Without this check the boolean column mask below selects nothing and
        # an empty target array is returned silently.
        raise KeyError(f"target column {target!r} not found in df")

    # Defensive copy: the caller's frame stays intact even if normalize works in place.
    scaled = normalize(df.copy(), scalar)

    is_target = scaled.columns == target
    X = scaled.loc[:, ~is_target]
    y = scaled.loc[:, is_target]

    n_rows = len(scaled)
    split = round((1 - test_size) * n_rows)
    if split <= window:
        raise ValueError(
            f"training partition has {split} rows, which is not more than "
            f"window={window}; no training samples can be built"
        )

    X_train, y_train = batch_dataset(X, y, 0, split, window)
    X_test, y_test = batch_dataset(X, y, split, n_rows, window)

    return X_train, y_train, X_test, y_test

In [72]:
# Split and windowing settings: the last 20% of rows form the test partition and
# every sample looks back over the 100 rows that precede it.
test_size = 0.2
window = 100
scalar = MinMaxScaler()

X_train, y_train, X_test, y_test = preprocess_dataset(
    df=data_df,
    target='Adj Close',
    test_size=test_size,
    window=window,
    scalar=scalar,
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [76]:
def build_model(input_shape, units=50, dropout=0.2):
    """Build an uncompiled stacked-LSTM regression model.

    The network is three LSTM layers, each followed by dropout, and a final
    single-unit dense layer. The first two LSTM layers return their full
    output sequence so that the following LSTM receives one vector per
    timestep; the third uses the Keras default and returns only its last output.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed to the first LSTM layer as ``input_shape``.
    units : int, default 50
        Number of units in each LSTM layer.
    dropout : float, default 0.2
        Rate used by every Dropout layer.

    Returns
    -------
    Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    return Sequential([
        LSTM(units=units, return_sequences=True, input_shape=input_shape),
        Dropout(dropout),
        LSTM(units=units, return_sequences=True),
        Dropout(dropout),
        LSTM(units=units),
        Dropout(dropout),
        Dense(units=1),
    ])

In [78]:
# One training sample is a 2-D slice of X_train: axis 1 is the look-back window,
# axis 2 the predictor columns.
n_timesteps = X_train.shape[1]
n_features = X_train.shape[2]
input_shape = (n_timesteps, n_features)

model = build_model(input_shape)
model.summary()

NameError: name 'Sequential' is not defined

In [None]:
# `metrics` must be a list (or tuple/dict); Keras 3 rejects a bare string.
# The target column is min-max scaled, so true values at or near 0 make the
# percentage error explode; mean absolute error is reported next to it as a
# metric that stays meaningful on that scale.
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_absolute_percentage_error', 'mean_absolute_error'])

In [None]:
# Write a checkpoint only when the validation loss reaches a new minimum,
# and log each save.
checkpointer = ModelCheckpoint(
    filepath='weights_best.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
EPOCHS = 20
BATCH_SIZE = 32

# Keep the returned History object so the per-epoch losses and metrics can be
# inspected or plotted afterwards without retraining (and so the cell does not
# end with a bare object repr).
# shuffle=False keeps the samples in chronological order; validation_split takes
# its samples from the end of the training arrays.
history = model.fit(X_train,
                    y_train,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    callbacks=[checkpointer],
                    verbose=1,
                    shuffle=False,
                    validation_split=0.2)