In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import joblib

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM

import datetime
import math

# Raise pandas' 'display.max_rows' option to 10000 for everything printed below.
pd.set_option('display.max_rows', 10000)

%matplotlib inline
%reload_ext tensorboard

In [3]:
# Seed the NumPy and TensorFlow global random generators with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

def create_split(df, pct_train, pct_val, batch_size, window_size):
    """Split ``df`` by row order into train, validation and test parts.

    The train+validation size and the train size are both rounded down to a
    multiple of ``batch_size`` with ``find_batch_gcd``. The validation and test
    parts are each prefixed with the ``window_size`` rows that precede them.

    Parameters
    ----------
    df : object with ``shape`` supporting integer row slicing (e.g. DataFrame)
    pct_train : float
        Fraction of all rows used for train + validation (before rounding).
    pct_val : float
        Fraction of the train + validation rows held out for validation.
    batch_size : int
        Batch size the train and train+validation sizes are aligned to.
    window_size : int
        Number of look-back rows prepended to the validation and test parts.

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test)``
    """
    length = df.shape[0]
    # Rows available for train + validation, rounded down to whole batches.
    train_val_size = find_batch_gcd(math.floor(pct_train * length), batch_size)
    # Rows used for training only, again rounded down to whole batches.
    train_size = find_batch_gcd(math.floor((1 - pct_val) * train_val_size), batch_size)

    # Slice with absolute offsets counted from the start. Offsets counted from
    # the end break when a size is zero: ``df[:-0]`` and ``df[-k:-0]`` are
    # empty, and ``df[-0:]`` is the whole frame. Clamp at 0 so a window longer
    # than the preceding data starts at the first row instead of wrapping.
    val_start = max(train_size - window_size, 0)
    test_start = max(train_val_size - window_size, 0)

    df_train = df[:train_size]
    df_val = df[val_start:train_val_size]
    df_test = df[test_start:]
    return df_train, df_val, df_test

def find_batch_gcd(length, batch_size):
    """Round ``length`` down to the nearest multiple of ``batch_size``.

    Despite the name, no greatest common divisor is computed: the result is
    the largest value ``<= length`` that ``batch_size`` divides evenly.

    Parameters
    ----------
    length : int
        Value to round down.
    batch_size : int
        Non-zero divisor; its sign does not affect the result.

    Returns
    -------
    int
        ``length`` reduced to a multiple of ``batch_size``.

    Raises
    ------
    ZeroDivisionError
        If ``batch_size`` is 0.
    """
    # Python's % with a positive divisor is always in [0, divisor), so one
    # subtraction lands on the multiple at or below ``length``.
    return length - length % abs(batch_size)

def create_dataset(df, window_size):
    """Build sliding-window samples with a single next-row target.

    Each sample is ``window_size`` consecutive rows of ``df`` (all columns);
    its target is the ``"Close"`` value of the row right after the window.

    Returns
    -------
    tuple of np.ndarray
        ``(X, y)`` with one entry per window start position.
    """
    sample_starts = range(len(df) - window_size)
    windows = [df.iloc[start:start + window_size].values for start in sample_starts]
    targets = [df["Close"].iloc[start + window_size] for start in sample_starts]
    return np.array(windows), np.array(targets)

def create_multi_pred_dataset(df, window_size, time_steps):
    """Build sliding-window samples with a multi-step target.

    Each sample is ``window_size`` consecutive rows of ``df`` (all columns);
    its target is the ``"Close"`` values of the ``time_steps`` rows that
    follow the window.

    Returns
    -------
    tuple of np.ndarray
        ``(X, y)`` with one entry per window start position.
    """
    # NOTE(review): the trailing "- 1" stops two positions before the last
    # start that still has a full target available — confirm this is intended.
    sample_starts = range(len(df) - window_size - time_steps - 1)
    windows = [df.iloc[start:start + window_size].values for start in sample_starts]
    targets = [
        df["Close"].iloc[start + window_size:start + window_size + time_steps].values
        for start in sample_starts
    ]
    return np.array(windows), np.array(targets)

def create_model(nodes, optimizer, dropout, X_train):
    """Assemble and compile the stacked-LSTM network.

    Parameters
    ----------
    nodes : sequence
        ``nodes[0..2]`` are the sizes passed to the three LSTM layers and
        ``nodes[3]`` is the size passed to the final Dense layer.
    optimizer
        Optimizer handed to ``compile``.
    dropout
        Rate passed to the Dropout layer placed after the last LSTM.
    X_train
        Only ``X_train.shape[1]`` and ``X_train.shape[2]`` are read, to set
        the input shape of the first layer.

    Returns
    -------
    The compiled ``Sequential`` model (loss ``"mse"``, metric ``'mae'``).
    """
    window_shape = (X_train.shape[1], X_train.shape[2])
    stack = [
        LSTM(nodes[0], input_shape=window_shape, return_sequences=True),
        LSTM(nodes[1], return_sequences=True),
        LSTM(nodes[2]),
        Dropout(dropout),
        Dense(nodes[3]),
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(loss="mse", optimizer=optimizer, metrics=['mae'])
    return network

def flatten_prediction(pred, pred_count, time_steps):
    """Turn overlapping multi-step predictions into one flat sequence.

    Keeps every ``time_steps``-th row of ``pred``, flattens the kept rows into
    a 1-D array and, if that is longer than ``pred_count``, drops the surplus
    from the end. Also prints ``pred_count`` and the incoming row count.
    """
    print(pred_count, pred.shape[0])
    flat = pred[::time_steps].flatten()
    surplus = flat.shape[0] - pred_count
    return flat[:-surplus] if surplus > 0 else flat

def evaluate_forecast(pred, actual):
    """Print the mean squared error and mean absolute error of a forecast.

    Both metrics are called as ``metric(pred, actual)``; nothing is returned.
    """
    reports = (
        ("Test Mean Squared Error:", mean_squared_error),
        ("Test Mean Absolute Error:", mean_absolute_error),
    )
    for label, metric in reports:
        print(label, metric(pred, actual))

def train_model(pair, batch_size, window_size, nodes_arr, optimizer, dropout, epochs):
    """Train the multi-step LSTM forecaster for one currency pair.

    Steps: read ``../data/processed/{pair}_processed.csv``; drop leading rows
    so the row count is a multiple of ``batch_size``; keep the feature columns
    listed below; split with ``create_split`` (0.75 train+val, 0.1 of that for
    validation); min-max scale using scalers fitted on the training split
    only; build windows with ``create_multi_pred_dataset``; fit the network
    from ``create_model`` without shuffling.

    Parameters
    ----------
    pair : str
        Six-letter pair code; ``pair[:3]`` and ``pair[3:]`` select the
        per-currency columns (e.g. ``"EURUSD"``).
    batch_size : int
        Batch size for fitting and for aligning the split sizes.
    window_size : int
        Number of rows per input window.
    nodes_arr : sequence
        Layer sizes for ``create_model``; ``nodes_arr[3]`` is also the number
        of future steps in each target.
    optimizer, dropout
        Forwarded to ``create_model``.
    epochs : int
        Number of training epochs.

    Returns
    -------
    tuple
        ``(model, closeScaler, featureScaler)`` — the fitted model, the scaler
        fitted on ``'Close'`` and the scaler fitted on all other columns.
    """
    series = pd.read_csv("../data/processed/{}_processed.csv".format(pair))

    buy = pair[:3]
    sell = pair[3:]

    # Drop the oldest rows so the remaining row count divides by batch_size.
    series = series[series.shape[0] % batch_size:]

    series = series.drop(['Time', 'Real Close'], axis=1)
    series = series[['Close', 'EMA_10', 'EMA_50', 'RSI', 'A/D Index',
                     '{} Interest Rate'.format(buy), '{} Interest Rate'.format(sell), '{}_CPI'.format(buy), '{}_CPI'.format(sell),
                     '{} Twitter Sentiment'.format(buy), '{} Twitter Sentiment'.format(sell),
                     '{} News Sentiment'.format(buy), '{} News Sentiment'.format(sell),
                     #'EUR_GDP', 'USD_GDP', 'EUR_PPI', 'USD_PPI', 'USD Unemployment Rate', 'EUR Unemployment Rate'
                    ]]

    # The test split is only reported here; it is not used for fitting.
    df_train, df_val, df_test = create_split(series, 0.75, 0.1, batch_size, window_size)
    print(f'df_train.shape {df_train.shape}, df_validation.shape {df_val.shape}, df_test.shape {df_test.shape}')

    closeScaler = MinMaxScaler(feature_range=(0, 1))
    featureScaler = MinMaxScaler(feature_range=(0, 1))

    # Work on copies so the scaled values are not written into slices of `series`.
    df_train = df_train.copy()
    df_val = df_val.copy()

    # All splits share the same columns, so one mask selects the non-target features.
    is_feature = ~df_train.columns.isin(['Close'])

    # Fit on the training split only; validation reuses the fitted ranges.
    df_train.loc[:, ['Close']] = closeScaler.fit_transform(df_train[['Close']])
    df_train.loc[:, is_feature] = featureScaler.fit_transform(df_train.loc[:, is_feature])
    df_val.loc[:, ['Close']] = closeScaler.transform(df_val[['Close']])
    df_val.loc[:, is_feature] = featureScaler.transform(df_val.loc[:, is_feature])

    time_steps = nodes_arr[3]
    X_train, y_train = create_multi_pred_dataset(df_train, window_size, time_steps)
    X_val, y_val = create_multi_pred_dataset(df_val, window_size, time_steps)
    print(X_train.shape)
    print(y_train.shape)
    print(X_val.shape)
    print(y_val.shape)

    model = create_model(nodes_arr, optimizer, dropout, X_train)

    # shuffle=False keeps the samples in their original row order during fitting.
    model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=epochs,
              batch_size=batch_size,
              shuffle=False)

    return model, closeScaler, featureScaler

def visualize_loss(history):
    """Plot training and validation loss curves on one 16x10 figure.

    ``history`` must expose a ``history`` mapping with ``'loss'`` and
    ``'val_loss'`` entries (e.g. the object returned by ``model.fit``).
    """
    axis = plt.figure(figsize=(16, 10)).subplots(1)
    axis.set_title('Model Loss')
    axis.set(xlabel='Epoch', ylabel='Loss')
    for key, label in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
        axis.plot(history.history[key], label=label)
    axis.legend()

In [4]:
def test_model(pair, window_size, batch_size, time_steps, model, scaler, fScaler):
    """Run ``model`` on the test split of ``pair`` and print diagnostics.

    Re-reads the processed CSV, repeats the column selection and the
    0.75 / 0.1 ``create_split`` used by ``train_model``, scales the test split
    with the already-fitted scalers, predicts, and prints MSE/MAE computed on
    the scaled values followed by the first 20 rows of two debugging frames.

    Parameters
    ----------
    pair : str
        Pair code; ``pair[:3]`` / ``pair[3:]`` select per-currency columns.
    window_size : int
        Number of rows per input window.
    batch_size : int
        Used to trim leading rows and to align the split sizes.
    time_steps : int
        Stride used by ``flatten_prediction`` when thinning the predictions.
    model
        Object whose ``predict`` is called on the test windows.
    scaler
        Fitted scaler applied to ``'Close'`` (also used for inverse transform).
    fScaler
        Fitted scaler applied to every column except ``'Close'``.

    Returns
    -------
    None
        There is no active ``return`` statement; the only returns are
        commented out.
    """
    buy = pair[:3]
    sell = pair[3:]

    series = pd.read_csv("../data/processed/{}_processed.csv".format(pair))
    # Drop leading rows so the remaining row count divides by batch_size.
    series = series[series.shape[0] % batch_size:]
    # Keep the unscaled reference columns before they are dropped/scaled below.
    close = series[['Time', 'Real Close', 'Close']]
    close = close.copy()
    # Previous row's 'Close'; only shows up in the printed debugging frame.
    close['PrevClose'] = close['Close'].shift(1)

    series = series.drop(['Time', 'Real Close'], axis=1)
    series = series[['Close', 'EMA_10', 'EMA_50', 'RSI', 'A/D Index',
                     '{} Interest Rate'.format(buy), '{} Interest Rate'.format(sell), '{}_CPI'.format(buy), '{}_CPI'.format(sell),
                     '{} Twitter Sentiment'.format(buy), '{} Twitter Sentiment'.format(sell),
                     '{} News Sentiment'.format(buy), '{} News Sentiment'.format(sell),
                     #'EUR_GDP', 'USD_GDP', 'EUR Unemployment Rate', 'USD Unemployment Rate', 'EUR_PPI', 'USD_PPI'
                    ]]

    # Same split parameters as train_model, so df_test covers the same rows.
    df_train, df_val, df_test = create_split(series, 0.75, 0.1, batch_size, window_size)
    print(f'df_train.shape {df_train.shape}, df_validation.shape {df_val.shape}, df_test.shape {df_test.shape}')
    df_test = df_test.copy()
    # transform (not fit_transform): reuse the ranges of the scalers passed in.
    df_test.loc[:, ['Close']] = scaler.transform(df_test[['Close']])
    df_test.loc[:, ~df_test.columns.isin(['Close'])] = fScaler.transform(df_test.loc[:, ~df_test.columns.isin(['Close'])])
    
    # Single-step targets: one 'Close' value per window.
    X_test, y_test = create_dataset(df_test, window_size)
    #X_test, y_test = create_multi_pred_dataset(df_test, window_size, 5)

    y_pred = model.predict(X_test)

    # Keep every time_steps-th prediction row, flatten, and trim to len(y_test).
    multi_pred = flatten_prediction(y_pred, y_test.shape[0], time_steps)
    # Errors are computed on the scaled values, before any inverse transform.
    evaluate_forecast(multi_pred, y_test)

    #mse = model.evaluate(X_test, y_test)
    #print("Test Mean Squared Error:", mse)

    index = [i for i in range(multi_pred.shape[0])]
    # NOTE(review): df_predicted is built here but never read by active code below.
    df_predicted = pd.DataFrame(scaler.inverse_transform(multi_pred.reshape(-1, 1)), columns=['Close'], index=index)
    df_actual = pd.DataFrame(scaler.inverse_transform(y_test.reshape(-1, 1)), columns=['Close'], index=index)

    # Tail of the reference frame: window_size lead-in rows plus one row per prediction.
    df = pd.DataFrame(close[-multi_pred.shape[0] - window_size:])
    df.reset_index(inplace=True, drop=True)
    #print(df_test[['Close']][:20])
    #print(scaler.inverse_transform(df_test[['Close']])[:20])
    #print(scaler.inverse_transform(y_test.reshape(-1, 1))[:20])
    
    # Drop the lead-in rows so row i lines up with prediction i.
    df = df[window_size:]
    df.reset_index(inplace=True, drop=True)
    #print(df[:20])
    # Debug column: inverse-scaled targets placed next to the reference 'Close'.
    df['rip'] = df_actual['Close']
    
    #df_predicted['Close'] = df['Real Close'].mul(np.exp(df_predicted['Close'].shift(-1))).shift(1)
    # Rebinds df_actual from a DataFrame to a Series.
    # NOTE(review): presumably rebuilds a price from 'Real Close' and a log-return-like 'Close' — confirm.
    df_actual = df['Real Close'].mul(np.exp(df['Close']).shift(-1)).shift(1)
    print(df[:20])
    print(df_actual[:20])

    
    #evaluate_forecast(df_predicted['Close'].iloc[1:], df_actual['Close'].iloc[1:])

    #return df_predicted, df_actual
    
    #index = [i for i in range(y_pred.shape[0])]
    #df_predicted = pd.DataFrame(scaler.inverse_transform(y_pred), columns=['Close'], index=index)
    #df_actual = pd.DataFrame(scaler.inverse_transform(y_test.reshape(-1, 1)), columns=['Close'], index=index)
    
    #df = pd.DataFrame(close['Real Close'][-y_pred.shape[0] - window_size:-window_size])
    #df.reset_index(inplace=True, drop=True)
    
    #df_predicted['Close'] = df['Real Close'].mul(np.exp(df_predicted['Close'].shift(-1))).shift(1)
    #df_actual['Close'] = df['Real Close'].mul(np.exp(df_actual['Close'].shift(-1))).shift(1)
    
    #df_predicted['Close'] = df_predicted['Close']
    #df_actual['Close'] = df_actual['Close']
    
    #return df_predicted, df_actual

def visualize_prediction(df_predicted, df_actual):
    """Plot the first 100 actual and predicted ``'Close'`` values together.

    Both arguments are indexed with ``['Close']``; the actual series is drawn
    first, then the prediction, on a single 16x10 figure.
    """
    axis = plt.figure(figsize=(16, 10)).subplots(1)
    axis.set_title('Predicted Closing Price')
    axis.set(xlabel='Time', ylabel='Close')
    for frame, label in ((df_actual, 'Actual'), (df_predicted, 'Prediction')):
        axis.plot(frame['Close'][:100], label=label)
    axis.legend()

In [7]:
# Hyperparameters for this run.
batch_size = 32
# Number of rows per input window.
window_size = 10
# Sizes of the three LSTM layers followed by the Dense output size;
# train_model also uses nodes[3] as the number of future steps per target.
nodes = [80, 64, 32, 5]
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
dropout = 0.2
epochs = 1

# Returns the fitted model plus the 'Close' scaler and the feature scaler.
model, closeScaler, featureScaler = train_model("EURUSD", batch_size, window_size, nodes, optimizer, dropout, epochs)

df_train.shape (50400, 13), df_validation.shape (5610, 13), df_test.shape (18698, 13)
(50384, 10, 13)
(50384, 5)
(5594, 10, 13)
(5594, 5)


In [8]:
# The literal 5 is time_steps and equals nodes[3]. test_model has no active
# return statement, so `cool` is None; the call is run for its printed output.
cool = test_model("EURUSD", window_size, batch_size, 5, model, closeScaler, featureScaler)

df_train.shape (50400, 13), df_validation.shape (5610, 13), df_test.shape (18698, 13)
18688 18688
Test Mean Squared Error: 0.007084544323230347
Test Mean Absolute Error: 0.07748319441465179
                         Time  Real Close     Close  PrevClose       rip
0   2020-04-02 02:00:00+00:00     1.09369 -0.000603  -0.000603 -0.000603
1   2020-04-02 02:15:00+00:00     1.09431  0.000567  -0.000603  0.000567
2   2020-04-02 02:30:00+00:00     1.09435  0.000037   0.000567  0.000037
3   2020-04-02 02:45:00+00:00     1.09335 -0.000914   0.000037 -0.000914
4   2020-04-02 03:00:00+00:00     1.09342  0.000064  -0.000914  0.000064
5   2020-04-02 03:15:00+00:00     1.09350  0.000073   0.000064  0.000073
6   2020-04-02 03:30:00+00:00     1.09375  0.000229   0.000073  0.000229
7   2020-04-02 03:45:00+00:00     1.09376  0.000009   0.000229  0.000009
8   2020-04-02 04:00:00+00:00     1.09385  0.000082   0.000009  0.000082
9   2020-04-02 04:15:00+00:00     1.09377 -0.000073   0.000082 -0.000073
10  202