In [62]:
import pandas as pd
import os
import numpy as np
from functools import reduce
import missingno as msno
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import yfinance as yf
import nbimporter
from data_pipeline import FetchFinancials

In [63]:
import time
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style("darkgrid")    
import math
from sklearn.metrics import mean_squared_error

In [28]:
class FetchFinancials(object):
  """
    Downloads Yahoo! Finance price history for a list of tickers, joins the
    per-ticker frames on their date index and derives technical-analysis
    features from one chosen "dependent" price column.
  """
  def __init__(self, assets, dependent):
    """
      Yahoo! Finance Data

      Date	Open	High	Low	Close*	Adj Close**	Volumen

      Args:
        assets: iterable of ticker symbols handed to yf.Ticker
        dependent: name of the (ticker-prefixed) column that is renamed to
          'price' and used as the prediction target, e.g. 'JPM_Close'
    """
    super().__init__()
    self.assets = assets
    self.dependent = dependent
    self.dataframes = []

  def _fetch(self):
    """
      Download the full available history of every ticker in self.assets and
      outer-join the results on 'Date'.

      Every column is prefixed with '<ticker>_'. Tickers that fail to download
      (or come back empty) are reported and skipped.

      Returns:
        data: dataframe

      Raises:
        ValueError: if no ticker returned any data
    """
    # Start from a clean slate so calling _fetch twice does not merge every
    # ticker with itself.
    self.dataframes = []

    # GRAB DATA (dict.fromkeys drops duplicate tickers but keeps their order;
    # a duplicated ticker would otherwise yield clashing column names)
    for _ticker in dict.fromkeys(self.assets):
      time.sleep(1)  # pause between requests
      try:
        ticker = yf.Ticker(_ticker)

        # Historical Market Data
        result = ticker.history(period="max")
      except Exception as exc:
        # Best effort: report the failing ticker and carry on with the rest.
        print(f"{_ticker}: ERROR ({exc})")
        continue

      if result.empty:
        print(f"{_ticker}: ERROR (no data returned)")
        continue

      result = result.rename(
        columns={_val: f'{_ticker}_{_val}' for _val in result.columns if _val != 'Date'}
      )
      self.dataframes.append(result)

    if not self.dataframes:
      raise ValueError("No data could be fetched for any of the requested assets")

    # Outer Join on Date
    # assumes every frame is indexed by (or has a column named) 'Date' - TODO confirm
    data = reduce(lambda left, right: pd.merge(left, right, on=['Date'], how='outer'), self.dataframes)

    return data

  # TODO: periodic re-fetching (a threading.Timer based _continuous_fetch) is not implemented.

  def _calculate_technical_analysis_indicators(self, dataset):
    """
      Rename the dependent column to 'price' and add indicator columns.

      The frame is modified in place and also returned. Calling this on a frame
      that already has a 'price' column simply recomputes the indicators.

      Args:
        dataset (pd.DataFrame()): must contain self.dependent or 'price'

      Returns:
        dataframe with ma7, ma21, 26ema, 12ema, MACD, 20sd, upper_band,
        lower_band, ema, momentum and log_momentum added

      Raises:
        KeyError: if neither self.dependent nor 'price' is a column
    """
    dataset.rename(columns={self.dependent: 'price'}, inplace=True)
    if 'price' not in dataset.columns:
      raise KeyError(f"Neither '{self.dependent}' nor 'price' found in dataset columns")

    # Create 7 and 21 days Moving Average
    dataset['ma7'] = dataset['price'].rolling(window=7).mean()
    dataset['ma21'] = dataset['price'].rolling(window=21).mean()

    # Create MACD
    dataset['26ema'] = dataset['price'].ewm(span=26).mean()
    dataset['12ema'] = dataset['price'].ewm(span=12).mean()
    dataset['MACD'] = (dataset['12ema']-dataset['26ema'])

    # Create Bollinger Bands
    dataset['20sd'] = dataset['price'].rolling(window=20).std()
    dataset['upper_band'] = dataset['ma21'] + (dataset['20sd']*2)
    dataset['lower_band'] = dataset['ma21'] - (dataset['20sd']*2)

    # Create Exponential moving average
    dataset['ema'] = dataset['price'].ewm(com=0.5).mean()

    # Create Momentum
    # NOTE(review): this is price minus the constant 1, not a lagged difference - confirm intent
    dataset['momentum'] = dataset['price']-1

    dataset['log_momentum'] = np.log1p(dataset['price'])

    return dataset

  def _clean_data(self, dataset):
    """
      Adds the technical indicators, reindexes to a daily calendar and scales
      every column except 'price' to (-1,1).

      Rows introduced by the reindex are linearly interpolated for the feature
      columns; 'price' is left untouched (NaN on the added days, unscaled).

      Args:
        data (pd.DataFrame())

      Returns:
        dataframe
    """
    dataset = self._calculate_technical_analysis_indicators(dataset)

    # NOTE(review): .date() discards any timezone on the index - confirm the
    # downloaded index is tz-naive, otherwise the reindex below will not align.
    minimum=min(dataset.index).date()
    maximum=max(dataset.index).date()

    idx = pd.date_range(minimum,maximum)
    dataset = dataset.reindex(idx, fill_value=np.nan)

    for val in [col for col in dataset.columns.tolist() if col != 'price']:
      # Assign the result back: in-place interpolation on a column selection
      # is not guaranteed to write through to the frame.
      dataset[val] = dataset[val].interpolate(method='linear')
      scaler = MinMaxScaler(feature_range=(-1, 1))
      dataset[val] = np.ravel(scaler.fit_transform(dataset[val].values.reshape(-1,1)))

    return dataset

In [29]:
# Display name -> Yahoo! Finance ticker symbol for the banks used as features.
# Only the values (tickers) are consumed downstream.
correlated_assets = {
  'JPMorgan': 'JPM',
  'Goldman Sachs': 'GS',
  'Morgan Stanley': 'MS',
  'Wells Fargo': 'WFC',
  'Bank of America': 'BAC',  # was 'BCS', which duplicated the Barclays ticker below
  'Barclays': 'BCS',
  'Deutche Bank': 'DB',
  'Citigroup': 'C',
  'Credit Suisse': 'DHY',  # NOTE(review): confirm this symbol is the intended Credit Suisse instrument
  'UBS Group': 'UBS',
  'HSBC Holdings': 'HSBC'
}

## Sliding Window Backtest
![title](data/time_series_split.jpg)

In [30]:
# Ticker symbols to download, in the order they appear in correlated_assets.
assets = [symbol for symbol in correlated_assets.values()]

# 'JPM_Close' is the dependent column; FetchFinancials renames it to 'price'.
dependent_column = 'JPM_Close'
grab_data = FetchFinancials(assets, dependent_column)

In [31]:
# Download every ticker and outer-join the histories on their date.
data = grab_data._fetch()
# _clean_data computes the technical indicators itself before reindexing and
# scaling, so a separate _calculate_technical_analysis_indicators call is redundant.
data = grab_data._clean_data(data)
# Drop every row that still has a missing value in any column.
data = data.dropna()

In [77]:
class LSTM(nn.Module):
  """
    Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
      input_dim: number of features per time step
      hidden_dim: size of the LSTM hidden state
      num_layers: number of stacked LSTM layers
      output_dim: size of the linear layer's output
  """
  def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
    super(LSTM, self).__init__()
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

    # batch_first=True: inputs are laid out as (batch, seq_len, input_dim)
    self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """
      Args:
        x: tensor of shape (batch, seq_len, input_dim)

      Returns:
        tensor of shape (batch, output_dim)
    """
    # Zero initial states created from x so they share its device and dtype.
    # They are constants, so they do not need gradient tracking.
    h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_dim)
    c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_dim)
    out, _ = self.lstm(x, (h0, c0))
    # Only the output of the final time step feeds the linear head.
    out = self.fc(out[:, -1, :])
    return out

In [78]:
def plot_charts(original,predict,hist):
  """
    Draw a two-panel figure: actual vs. predicted series on the left and the
    training-loss curve on the right.

    Args:
      original: frame whose column 0 is plotted against its index as "Data"
      predict: frame whose column 0 is plotted against its index as the
        training prediction
      hist: per-epoch loss values passed to seaborn as-is
  """
  fig = plt.figure()
  fig.subplots_adjust(hspace=0.2, wspace=0.2)

  # Left panel: both series share one axes
  price_ax = plt.subplot(1, 2, 1)
  sns.lineplot(x=original.index, y=original[0], label="Data", color='royalblue', ax=price_ax)
  sns.lineplot(x=predict.index, y=predict[0], label="Training Prediction (LSTM)", color='tomato', ax=price_ax)
  price_ax.set_title('Stock price', size = 14, fontweight='bold')
  price_ax.set_xlabel("Days", size = 14)
  price_ax.set_ylabel("Cost (USD)", size = 14)
  price_ax.set_xticklabels('', size=10)

  # Right panel: loss history
  loss_ax = plt.subplot(1, 2, 2)
  sns.lineplot(data=hist, color='royalblue', ax=loss_ax)
  loss_ax.set_xlabel("Epoch", size = 14)
  loss_ax.set_ylabel("Loss", size = 14)
  loss_ax.set_title("Training Loss", size = 14, fontweight='bold')

  fig.set_figheight(6)
  fig.set_figwidth(16)

In [107]:
def return_metrics(x_test,y_train,y_train_pred,y_test,model=None,scaler=None):
  """
    Compute train and test RMSE on the un-scaled values.

    Args:
      x_test: tensor fed to the model to obtain test predictions
      y_train: tensor of scaled training targets
      y_train_pred: tensor of scaled training predictions
      y_test: tensor of scaled test targets
      model: callable producing predictions from x_test; when omitted the
        notebook-level variable `model` is used
      scaler: object with inverse_transform; when omitted the notebook-level
        variable `scaler` is used

    Returns:
      (train_score, test_score, y_test_pred) where the scores are RMSE values
      and y_test_pred is the inverse-transformed test prediction array

    Raises:
      NameError: if model/scaler are neither passed nor defined globally
  """
  # Backward compatible fallback to the globals the original relied on.
  if model is None:
    try:
      model = globals()['model']
    except KeyError:
      raise NameError("name 'model' is not defined") from None
  if scaler is None:
    try:
      scaler = globals()['scaler']
    except KeyError:
      raise NameError("name 'scaler' is not defined") from None

  # Evaluation only: no autograd graph is needed for the test forward pass.
  with torch.no_grad():
    y_test_pred = model(x_test)

  # Map everything back to the original units before scoring.
  y_train_pred = scaler.inverse_transform(y_train_pred.detach().numpy())
  y_train = scaler.inverse_transform(y_train.detach().numpy())
  y_test_pred = scaler.inverse_transform(y_test_pred.detach().numpy())
  y_test = scaler.inverse_transform(y_test.detach().numpy())

  train_score = math.sqrt(mean_squared_error(y_train[:,0], y_train_pred[:,0]))
  test_score = math.sqrt(mean_squared_error(y_test[:,0], y_test_pred[:,0]))
  return train_score, test_score, y_test_pred

In [108]:
def plot_predictions(scaler,y, train_index,test_index):
  """
    Plot the train slice, the test slice and the full series of y['price'],
    all inverse-transformed with `scaler`, on one time axis.

    No model output is passed in: the two "prediction" lines are the rows of y
    selected by train_index / test_index, and are NaN elsewhere so that the
    three series line up row by row.

    Args:
      scaler: fitted scaler exposing inverse_transform
      y: frame with a 'price' column holding scaled values
      train_index: positional row indices of the training fold
      test_index: positional row indices of the test fold
  """
  original = scaler.inverse_transform(y['price'].values.reshape(-1,1)).ravel()

  # Slices of different length cannot be stacked side by side, so each slice
  # is placed into a full-length, NaN-padded column instead.
  train_plot = np.full(len(original), np.nan)
  train_plot[train_index] = original[train_index]
  test_plot = np.full(len(original), np.nan)
  test_plot[test_index] = original[test_index]

  result = pd.DataFrame(
    {0: train_plot, 1: test_plot, 2: original},
    index=y.index,
  )

  plt.figure(figsize=(12, 6), dpi=100)

  for col in result.columns:
    result[col].plot()

  plt.title('LSTM Predictions')

  plt.xticks(rotation=70)
  plt.legend(['Train Prediction', 'Test Prediction', 'Ground Truth'])
  plt.xlabel("Days")
  plt.ylabel("USD ($)")

In [109]:
# Remove every stock-split and dividend column in a single drop. Dropping once
# avoids a KeyError if one column name matches both patterns.
corporate_action_cols = [
  val for val in data.columns.tolist()
  if 'Splits' in val or 'Dividends' in val
]
data = data.drop(columns=corporate_action_cols)

In [110]:
# Replace the date index with a plain 0..n-1 range; the splitter below works on row positions.
data = data.reset_index(drop=True)

# Features are every column except the target; the target stays a one-column frame.
X = data.drop(columns='price')
y = data[['price']]

In [103]:
from tqdm import tqdm

In [None]:
# --- Backtest configuration -------------------------------------------------
N_SPLITS = 10
MAX_TRAIN_SIZE = 1000
SEED = 42

# Model / training hyper-parameters. They are defined before the loop because
# num_epochs is needed to size the loss history.
input_dim = 1
hidden_dim = 32
num_layers = 4
output_dim = 1
num_epochs = 50

tscv = TimeSeriesSplit(max_train_size=MAX_TRAIN_SIZE, n_splits=N_SPLITS)

# Per-fold RMSE values
tr_list, test_list = [],[]

for train_index, test_index in tqdm(tscv.split(X), total=N_SPLITS):
  # Fit the target scaler on the training fold only and reuse it for the test
  # fold, so the test prices do not influence the scaling (no look-ahead).
  scaler = MinMaxScaler(feature_range=(-1, 1))
  y_train_scaled = scaler.fit_transform(y['price'].iloc[train_index].values.reshape(-1,1))
  y_test_scaled = scaler.transform(y['price'].iloc[test_index].values.reshape(-1,1))

  # (rows, features) -> (rows, features, 1): each feature becomes one step of
  # a length-`features` sequence with input_dim == 1.
  X_train = torch.from_numpy(np.array(X.iloc[train_index].values)).type(torch.Tensor).unsqueeze(-1)
  X_test = torch.from_numpy(np.array(X.iloc[test_index].values)).type(torch.Tensor).unsqueeze(-1)

  # Targets stay (rows, 1) to match the model output exactly. An extra
  # trailing dimension would make MSELoss broadcast predictions against
  # every target instead of comparing them row by row.
  _y_train = torch.from_numpy(y_train_scaled).type(torch.Tensor)
  _y_test = torch.from_numpy(y_test_scaled).type(torch.Tensor)

  hist = np.zeros(num_epochs)
  start_time = time.time()

  # Same seed for every fold so weight initialisation is reproducible.
  torch.manual_seed(SEED)
  model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
  criterion = torch.nn.MSELoss()
  optimiser = torch.optim.Adam(model.parameters(), lr=0.03)

  # Full-batch training
  for t in range(num_epochs):
    y_train_pred = model(X_train)
    loss = criterion(y_train_pred, _y_train)
    hist[t] = loss.item()
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()

  training_time = time.time()-start_time

  # Un-scaled train predictions / targets, ready for plot_charts(original, predict, hist)
  predict = pd.DataFrame(scaler.inverse_transform(y_train_pred.detach().numpy()))
  original = pd.DataFrame(scaler.inverse_transform(_y_train.detach().numpy()))

  #append train and test scores to list
  # return_metrics reads the current fold's `model` and `scaler` from the notebook namespace.
  train_score, test_score, y_test_pred = return_metrics(X_test,_y_train,y_train_pred,_y_test)
  tr_list.append(train_score); test_list.append(test_score)

  0%|                                        | 0/10 [00:00<?, ?it/s]

In [None]:
# tr_list holds per-fold RMSE values (return_metrics takes the square root of the MSE),
# so the average is labelled RMSE rather than MSE.
print(f"Train RMSE: {sum(tr_list)/len(tr_list)}")

In [None]:
# test_list holds per-fold RMSE values (return_metrics takes the square root of the MSE),
# so the average is labelled RMSE rather than MSE.
print(f"Test RMSE: {sum(test_list)/len(test_list)}")