In [None]:
!pip install pandas_ta
!pip install --upgrade yfinance

In [None]:
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pandas as pd
import pandas_ta as ta
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor

import torch
import torch.optim as optim
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

import yfinance

from xgboost import XGBRegressor

# Define models

In [None]:
class LSTM_Model(nn.Module):
    """Stacked LSTM followed by a one-unit linear head and a ReLU.

    Args:
        input_size: number of features per time step passed to ``nn.LSTM``.
        hidden_size: size of the LSTM hidden state (and input width of the head).
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size=1, hidden_size=50, num_layers=1):
        super().__init__()
        recurrent_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.lstm = nn.LSTM(**recurrent_config)
        self.linear = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return one non-negative value per time step of ``x``.

        ``x`` is forwarded to ``nn.LSTM`` unchanged, so a 3-D tensor is read as
        (batch, seq, input_size) and a 2-D tensor as a single unbatched sequence
        (seq, input_size).
        """
        hidden_seq, _final_state = self.lstm(x)
        # ReLU clamps the regression output at zero.
        return F.relu(self.linear(hidden_seq))

class GRU_Model(nn.Module):
    """Stacked GRU followed by a one-unit linear head and a ReLU.

    Args:
        input_size: number of features per time step passed to ``nn.GRU``.
        hidden_size: size of the GRU hidden state (and input width of the head).
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size=1, hidden_size=50, num_layers=1):
        super().__init__()
        recurrent_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.gru = nn.GRU(**recurrent_config)
        self.linear = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return one non-negative value per time step of ``x``.

        ``x`` is forwarded to ``nn.GRU`` unchanged, so a 3-D tensor is read as
        (batch, seq, input_size) and a 2-D tensor as a single unbatched sequence
        (seq, input_size).
        """
        hidden_seq, _final_state = self.gru(x)
        # ReLU clamps the regression output at zero.
        return F.relu(self.linear(hidden_seq))

# Function to add trading indicators to the dataset

In [None]:
def add_indicators(dataset_df, macd=False, rsi=False, bbands=False, obv=False, sma=False, ema=False, stoch=False, adx=False):
  """Append the requested pandas_ta indicators to ``dataset_df`` and return it.

  Each flag enables one indicator, computed through the frame's ``.ta`` accessor
  with ``append=True``. SMA and EMA use ``length=20``; the others use the
  accessor's defaults. The same frame object that was passed in is returned.
  """
  # (enabled, accessor method name, extra keyword arguments), in application order.
  indicator_plan = (
    (macd, "macd", {}),
    (rsi, "rsi", {}),
    (bbands, "bbands", {}),
    (obv, "obv", {}),
    (sma, "sma", {"length": 20}),
    (ema, "ema", {"length": 20}),
    (stoch, "stoch", {}),
    (adx, "adx", {}),
  )

  for enabled, method_name, extra_kwargs in indicator_plan:
    if not enabled:
      continue
    # The accessor is only touched for indicators that were actually requested.
    getattr(dataset_df.ta, method_name)(append=True, **extra_kwargs)

  return dataset_df

In [None]:
# Shared settings for every yfinance download in this notebook.
interval = '1d'
data_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

# Date window passed as start/end to Ticker.history.
startDate = datetime.datetime(year=2020, month=7, day=1)
endDate = datetime.datetime(year=2023, month=12, day=1)

In [None]:
def _download_ohlcv(symbol):
  """Download one symbol and return ``(ticker, history, frame)``.

  ``history`` is the raw ``Ticker.history`` result for the notebook's
  ``startDate``/``endDate``/``interval``. ``frame`` keeps only ``data_columns``,
  cast to float32, with the index replaced by a 0..n-1 range, and is passed
  through ``add_indicators`` with every indicator switched off.
  """
  ticker = yfinance.Ticker(symbol)
  history = ticker.history(start=startDate, end=endDate, interval=interval)
  frame = history[data_columns].astype('float32').reset_index(drop=True)
  return ticker, history, add_indicators(frame)


# NOTE(review): reset_index(drop=True) discards the date index, so anything that
# later combines these frames column-wise lines rows up by position, not by date.
# Presumably the symbols do not all share one trading calendar — verify.
SPY_ticker, SPY_data, spy_df = _download_ohlcv("SPY")
RUT_ticker, RUT_data, rut_df = _download_ohlcv("^RUT")
GOLD_ticker, GOLD_data, gold_df = _download_ohlcv("GC=F")
IR_ticker, IR_data, ir_df = _download_ohlcv("^TNX")
nasdaq_ticker, nasdaq_data, nasdaq_df = _download_ohlcv("^IXIC")
# Crude-oil futures; this previously re-downloaded "^IXIC", duplicating the NASDAQ series.
oil_ticker, oil_data, oil_df = _download_ohlcv("CL=F")
china_index_ticker, china_index_data, china_index_df = _download_ohlcv("^HSCE")
DJI_ticker, DJI_data, dji_df = _download_ohlcv("^DJI")

# Add new features to the dataset

In [None]:
def _add_macro_columns(dataset_df, prefix, macro_df):
  """Copy Close/Volume/High/Low of ``macro_df`` into ``dataset_df`` under ``prefix`` and add its range and change."""
  dataset_df[prefix + '_Close'] = macro_df['Close']
  dataset_df[prefix + '_Vol'] = macro_df['Volume']
  dataset_df[prefix + '_High'] = macro_df['High']
  dataset_df[prefix + '_Low'] = macro_df['Low']

  dataset_df[prefix + '_Volatility'] = macro_df['High'] - macro_df['Low']
  dataset_df[prefix + '_Change'] = macro_df['Close'] - macro_df['Open']


def add_columns_to_the_dataset(dataset_df, spy=False, rut=False, gold=False, ir=False, nasdaq=False, oil=False, china_index=False, dji=False):
  """Add derived columns to ``dataset_df`` in place and return it.

  Always adds:
    * ``Volatility`` = High - Low
    * ``Change``     = Close - Open (same definition as every ``*_Change`` column)

  Each flag additionally copies Close/Volume/High/Low plus the matching
  ``_Volatility`` and ``_Change`` columns from the corresponding module-level
  frame (``spy_df``, ``rut_df``, ``gold_df``, ``ir_df``, ``nasdaq_df``,
  ``oil_df``, ``china_index_df``, ``dji_df``). A frame is only looked up when its
  flag is set.

  NOTE(review): the assignments align on the frames' index labels; with
  range-indexed frames that means row position, not calendar date — confirm
  that this is intended.
  """
  dataset_df['Volatility'] = dataset_df['High'] - dataset_df['Low']
  # Was Close - High, which disagreed with every *_Change column below.
  dataset_df['Change'] = dataset_df['Close'] - dataset_df['Open']

  if spy:
    _add_macro_columns(dataset_df, 'SPY', spy_df)
  if rut:
    _add_macro_columns(dataset_df, 'RUT', rut_df)
  if gold:
    _add_macro_columns(dataset_df, 'GOLD', gold_df)
  if ir:
    _add_macro_columns(dataset_df, 'IR', ir_df)
  if nasdaq:
    _add_macro_columns(dataset_df, 'NASDAQ', nasdaq_df)
  if oil:
    _add_macro_columns(dataset_df, 'OIL', oil_df)
  if china_index:
    _add_macro_columns(dataset_df, 'CHINA_INDEX', china_index_df)
  if dji:
    _add_macro_columns(dataset_df, 'DJI', dji_df)

  return dataset_df

In [None]:
def train_model(optimizer, loss_fn, model, X_train, y_train, X_test, y_test, n_epochs=20, batch_size=8, verbose=False):
  """Train ``model`` with mini-batches and return the best test RMSE seen.

  Args:
    optimizer: optimizer already bound to ``model``'s parameters.
    loss_fn: callable returning a scalar tensor; its square root is reported as
      RMSE, so it is expected to be a mean-squared-error loss.
    model: module whose output is flattened with ``ravel()`` before the loss.
    X_train, y_train: training tensors, served in shuffled batches.
    X_test, y_test: tensors evaluated in full after every epoch.
    n_epochs: number of passes over the training loader.
    batch_size: mini-batch size of the training loader.
    verbose: when True, also evaluate the train RMSE and print one line per epoch.

  Returns:
    The lowest per-epoch test RMSE as a Python float, or ``float("inf")`` if no
    epoch produced a comparable value (e.g. ``n_epochs=0`` or NaN losses).
  """
  loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
  # An infinite sentinel keeps the return type a float even if no epoch improves on it.
  best_rmse = float("inf")

  for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
      y_pred = model(X_batch).ravel()
      loss = loss_fn(y_pred, y_batch)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    model.eval()
    with torch.no_grad():
      test_rmse = torch.sqrt(loss_fn(model(X_test).ravel(), y_test)).item()
      if verbose:
        # The train-set pass is only needed for the progress line.
        train_rmse = torch.sqrt(loss_fn(model(X_train).ravel(), y_train)).item()
        print("Epoch %d: train RMSE %.4f, test RMSE %.4f" % (epoch, train_rmse, test_rmse))

    if test_rmse < best_rmse:
      best_rmse = test_rmse

  return best_rmse

def train_models(dataset, feature_columns, time_step=1, selected_features=[3], target_feature=3):
  """Train an LSTM and a GRU on windows of ``dataset`` and print each best test RMSE.

  Args:
    dataset: 2-D array of rows by features, forwarded to ``create_dataset``.
    feature_columns: unused here; kept so existing calls keep working.
    time_step: number of consecutive rows flattened into one sample.
    selected_features: column indices used as inputs (never mutated).
    target_feature: column index of the value to predict.

  Returns:
    Tuple ``(lstm_best_rmse, gru_best_rmse)``.
  """
  X, y = create_dataset(dataset, time_step, selected_features, target_feature, model=True)

  X_train, y_train, X_test, y_test = train_test_split(X, y)

  # Both networks are sized to the flattened window width.
  n_inputs = X_train.shape[1]

  print("LSTM: ")
  lstm_model = LSTM_Model(input_size=n_inputs, hidden_size=50, num_layers=2)
  optimizer = optim.SGD(lstm_model.parameters(), lr=0.1, momentum=0.9)
  loss_fn = nn.MSELoss()

  lstm_best_rmse = train_model(optimizer, loss_fn, lstm_model, X_train, y_train, X_test, y_test, n_epochs=40)
  print("LSTM best = " + str(lstm_best_rmse))

  print("GRU: ")
  gru_model = GRU_Model(input_size=n_inputs, hidden_size=50, num_layers=2)
  # Constant learning rate: train_model never steps a scheduler, so none is created here.
  optimizer = optim.SGD(gru_model.parameters(), lr=0.1, momentum=0.9)
  loss_fn = nn.MSELoss()

  gru_best_rmse = train_model(optimizer, loss_fn, gru_model, X_train, y_train, X_test, y_test, n_epochs=40)
  print("GRU best = " + str(gru_best_rmse))

  return lstm_best_rmse, gru_best_rmse


In [None]:
def create_dataset(dataset, time_step=1, feature_indices=0, target_index=3, model=False):
  """Build sliding-window samples from a 2-D array.

  Sample ``i`` flattens rows ``i .. i+time_step-1`` of the ``feature_indices``
  columns; its target is column ``target_index`` of row ``i+time_step``. Every
  window that has a following row is used, including the one whose target is
  the last row. Samples containing a NaN feature are dropped.

  Args:
    dataset: 2-D numpy array (rows by features).
    time_step: number of consecutive rows per sample.
    feature_indices: column index or sequence of column indices used as inputs.
    target_index: column index of the target.
    model: when True, return torch tensors instead of numpy arrays.

  Returns:
    ``(X, y)``; both are empty when ``dataset`` has no complete window.
  """
  n_windows = max(len(dataset) - time_step, 0)

  dataX, dataY = [], []
  for i in range(n_windows):
    dataX.append(dataset[i:(i+time_step), feature_indices].flatten())
    dataY.append(dataset[i+time_step, target_index])

  X, y = np.array(dataX), np.array(dataY)

  # With no samples X is 1-D and has no axis 1, so the filter is skipped.
  if n_windows > 0:
    keep = ~np.isnan(X).any(axis=1)
    X, y = X[keep], y[keep]

  if model:
    X, y = torch.tensor(X), torch.tensor(y)

  return X, y

def train_test_split(X, y, train_fraction=0.75):
  """Split ``X`` and ``y`` chronologically, without shuffling.

  The first ``int(len(X) * train_fraction)`` samples form the training part and
  the remainder the test part. The four shapes are printed.

  Returns:
    ``(X_train, y_train, X_test, y_test)``
  """
  cut = int(X.shape[0] * train_fraction)
  head, tail = slice(None, cut), slice(cut, None)

  X_train, X_test = X[head], X[tail]
  y_train, y_test = y[head], y[tail]

  print("X_train shape: " + str(X_train.shape), "y_train shape: " + str(y_train.shape))
  print("X_test shape: " + str(X_test.shape), "y_test shape: " + str(y_test.shape))

  return X_train, y_train, X_test, y_test

def run_experiment(dataset, feature_columns, time_step=1, selected_features=[3], target_feature=3):
  """Fit XGBoost and Random Forest, plot feature importances, then train the LSTM/GRU models.

  Args:
    dataset: 2-D array of rows by features (already scaled by the caller).
    feature_columns: column names, indexable by the entries of ``selected_features``.
    time_step: number of consecutive rows flattened into one sample.
    selected_features: column indices used as inputs (never mutated).
    target_feature: column index of the value to predict.

  Prints the test RMSE of both tree models, draws one horizontal bar chart with
  four importance measures per feature, and finally calls ``train_models`` with
  the same columns and target.
  """
  # Names follow the flattened window layout: time step outermost, feature innermost.
  feature_names = [feature_columns[index] + "_" + str(i) for i in range(time_step) for index in selected_features]
  X, y = create_dataset(dataset, time_step, selected_features, target_feature)

  X_train, y_train, X_test, y_test = train_test_split(X, y)

  xgb_model = XGBRegressor(importance_type='gain')
  xgb_model.fit(X_train, y_train)
  pred = xgb_model.predict(X_test)

  # RMSE Computation
  rmse = np.sqrt(MSE(y_test, pred))
  print("XGBoost RMSE : % f" %(rmse))

  rf_model = RandomForestRegressor(max_depth=4)
  rf_model.fit(X_train, y_train)
  pred = rf_model.predict(X_test)

  # RMSE Computation
  rmse = np.sqrt(MSE(y_test, pred))
  print("Random Forest RMSE : % f" %(rmse))

  perm_importance_xgb = permutation_importance(xgb_model, X_test, y_test)
  perm_importance_rf = permutation_importance(rf_model, X_test, y_test)

  # One column per importance measure, one row per input feature.
  importances_df = pd.DataFrame(
    {
      'XGB': xgb_model.feature_importances_,
      'RF': rf_model.feature_importances_,
      'Perm on XGB': perm_importance_xgb.importances_mean,
      'Perm on RF': perm_importance_rf.importances_mean,
    },
    index=feature_names,
  )
  importances_df.plot.barh(title='Feature importance', figsize=(10, 10))

  # Use this call's own arguments; the global dataset_df and a hard-coded target were used before.
  train_models(dataset, feature_columns, time_step=time_step, selected_features=selected_features, target_feature=target_feature)

# Amazon base

In [None]:
# Baseline AMZN run: only the raw data_columns, no indicators or macro columns.
AMZN_ticker = yfinance.Ticker("AMZN")
AMZN_data = AMZN_ticker.history(start=startDate, end=endDate, interval=interval)

# Keep the selected columns as float32 and replace the index with 0..n-1.
dataset_df = AMZN_data[data_columns].astype('float32').reset_index(drop=True)

# NOTE(review): the scaler is fit on the whole series, before run_experiment
# splits it into train and test parts — confirm this is acceptable.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

# target_feature=0 is the first column of dataset_df ('Open' in data_columns order).
run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# Amazon all indicators

In [None]:
# AMZN with every indicator flag enabled; no macro columns are added, only the
# frame's own Volatility and Change.
dataset_df = AMZN_data[data_columns].astype('float32').reset_index(drop=True)
dataset_df = add_indicators(dataset_df, macd=True, rsi=True, bbands=True, obv=True, sma=True, ema=True, stoch=True, adx=True)
dataset_df = add_columns_to_the_dataset(dataset_df, spy=False, rut=False, gold=False, ir=False, nasdaq=False, oil=False, china_index=False, dji=False)
# Re-cast so that every added column is float32 as well.
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

# Samples with NaN features (presumably the indicator warm-up rows) are dropped in create_dataset.
run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# Amazon chosen indicators

In [None]:
# AMZN with the chosen indicator subset: MACD, Bollinger Bands and OBV.
dataset_df = AMZN_data[data_columns].astype('float32').reset_index(drop=True)

dataset_df = add_indicators(dataset_df, macd=True, rsi=False, bbands=True, obv=True, sma=False, ema=False, stoch=False, adx=False)
# No macro columns; only the frame's own Volatility and Change are added.
dataset_df = add_columns_to_the_dataset(dataset_df, spy=False, rut=False, gold=False, ir=False, nasdaq=False, oil=False, china_index=False, dji=False)
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# Amazon all macrofactors added

In [None]:
# AMZN with all eight macro frames joined in; no indicators.
dataset_df = AMZN_data[data_columns].astype('float32').reset_index(drop=True)

# All flags are False, so this call adds no indicator columns.
dataset_df = add_indicators(dataset_df, macd=False, rsi=False, bbands=False, obv=False, sma=False, ema=False, stoch=False, adx=False)
# NOTE(review): both sides have a 0..n-1 index, so the macro columns line up by
# row position rather than by date — confirm the series share a calendar.
dataset_df = add_columns_to_the_dataset(dataset_df, spy=True, rut=True, gold=True, ir=True, nasdaq=True, oil=True, china_index=True, dji=True)
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# Amazon chosen macrofactors

In [None]:
# AMZN with the chosen macro subset: rut_df, ir_df and oil_df; no indicators.
dataset_df = AMZN_data[data_columns].astype('float32').reset_index(drop=True)

# All flags are False, so this call adds no indicator columns.
dataset_df = add_indicators(dataset_df, macd=False, rsi=False, bbands=False, obv=False, sma=False, ema=False, stoch=False, adx=False)
# NOTE(review): macro columns line up by row position, not by date — confirm.
dataset_df = add_columns_to_the_dataset(dataset_df, spy=False, rut=True, gold=False, ir=True, nasdaq=False, oil=True, china_index=False, dji=False)
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# Amazon final

In [None]:
# Final AMZN configuration: chosen indicators plus chosen macro frames, with a
# two-row window.
dataset_df = AMZN_data[data_columns].astype('float32').reset_index(drop=True)

dataset_df = add_indicators(dataset_df, macd=True, rsi=False, bbands=True, obv=True, sma=False, ema=False, stoch=False, adx=False)
# NOTE(review): macro columns line up by row position, not by date — confirm.
dataset_df = add_columns_to_the_dataset(dataset_df, spy=False, rut=True, gold=False, ir=True, nasdaq=False, oil=True, china_index=False, dji=False)
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# Each sample flattens two consecutive rows of every column.
time_step = 2
selected_features = range(len(dataset_df.columns))

run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# BTC base model

In [None]:
# Baseline BTC-USD run: only the raw data_columns, no indicators or macro columns.
BTC_ticker = yfinance.Ticker("BTC-USD")
BTC_data = BTC_ticker.history(start=startDate, end=endDate, interval=interval)

dataset_df = BTC_data[data_columns].astype('float32').reset_index(drop=True)
# Second cast is a no-op here: the frame is already float32.
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

# target_feature=0 is the first column of dataset_df ('Open' in data_columns order).
run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# BTC all indicators

In [None]:
# BTC with every indicator flag enabled; no macro columns are added, only the
# frame's own Volatility and Change.
dataset_df = BTC_data[data_columns].astype('float32').reset_index(drop=True)
dataset_df = add_indicators(dataset_df, macd=True, rsi=True, bbands=True, obv=True, sma=True, ema=True, stoch=True, adx=True)
dataset_df = add_columns_to_the_dataset(dataset_df, spy=False, rut=False, gold=False, ir=False, nasdaq=False, oil=False, china_index=False, dji=False)
# Re-cast so that every added column is float32 as well.
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

# Samples with NaN features (presumably the indicator warm-up rows) are dropped in create_dataset.
run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# BTC chosen indicators

In [None]:
# BTC with the chosen indicator subset: RSI and Bollinger Bands.
dataset_df = BTC_data[data_columns].astype('float32').reset_index(drop=True)
dataset_df = add_indicators(dataset_df, macd=False, rsi=True, bbands=True, obv=False, sma=False, ema=False, stoch=False, adx=False)
# No macro columns; only the frame's own Volatility and Change are added.
dataset_df = add_columns_to_the_dataset(dataset_df, spy=False, rut=False, gold=False, ir=False, nasdaq=False, oil=False, china_index=False, dji=False)
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# BTC all macrofactors

In [None]:
# BTC with all eight macro frames joined in; no indicators.
dataset_df = BTC_data[data_columns].astype('float32').reset_index(drop=True)
# All flags are False, so this call adds no indicator columns.
dataset_df = add_indicators(dataset_df, macd=False, rsi=False, bbands=False, obv=False, sma=False, ema=False, stoch=False, adx=False)
# NOTE(review): both sides have a 0..n-1 index, so the macro columns line up by
# row position rather than by date. Presumably BTC-USD has more rows than the
# exchange-traded series; unmatched rows become NaN and are dropped in
# create_dataset — verify.
dataset_df = add_columns_to_the_dataset(dataset_df, spy=True, rut=True, gold=True, ir=True, nasdaq=True, oil=True, china_index=True, dji=True)
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# BTC chosen macrofactors

In [None]:
# BTC with the chosen macro subset: only ir_df; no indicators.
dataset_df = BTC_data[data_columns].astype('float32').reset_index(drop=True)
# All flags are False, so this call adds no indicator columns.
dataset_df = add_indicators(dataset_df, macd=False, rsi=False, bbands=False, obv=False, sma=False, ema=False, stoch=False, adx=False)
# NOTE(review): macro columns line up by row position, not by date — confirm.
dataset_df = add_columns_to_the_dataset(dataset_df, spy=False, rut=False, gold=False, ir=True, nasdaq=False, oil=False, china_index=False, dji=False)
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# One row per sample; every column is an input feature.
time_step = 1
selected_features = range(len(dataset_df.columns))

run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)

# BTC final

In [None]:
# Final BTC configuration: chosen indicators (RSI, Bollinger Bands) plus the
# ir_df macro frame, with a two-row window.
dataset_df = BTC_data[data_columns].astype('float32').reset_index(drop=True)
dataset_df = add_indicators(dataset_df, macd=False, rsi=True, bbands=True, obv=False, sma=False, ema=False, stoch=False, adx=False)
# NOTE(review): macro columns line up by row position, not by date — confirm.
dataset_df = add_columns_to_the_dataset(dataset_df, spy=False, rut=False, gold=False, ir=True, nasdaq=False, oil=False, china_index=False, dji=False)
dataset_df = dataset_df.astype('float32')

# NOTE(review): the scaler is fit on the whole series, before the train/test split.
scaler = MinMaxScaler(feature_range=(0,1))
dataset = scaler.fit_transform(dataset_df)

# Each sample flattens two consecutive rows of every column.
time_step = 2
selected_features = range(len(dataset_df.columns))

run_experiment(dataset, dataset_df.columns, time_step=time_step, selected_features=selected_features, target_feature=0)