In [117]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import math
import matplotlib

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

from collections import defaultdict

## Styling Settings

In [118]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

tqdm.pandas()

In [119]:
df = pd.read_csv("../../data/BTC_1min.csv" , parse_dates=["system_time"])
#Sort the data just incase it's not
df = df.sort_values(by="system_time").reset_index(drop=True)
df.drop("Unnamed: 0", inplace = True, axis = 1)
# Keep only the first 5 columns
df = df.iloc[:, :5]
df.head()

Unnamed: 0,system_time,midpoint,spread,buys,sells
0,2021-04-07 11:33:41.122161+00:00,55896.285,0.01,4448599.0,3899402.0
1,2021-04-07 11:34:41.122161+00:00,55948.685,1.43,1243244.0,3606502.0
2,2021-04-07 11:35:41.122161+00:00,56013.785,0.01,3069094.0,1575759.0
3,2021-04-07 11:36:41.122161+00:00,55903.575,7.17,1220819.0,1321487.0
4,2021-04-07 11:37:41.122161+00:00,55899.995,0.01,2011287.0,3083005.0


In [120]:
#Number of records and columns
df.shape

(17113, 5)

## Split Into Traning, Validation and Test


In [121]:
TRAIN_SET = .7
VAL_SET = .15
TEST_SET = .15

# Total number of samples in the data
total_samples = len(df)

# Calculate the number of samples for each split
train_samples = int(total_samples * TRAIN_SET)
val_samples = int(total_samples * VAL_SET)
test_samples = total_samples - train_samples - val_samples

# Split the data into train, validation, and test sets
train_df = df[:train_samples]
val_df = df[train_samples:train_samples+val_samples]
test_df = df[train_samples+val_samples:]

# Print the shapes of the splits
print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

test_df.head()

Train shape: (11979, 5)
Validation shape: (2566, 5)
Test shape: (2568, 5)


Unnamed: 0,system_time,midpoint,spread,buys,sells
14545,2021-04-17 14:57:00.119741+00:00,60763.475,2.23,271816.745971,415404.971343
14546,2021-04-17 14:58:00.119741+00:00,60851.115,0.01,156624.903424,128126.170225
14547,2021-04-17 14:59:00.119741+00:00,60779.705,0.01,153774.17775,350202.171777
14548,2021-04-17 15:00:00.119741+00:00,60722.08,5.0,169720.641668,88220.281238
14549,2021-04-17 15:01:00.119741+00:00,60671.945,0.01,163924.215313,812911.422822


## Normalise the Data

In [122]:
train_df = train_df.set_index('system_time')
val_df = test_df.set_index('system_time')
test_df = test_df.set_index('system_time')


scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

train_df = pd.DataFrame(
    scaler.transform(train_df),
    index = train_df.index,
    columns = train_df.columns
)

val_df = pd.DataFrame(
    scaler.transform(val_df),
    index = test_df.index,
    columns = test_df.columns
)

test_df = pd.DataFrame(
    scaler.transform(test_df),
    index = test_df.index,
    columns = test_df.columns
)

train_df.head()

Unnamed: 0_level_0,midpoint,spread,buys,sells
system_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-07 11:33:41.122161+00:00,-0.901161,-1.0,-0.221796,-0.267346
2021-04-07 11:34:41.122161+00:00,-0.890046,-0.91615,-0.782711,-0.32238
2021-04-07 11:35:41.122161+00:00,-0.876237,-1.0,-0.4632,-0.703947
2021-04-07 11:36:41.122161+00:00,-0.899615,-0.577207,-0.786635,-0.751723
2021-04-07 11:37:41.122161+00:00,-0.900374,-1.0,-0.648308,-0.420743


## Create Sequences

In [123]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length=3):
  sequences = []
  data_size = len(input_data)

  for i in tqdm(range(data_size - sequence_length)):
    sequence = input_data[i:i+sequence_length]
    label_position = i + sequence_length
    label = input_data.iloc[label_position][target_column]

    sequences.append((sequence, label))

  return sequences

In [124]:
SEQUENCE_LENGTH = 100

train_sequences = create_sequences(train_df, "midpoint", sequence_length=SEQUENCE_LENGTH)
val_sequences = create_sequences(val_df, "midpoint", sequence_length=SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "midpoint", sequence_length=SEQUENCE_LENGTH)

  0%|          | 0/11879 [00:00<?, ?it/s]

  0%|          | 0/2468 [00:00<?, ?it/s]

  0%|          | 0/2468 [00:00<?, ?it/s]

In [125]:
# Let's check first sequence label and data
print("Label: ", train_sequences[0][1])
print("")
print("Sequence: ",train_sequences[0][0])
print("Sequence Shape: ",train_sequences[0][0].shape)

Label:  -0.736226848537493

Sequence:                                    midpoint    spread      buys     sells
system_time                                                             
2021-04-07 11:33:41.122161+00:00 -0.901161 -1.000000 -0.221796 -0.267346
2021-04-07 11:34:41.122161+00:00 -0.890046 -0.916150 -0.782711 -0.322380
2021-04-07 11:35:41.122161+00:00 -0.876237 -1.000000 -0.463200 -0.703947
2021-04-07 11:36:41.122161+00:00 -0.899615 -0.577207 -0.786635 -0.751723
2021-04-07 11:37:41.122161+00:00 -0.900374 -1.000000 -0.648308 -0.420743
...                                    ...       ...       ...       ...
2021-04-07 13:08:41.122161+00:00 -0.754964 -0.936227 -0.921722 -0.968393
2021-04-07 13:09:41.122161+00:00 -0.751068 -1.000000 -0.883525 -0.919799
2021-04-07 13:10:41.122161+00:00 -0.762495 -1.000000 -0.956782 -0.959068
2021-04-07 13:11:41.122161+00:00 -0.759446 -1.000000 -0.922827 -0.993496
2021-04-07 13:12:41.122161+00:00 -0.749897 -1.000000 -0.916512 -0.992701

[100 rows x

In [126]:
len(train_sequences), len(val_sequences),len(test_sequences)


(11879, 2468, 2468)

In [127]:
class BTCDataset(Dataset):
  def __init__(self, sequences):
    self.sequences = sequences

  def __len__(self):
    return len(self.sequences)

  def __getitem__(self, idx):
    sequence, label = self.sequences[idx]
    return dict(
        sequence = torch.Tensor(sequence.to_numpy()),
        label = torch.tensor(label).float()
    )

In [128]:
class BTCPriceDataModule(pl.LightningDataModule):
  def __init__(
      self, train_sequences, val_sequences,test_sequences, batch_size = 8
  ):
    super().__init__()
    self.train_sequences = train_sequences
    self.val_sequences = val_sequences
    self.test_sequences = test_sequences
    self.batch_size = batch_size

  def setup(self, stage=None):
    self.train_dataset = BTCDataset(self.train_sequences)
    self.val_dataset = BTCDataset(self.val_sequences)
    self.test_dataset = BTCDataset(self.test_sequences)

  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = False,
        num_workers = 3
    )

  def val_dataloader(self):
    return DataLoader(
        self.val_dataset,
        batch_size = self.batch_size,
        shuffle = False,
        num_workers =3
    )
  def test_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = self.batch_size,
        shuffle = False,
        num_workers =6
    )

In [129]:
N_EPOCHS = 20
BATCH_SIZE = 128

data_module = BTCPriceDataModule(train_sequences, val_sequences,test_sequences, batch_size = BATCH_SIZE)
data_module.setup()

In [130]:
train_dataset = BTCDataset(train_sequences)

In [131]:
# Testing our dataloader
a = iter(train_dataset)
b = next(a)
print("Sequence Shape: ", b["sequence"].shape)
print("Label: {} and Label Shape: {}".format(b["label"], b["label"].shape) )

Sequence Shape:  torch.Size([100, 4])
Label: -0.7362268567085266 and Label Shape: torch.Size([])


In [132]:
class PricePredictionModel(nn.Module):
  def __init__(self, n_features, n_hidden=32, n_layers=2):
    super().__init__()

    self.n_hidden = n_hidden

    self.lstm = nn.LSTM(
        input_size = n_features,
        hidden_size = n_hidden,
        batch_first = True,
        num_layers = n_layers, # Stack LSTMs
        dropout = 0.2
    )

    self.regressor = nn.Linear(n_hidden, 1)

  def forward(self, x):
    self.lstm.flatten_parameters()  # For distrubuted training

    _, (hidden, _) = self.lstm(x)
    # We want the output from the last layer to go into the final
    # regressor linear layer
    out = hidden[-1]

    return self.regressor(out)

In [133]:
class BTCPricePredictor(pl.LightningModule):

  def __init__(self, n_features: int):
    super().__init__()
    self.model = PricePredictionModel(n_features)
    self.criterion = nn.MSELoss()

  def forward(self, x, labels=None):
    output = self.model(x)

    loss = 0

    if labels is not None:
      loss = self.criterion(output, labels.unsqueeze(dim=1))

    return loss, output

  def training_step(self, batch, batch_idx):
    sequences = batch["sequence"]
    labels = batch["label"]

    loss, output = self.forward(sequences, labels)

    self.log("train_loss", loss, prog_bar=True, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    sequences = batch["sequence"]
    labels = batch["label"]

    loss, output = self.forward(sequences, labels)

    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    sequences = batch["sequence"]
    labels = batch["label"]

    loss, output = self.forward(sequences, labels)

    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def configure_optimizers(self):
    return optim.Adam(self.model.parameters(), lr=0.01)

In [134]:
n_features = b["sequence"].shape[1]


model = BTCPricePredictor(n_features = n_features)

In [None]:
for item in data_module.train_dataloader():
  print(item["sequence"].shape)
  print(item["label"].shape)
  break

In [None]:
# Starting tensorboard
%reload_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k = 1,
    verbose = True,
    monitor = "val_loss",
    mode = "min"
)

logger = TensorBoardLogger("lightning_logs", name = "btc-price")

early_stopping_callback = EarlyStopping(monitor = "val_loss", patience = 5)

In [None]:
trainer = pl.Trainer(
    logger = logger,
    callbacks = [early_stopping_callback , checkpoint_callback],
    max_epochs = N_EPOCHS,
    gpus = 1
)


In [None]:
trainer.fit(model, data_module)

In [None]:
checkpoint_path = "checkpoints/best-checkpoint-v12.ckpt"

trained_model = BTCPricePredictor.load_from_checkpoint(
    checkpoint_path,
    n_features = n_features
)

# Freezing the model for faster predictions
trained_model.freeze()

In [None]:
test_dataset = BTCDataset(test_sequences)
predictions = []
labels = []

for item in tqdm(test_dataset):
  sequence = item["sequence"]
  label = item["label"]

  _, output = trained_model(sequence)
  predictions.append(output.item())
  labels.append(label.item())

In [None]:
len(predictions), len(test_df)

In [None]:
len(test_df) - SEQUENCE_LENGTH

In [None]:
# The values are normalised using MinMaxScaler
test_df.head()

In [None]:
descaler = MinMaxScaler()
descaler.min_, descaler.scale_ = scaler.min_[-1], scaler.scale_[-1]


def descale(descaler, values):
  # Scaler works only with 2D data
  values_2d = np.array(values)[:, np.newaxis]
  return descaler.inverse_transform(values_2d).flatten()

predictions_descaled = descale(descaler, predictions)
labels_descaled = descale(descaler, labels)

print(predictions_descaled[:3])
print(labels_descaled[:3])

In [None]:
test_data = df[train_samples+val_samples:]
len(test_data), len(test_df)

test_sequences_data = test_data.iloc[SEQUENCE_LENGTH:]
len(test_sequences_data), len(test_sequences)

In [None]:
test_sequences_data.head()

In [None]:
predictions_descaled

In [None]:
dates = test_sequences_data.system_time.tolist()
plt.plot_date(dates, predictions_descaled, "-", label= "Predicted", color = "g")
plt.plot_date(dates, labels_descaled, "--", label = "Real", color = "b")
plt.xticks(rotation =45)
plt.legend()