# Install packages

In [None]:
# Optional one-time setup: uncomment the lines below to install these
# packages into the current environment (`-q` keeps pip's output quiet).
# Note: numpy, cupy, pandas, tqdm and matplotlib are imported further down
# but are not installed by this cell.
# !pip3 install gdown -q
# !pip3 install polars -q
# !pip3 install numba -q
# !pip3 install numba-progress -q

# Imports

In [215]:
import gdown
import numpy as np
import cupy as cp
import pandas as pd
import polars as pl
from tqdm import tqdm
import matplotlib.pyplot as plt
import numba as nb
from numba import njit
import time 

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [216]:
from numba_functions import numba_imb, numba_calculate_past_returns, numba_log_returns, numba_data_autocorrelation, numba_data_realized_kernel
from numpy_functions import numpy_imb, numpy_calculate_past_returns, numpy_log_returns, numpy_data_autocorrelation, numpy_data_realized_kernel
from cupy_functions import cupy_imb, cupy_calculate_past_returns, cupy_log_returns, cupy_data_autocorrelation, cupy_data_realized_kernel
%autoreload 2

# Download data

In [None]:
# Google Drive share links paired with the local file each one is saved to.
# `fuzzy=True` is passed so gdown accepts the ".../view?usp=sharing" form of
# the link; `quiet=True` suppresses its progress output.
drive_files = (
    ("https://drive.google.com/file/d/15E15XFVD8laDXNhw7uPpcPiWw-IGkYfv/view?usp=sharing",
     "book2h.parquet"),
    ("https://drive.google.com/file/d/1iTOy1bcQlgyuz-V2U-zgCFfzOuRm7NyN/view?usp=sharing",
     "ticker2h.parquet"),
    ("https://drive.google.com/file/d/1Ro-3FbjQC2FDYIg1UZG7TIFEWOQkQ4uG/view?usp=sharing",
     "trades2h.parquet"),
)

for url, output in drive_files:
    gdown.download(url=url, output=output, fuzzy=True, quiet=True)

# Data Preparation

In [85]:
# Read the three downloaded parquet files into polars DataFrames.
book = pl.read_parquet('book2h.parquet')
ticker = pl.read_parquet('ticker2h.parquet')

# `side` is cast to Float32 with strict=False, so values polars cannot
# convert do not raise during the cast.
side_as_float = pl.col('side').cast(pl.Float32, strict=False)
trades = pl.read_parquet('trades2h.parquet').with_columns(side_as_float)

In [87]:
# Plain NumPy copies of the three frames; the benchmarks below slice these.
book_numpy, ticker_numpy, trades_numpy = (
    frame.to_numpy() for frame in (book, ticker, trades)
)

In [4]:
# Disabled step (kept for reference): for each frame it would flag
# `local_ts` as sorted and replace it with a datetime parsed from epoch
# nanoseconds. Nothing in this cell is executed.
# book = book.set_sorted('local_ts').with_columns(
#     pl.from_epoch(pl.col("local_ts"), time_unit="ns"))

# ticker = ticker.set_sorted('local_ts').with_columns(
#     pl.from_epoch(pl.col("local_ts"), time_unit="ns"))

# trades = trades.set_sorted('local_ts').with_columns(
#     pl.from_epoch(pl.col("local_ts"), time_unit="ns"))

# Feature generation

### Improved order book imbalances

In [39]:
# Cumulative row counts that cut the order book into ten nearly equal
# prefixes; the benchmarks below time each implementation on
# book_numpy[:size] for every size in this array.
# The remainder rows are handed out one each to the first `extra_rows`
# chunks. A `<=` comparison here would give one chunk too many an extra row
# and push the final cumulative size to one more than the number of rows.
base_rows, extra_rows = divmod(book_numpy.shape[0], 10)
ob_sizes_array = np.full(10, base_rows)
ob_sizes_array[:extra_rows] += 1
ob_sizes_array = np.cumsum(ob_sizes_array)

In [40]:
# Time the Numba order-book imbalance on growing prefixes of the book.
# NOTE(review): if numba_imb is JIT-compiled on its first call, the first
# sample also contains compilation time — confirm before comparing backends.
numba_times = []

for n_rows in tqdm(ob_sizes_array):
    tic = time.perf_counter()
    numba_imbalance = np.array(numba_imb(book_numpy[:n_rows]))
    elapsed = time.perf_counter() - tic
    numba_times.append(elapsed)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:01<00:00,  6.14it/s]


In [41]:
# Time the NumPy order-book imbalance on growing prefixes of the book.
numpy_times = []

for n_rows in tqdm(ob_sizes_array):
    tic = time.perf_counter()
    numpy_imbalance = np.array(numpy_imb(book_numpy[:n_rows]))
    elapsed = time.perf_counter() - tic
    numpy_times.append(elapsed)

100%|██████████| 10/10 [03:44<00:00, 22.48s/it]


In [17]:
# Time the CuPy order-book imbalance on growing prefixes of the book.
# The input is the host-side NumPy array; any transfer to the device happens
# inside cupy_imb and is therefore part of the measured time.
cupy_times = []

for n_rows in tqdm(ob_sizes_array):
    tic = time.perf_counter()
    cupy_imbalance = np.array(cupy_imb(book_numpy[:n_rows]))
    elapsed = time.perf_counter() - tic
    cupy_times.append(elapsed)

  0%|          | 0/177479 [00:00<?, ?it/s]100%|██████████| 177479/177479 [08:50<00:00, 334.36it/s]


### Past returns

In [106]:
# Cumulative row counts that cut the trades into ten nearly equal prefixes;
# the benchmarks below time each implementation on trades_numpy[:size].
# The remainder rows are handed out one each to the first `extra_trades`
# chunks. A `<=` comparison here would give one chunk too many an extra row
# and push the final cumulative size to one more than the number of rows.
base_trades, extra_trades = divmod(trades_numpy.shape[0], 10)
trades_sizes_array = np.full(10, base_trades)
trades_sizes_array[:extra_trades] += 1
trades_sizes_array = np.cumsum(trades_sizes_array)

In [107]:
# Window passed as the second argument to every *_calculate_past_returns
# benchmark below.
# NOTE(review): the unit (number of trades vs. time) is not visible in this
# notebook — confirm against the implementations.
past_returns_window = 50

In [174]:
# Time the Numba past-returns feature on growing prefixes of the trades.
# The float conversion of the slice happens inside the timed region.
# NOTE(review): if the function is JIT-compiled on its first call, the first
# sample also contains compilation time — confirm.
numba_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    numba_past_return = np.array(
        numba_calculate_past_returns(trades_numpy[:n_rows].astype(float), past_returns_window)
    )
    elapsed = time.perf_counter() - tic
    numba_times.append(elapsed)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 27.55it/s]


In [167]:
# Time the NumPy past-returns feature on growing prefixes of the trades.
# The float conversion of the slice happens inside the timed region.
numpy_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    numpy_past_return = np.array(
        numpy_calculate_past_returns(trades_numpy[:n_rows].astype(float), past_returns_window)
    )
    elapsed = time.perf_counter() - tic
    numpy_times.append(elapsed)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:14<00:00,  7.45s/it]


In [None]:
# Time the CuPy past-returns feature on growing prefixes of the trades.
# The float conversion of the slice happens inside the timed region.
cupy_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    cupy_past_return = np.array(
        cupy_calculate_past_returns(trades_numpy[:n_rows].astype(float), past_returns_window)
    )
    elapsed = time.perf_counter() - tic
    cupy_times.append(elapsed)

### Log returns

In [187]:
# Time the Numba log-returns on column 1 of growing prefixes of the trades.
# numba_log_ret keeps the result of the last (largest) prefix and is reused
# further down to build `log_ret`.
numba_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    numba_log_ret = np.array(numba_log_returns(trades_numpy[:n_rows, 1].astype(float)))
    elapsed = time.perf_counter() - tic
    numba_times.append(elapsed)

100%|██████████| 10/10 [00:00<00:00, 347.57it/s]


In [188]:
# Time the NumPy log-returns on column 1 of growing prefixes of the trades.
numpy_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    numpy_log_ret = np.array(numpy_log_returns(trades_numpy[:n_rows, 1].astype(float)))
    elapsed = time.perf_counter() - tic
    numpy_times.append(elapsed)

100%|██████████| 10/10 [00:00<00:00, 342.74it/s]


In [184]:
# Time the CuPy log-returns on column 1 of growing prefixes of the trades.
# NOTE(review): the saved output of this cell ends in a NotImplementedError
# about basic slicing inside CuPy fusion, raised on the first iteration, so
# no timing was recorded. The cause is presumably inside cupy_log_returns —
# confirm and fix there.
cupy_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    cupy_log_ret = np.array(cupy_log_returns(trades_numpy[:n_rows, 1].astype(float)))
    elapsed = time.perf_counter() - tic
    cupy_times.append(elapsed)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]


NotImplementedError: Only full range ``x[::]`` or reverse ``x[::-1]`` is supported for basic slicing in CuPy fusion.

### Autocorrelation

In [211]:
# Parameters shared by the autocorrelation and realized-kernel benchmarks.
# NOTE(review): the window is annotated as milliseconds; confirm that this
# matches the unit of the timestamps the implementations compare it against.
lags = np.array([1])
autocorrelation_time_window = 100_000  # ms

In [212]:
# Two-column array: the log returns left over from the last iteration of the
# Numba log-returns benchmark, next to column 0 of the trades with its first
# row dropped (presumably the timestamp of each return — confirm).
log_ret = np.column_stack((numba_log_ret, trades_numpy[1:, 0]))

In [214]:
# Time the Numba autocorrelation on growing prefixes of the log-return series.
# NOTE(review): if the function is JIT-compiled on its first call, the first
# sample also contains compilation time — confirm.
numba_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    autocorrelations_per_lag = np.array(
        numba_data_autocorrelation(log_ret[:n_rows], lags, autocorrelation_time_window)
    )
    elapsed = time.perf_counter() - tic
    numba_times.append(elapsed)

100%|██████████| 10/10 [00:00<00:00, 35.02it/s]


In [None]:
# Time the NumPy autocorrelation on growing prefixes of the log-return series.
numpy_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    autocorrelations_per_lag = np.array(
        numpy_data_autocorrelation(log_ret[:n_rows], lags, autocorrelation_time_window)
    )
    elapsed = time.perf_counter() - tic
    numpy_times.append(elapsed)

In [None]:
# Time the CuPy autocorrelation on growing prefixes of the log-return series.
# The call must go to cupy_data_autocorrelation: timing the NumPy function
# here would fill cupy_times with a second set of NumPy measurements.
# NOTE(review): assumes the CuPy variant takes the same (data, lags, window)
# arguments as the NumPy and Numba variants — confirm in cupy_functions.
cupy_times = []

for size in tqdm(trades_sizes_array):
    start = time.perf_counter()
    autocorrelations_per_lag = np.array(cupy_data_autocorrelation(log_ret[:size], lags, autocorrelation_time_window))
    end = time.perf_counter()
    cupy_times.append(end - start)

### Realized kernel

In [218]:
# Second positional argument of every *_data_realized_kernel call below;
# presumably the kernel bandwidth (number of lags) — confirm against the
# implementations.
H = 3

In [221]:
# Time the Numba realized kernel on growing prefixes of the log-return series.
# NOTE(review): if the function is JIT-compiled on its first call, the first
# sample also contains compilation time — confirm.
numba_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    realized_kernel_per_H = np.array(
        numba_data_realized_kernel(log_ret[:n_rows], H, autocorrelation_time_window)
    )
    elapsed = time.perf_counter() - tic
    numba_times.append(elapsed)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:01<00:00,  8.92it/s]


In [222]:
# Time the NumPy realized kernel on growing prefixes of the log-return series.
numpy_times = []

for n_rows in tqdm(trades_sizes_array):
    tic = time.perf_counter()
    realized_kernel_per_H = np.array(
        numpy_data_realized_kernel(log_ret[:n_rows], H, autocorrelation_time_window)
    )
    elapsed = time.perf_counter() - tic
    numpy_times.append(elapsed)

100%|██████████| 10/10 [00:00<00:00, 13.67it/s]


In [None]:
# Time the CuPy realized kernel on growing prefixes of the log-return series.
# The call must go to cupy_data_realized_kernel: timing the NumPy function
# here would fill cupy_times with a second set of NumPy measurements.
# NOTE(review): assumes the CuPy variant takes the same (data, H, window)
# arguments as the NumPy and Numba variants — confirm in cupy_functions.
cupy_times = []

for size in tqdm(trades_sizes_array):
    start = time.perf_counter()
    realized_kernel_per_H = np.array(cupy_data_realized_kernel(log_ret[:size], H, autocorrelation_time_window))
    end = time.perf_counter()
    cupy_times.append(end - start)