In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from src.preprocessing import *

# Feature Engineering Notebook

The goal of this notebook is to create new features that can help the future algorithm perform better. The features created here are not exhaustive: additional features can be designed, and all of them should undergo feature selection to determine which are relevant.

In [4]:

# BTCUSDT bid/ask data for 24/10/2024
csv_path = './data/bid_ask_data_BTCUSDT_20241024.csv'
data = pd.read_csv(csv_path)
print(data.shape)


(835124, 42)


In [5]:

# k is the prediction horizon; theta is the matching threshold, read from the
# table built in the mid_price encoding notebook.
k, theta = 100, 6.15e-4
data['label'] = get_midprice_variation_column(data, k, theta)



In [19]:
# Count how many rows carry each distinct label value.
# NOTE(review): the recorded counts sum to 50,000 while the loaded frame has
# 835,124 rows — re-run from a fresh kernel to confirm this output is current.
data['label'].value_counts()

label
0    18849
1    15750
2    15401
Name: count, dtype: int64

In [7]:
# Number of columns currently held by `data`.
len(data.columns)

44

The dataset currently has 44 columns. The goal of this part is to create new features that will help the deep learning architecture learn faster and with less data.

## Order Book Aggregation Features

#### Relative Spread
The relative spread measures the difference between the best ask price and best bid price, normalized by the mid-price. This reflects market liquidity and transaction costs.

$$
\text{Relative Spread} = \frac{\text{Ask Price}_1 - \text{Bid Price}_1}{\text{Mid Price}}
$$



#### Depth (Bid and Ask)
Depth represents the cumulative volume of bids and asks up to a specific level in the order book. It quantifies market supply and demand strength.

$$
\text{Cumulative Bid Volume}_k = \sum_{i=1}^{k} \text{Bid Volume}_i, \quad \text{Cumulative Ask Volume}_k = \sum_{i=1}^{k} \text{Ask Volume}_i
$$



#### Volume Imbalance
Volume imbalance compares the total bid and ask volumes at different levels, normalized by the total volume. It highlights imbalances between supply and demand.

$$
\text{Volume Imbalance}_k = \frac{\text{Cumulative Bid Volume}_k - \text{Cumulative Ask Volume}_k}{\text{Cumulative Bid Volume}_k + \text{Cumulative Ask Volume}_k}
$$



#### Weighted Price
The price weighted by volume provides a market-activity weighted average, combining price and volume information for better trend analysis.

$$
\text{Weighted Price}_k = \frac{\sum_{i=1}^{k} (\text{Bid Price}_i \cdot \text{Bid Volume}_i + \text{Ask Price}_i \cdot \text{Ask Volume}_i)}{\sum_{i=1}^{k} (\text{Bid Volume}_i + \text{Ask Volume}_i)}
$$

## Temporal Features

#### Liquidity (Rolling Standard Deviation)
Liquidity is measured here as the variability of the mid-price over time, computed as a rolling sample standard deviation over a window of $T$ observations (the pandas default, `ddof=1`). High variability often signals changes in market behavior.

$$
\text{Liquidity}_{T}(t) = \sqrt{\frac{1}{T-1} \sum_{s=t-T+1}^{t} \left(\text{Mid Price}(s) - \mu\right)^2}
$$

where  
$
\mu = \frac{1}{T} \sum_{s=t-T+1}^{t} \text{Mid Price}(s)
$




#### Mid-Price Rolling Average
A rolling average of the mid-price smooths out short-term fluctuations, providing clearer long-term trends.

$$
\text{Mid Price}_{\text{rolling},\, T}(t) = \frac{1}{T} \sum_{s=t-T+1}^{t} \text{Mid Price}(s)
$$



#### Timestamp (Hour and Minute)
Extracting the hour and minute from the timestamp captures periodic market trends, such as increased activity during specific hours.


In [10]:
def add_features(data, k_levels=(5, 10), rolling_windows=(10, 20, 50, 100),
                 imbalance_levels=(1, 3, 5, 10), n_levels=10):
    """Add order-book aggregation and temporal features to ``data``.

    The new columns are written onto ``data`` itself (the frame is modified in
    place) and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Timestamp`` column and, for every level ``i`` in
        ``1..n_levels``, the columns ``Bid Price i``, ``Bid Volume i``,
        ``Ask Price i`` and ``Ask Volume i``.
    k_levels : iterable of int
        Depths for which ``Cumulative Bid Volume k``, ``Cumulative Ask Volume k``
        and ``Weighted Price k`` are computed.
    rolling_windows : iterable of int
        Window lengths ``T`` (in rows) for ``Rolling Std T`` and
        ``Rolling Mean T`` of the mid-price.
    imbalance_levels : iterable of int
        Depths for which ``Volume Imbalance k`` is computed.
    n_levels : int
        Number of order-book levels available in ``data``.

    Returns
    -------
    pandas.DataFrame
        ``data``, with the feature columns added.

    Raises
    ------
    ValueError
        If a requested depth is not in ``1..n_levels``; slicing would otherwise
        silently aggregate the wrong number of levels under a misleading name.
    """
    k_levels = list(k_levels)
    imbalance_levels = list(imbalance_levels)
    depth_levels = sorted(set(k_levels) | set(imbalance_levels))
    for level in depth_levels:
        if not 1 <= level <= n_levels:
            raise ValueError(
                f"Order-book depth {level} is outside the available range 1..{n_levels}"
            )

    # --- Order Book Aggregation Features ---

    data['Mid Price'] = (data['Ask Price 1'] + data['Bid Price 1']) / 2
    data['Relative Spread'] = (data['Ask Price 1'] - data['Bid Price 1']) / data['Mid Price']

    bid_volume_columns = [f'Bid Volume {i}' for i in range(1, n_levels + 1)]
    ask_volume_columns = [f'Ask Volume {i}' for i in range(1, n_levels + 1)]
    bid_price_columns = [f'Bid Price {i}' for i in range(1, n_levels + 1)]
    ask_price_columns = [f'Ask Price {i}' for i in range(1, n_levels + 1)]

    # Cumulative volumes are shared by several features: compute each depth once.
    cumulative_bid = {
        level: data[bid_volume_columns[:level]].sum(axis=1) for level in depth_levels
    }
    cumulative_ask = {
        level: data[ask_volume_columns[:level]].sum(axis=1) for level in depth_levels
    }

    for k in k_levels:
        data[f'Cumulative Bid Volume {k}'] = cumulative_bid[k]
        data[f'Cumulative Ask Volume {k}'] = cumulative_ask[k]
        # Multiply against raw arrays so the product is positional. DataFrame.mul
        # with another DataFrame aligns on column labels, and the price/volume
        # labels never match, which would turn every product into NaN (and the
        # row sum into 0).
        weighted_sum = (
            data[bid_price_columns[:k]].mul(data[bid_volume_columns[:k]].to_numpy()).sum(axis=1) +
            data[ask_price_columns[:k]].mul(data[ask_volume_columns[:k]].to_numpy()).sum(axis=1)
        )
        data[f'Weighted Price {k}'] = weighted_sum / (cumulative_bid[k] + cumulative_ask[k])

    for k in imbalance_levels:
        data[f'Volume Imbalance {k}'] = (
            (cumulative_bid[k] - cumulative_ask[k]) /
            (cumulative_bid[k] + cumulative_ask[k])
        )

    # --- Temporal Features ---

    mid_price = data['Mid Price']
    for T in rolling_windows:
        window = mid_price.rolling(window=T)
        # pandas' rolling std uses ddof=1 (sample standard deviation) by default.
        data[f'Rolling Std {T}'] = window.std()
        data[f'Rolling Mean {T}'] = window.mean()

    # Parse the timestamps once and reuse the result for both components.
    timestamps = pd.to_datetime(data['Timestamp'])
    data['Hour'] = timestamps.dt.hour
    data['Minute'] = timestamps.dt.minute

    return data


In [11]:
import time

# Measure how long the feature computation takes on the full frame.
# perf_counter is a monotonic, high-resolution clock meant for timing intervals.
start_time = time.perf_counter()
data = add_features(data)
end_time = time.perf_counter()
execution_time = end_time - start_time
# The message names the function that is actually called (add_features).
print(f"Temps d'exécution de la fonction add_features : {execution_time:.2f} secondes")


Temps d'exécution de la fonction create_features : 21.90 secondes


In [12]:
# Show rows 100 to 109 (by position) of the enriched frame.
data.iloc[100:110]

Unnamed: 0,Update ID,Timestamp,Bid Price 1,Bid Volume 1,Ask Price 1,Ask Volume 1,Bid Price 2,Bid Volume 2,Ask Price 2,Ask Volume 2,...,Rolling Std 10,Rolling Mean 10,Rolling Std 20,Rolling Mean 20,Rolling Std 50,Rolling Mean 50,Rolling Std 100,Rolling Mean 100,Hour,Minute
100,53330286900,2024-10-24 00:00:10.042726+00:00,66671.98,3.95032,66671.99,1.94007,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.523745,66670.377,1.377673,66669.516,0,0
101,53330286916,2024-10-24 00:00:10.142505+00:00,66671.98,3.3996,66671.99,2.87765,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.519713,66670.4436,1.396764,66669.5493,0,0
102,53330286930,2024-10-24 00:00:10.242527+00:00,66671.98,3.33936,66671.99,3.03585,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.51268,66670.5102,1.414806,66669.5826,0,0
103,53330286933,2024-10-24 00:00:10.342198+00:00,66671.98,3.33936,66671.99,3.03585,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.502605,66670.5768,1.431839,66669.6159,0,0
104,53330286946,2024-10-24 00:00:10.442394+00:00,66671.98,3.33786,66671.99,3.03585,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.489427,66670.6434,1.447899,66669.6492,0,0
105,53330286957,2024-10-24 00:00:10.542544+00:00,66671.98,3.33786,66671.99,3.01379,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.473062,66670.71,1.463016,66669.6825,0,0
106,53330286979,2024-10-24 00:00:10.642417+00:00,66671.98,3.15496,66671.99,2.78585,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.453402,66670.7766,1.477221,66669.7158,0,0
107,53330286989,2024-10-24 00:00:10.742255+00:00,66671.98,3.20036,66671.99,3.02873,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.430311,66670.8432,1.49054,66669.7491,0,0
108,53330287010,2024-10-24 00:00:10.842530+00:00,66671.98,3.19825,66671.99,2.30908,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.403621,66670.9098,1.502995,66669.7824,0,0
109,53330287024,2024-10-24 00:00:10.943039+00:00,66671.98,3.19825,66671.99,2.30391,66671.61,0.23191,66672.0,0.08885,...,0.0,66671.985,0.0,66671.985,1.373121,66670.9764,1.514609,66669.8157,0,0


***Note:*** the values chosen for `rolling_windows` depend on the prediction horizon $k$ being used. With a larger $k$, the window lengths $T$ should be increased accordingly.