# Analysis
Analyse the data to find features that can be used to predict the target variable.

In [2]:
import lakeapi
import datetime
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import cufflinks

from statsmodels.api import OLS

# Switch lakeapi to its sample data set using anonymous access
# (presumably so that no credentials are required — confirm against the lakeapi docs).
lakeapi.use_sample_data(anonymous_access=True)
# Put cufflinks in offline mode; presumably needed so that the `.iplot()` calls
# used later in the notebook render inline — confirm against the cufflinks docs.
cufflinks.go_offline()

*Parameters*

In [3]:
# Prediction horizons expressed in ticks (rows of the book table): 100 evenly
# spaced values from 1 to 10,000. They are used as shift/diff periods and in
# column names, so they are generated as integers ("target_102" rather than
# "target_102.0").
HORIZONS = np.linspace(1, 10000, 100, dtype=int)
# Fraction of the rows, taken in order from the start, assigned to the training set.
TRAIN_SIZE = 0.7

## Initialisation
Load the order-book data, add some useful columns (such as the mid price), and create the return targets at each horizon.

In [4]:
# Load the BTC-USDT order-book snapshots from Binance between 2022-10-01 and
# 2022-10-02.
books = lakeapi.load_data(
    table="book",
    start=datetime.datetime(2022, 10, 1),
    end=datetime.datetime(2022, 10, 2),
    symbols=["BTC-USDT"],
    exchanges=["BINANCE"],
)

# Average time between two consecutive rows, in seconds. Used to convert a
# horizon expressed in ticks into an approximate wall-clock duration.
tick_val = books["received_time"].diff().mean() / pd.Timedelta(seconds=1)

books["mid_price"] = (books["ask_0_price"] + books["bid_0_price"]) / 2

# Prediction targets: the FORWARD return of the mid price over each horizon,
#     target_h[t] = (mid[t + h] - mid[t]) / mid[t]
# A backward difference (mid[t] - mid[t - h]) would describe a move that has
# already happened at time t, so it cannot serve as a prediction target.
# All columns are built at once to avoid fragmenting the frame with 100
# successive column insertions. The last h rows of each target are NaN.
mid_price = books["mid_price"]
targets = pd.DataFrame(
    {
        f"target_{x}": (mid_price.shift(-int(round(x))) - mid_price) / mid_price
        for x in HORIZONS
    }
)
books = pd.concat([books, targets], axis=1)
del targets

# Chronological train/test split: the first TRAIN_SIZE fraction of the rows is
# "train", the remainder is "test". A positional mask always has exactly
# len(books) entries, unlike separate floor/ceil counts, which can add up to
# len(books) + 1 because of floating-point rounding in (1 - TRAIN_SIZE).
n_train = int(np.floor(TRAIN_SIZE * len(books)))
books["set"] = np.where(np.arange(len(books)) < n_train, "train", "test")

## Features

In [None]:
def stat(y_col: str, X_col: list[str]):
    """Fit an OLS of ``y_col`` on ``X_col`` and score it on the test rows.

    Reads the module-level ``books`` frame and ``HORIZONS``. Rows with a
    missing value in any feature, in the ``set`` label or in any of the
    ``target_*`` columns are dropped, so every target is evaluated on the
    same sample. The model is fitted on the ``"train"`` rows and evaluated
    on the ``"test"`` rows. No constant column is added to the regressors.

    Args:
        y_col: Name of the target column to explain.
        X_col: Names of the feature columns used as regressors.

    Returns:
        A ``(r2, success)`` tuple: the out-of-sample R2 (one minus the ratio
        of the residual to the total sum of squares on the test rows) and
        the share of test rows where the sign of the prediction equals the
        sign of the realised target.
    """
    every_target = [f"target_{x}" for x in HORIZONS]
    clean = books.dropna(subset=X_col + ["set"] + every_target, how="any")

    train_rows = clean[clean["set"] == "train"]
    test_rows = clean[clean["set"] == "test"]

    # Selecting with a list of columns always yields a 2-D (n_rows, n_features)
    # array, so a single feature needs no special reshaping.
    X_train, X_test = train_rows[X_col].values, test_rows[X_col].values
    y_train, y_test = train_rows[y_col].values, test_rows[y_col].values

    fitted = OLS(y_train, X_train).fit()
    y_hat = fitted.predict(X_test)

    # Share of test observations whose predicted and realised signs agree.
    hit_rate = np.mean(np.sign(y_test) == np.sign(y_hat))

    residual_ss = np.sum((y_test - y_hat) ** 2)
    total_ss = np.sum((y_test - np.mean(y_test)) ** 2)
    return 1 - residual_ss / total_ss, hit_rate

### Book Imbalance
The book imbalance is a feature that is often used in the literature to predict the price movement. The book imbalance is calculated as follows:

$$
\text{Book Imbalance} = \frac{\text{Total Bids} - \text{Total Asks}}{\text{Total Bids} + \text{Total Asks}}
$$

The value of the book imbalance is between -1 and 1.


In [None]:
# Total size on each side of the book, summed over every level column
# present in the frame.
asks_size = books.filter(regex="ask_[0-9]+_size").sum(axis=1)
bids_size = books.filter(regex="bid_[0-9]+_size").sum(axis=1)

# Book imbalance as (bids - asks) / (bids + asks): positive when the bid side
# is larger, negative when the ask side is larger.
books["imb"] = (bids_size - asks_size) / (bids_size + asks_size)

# Out-of-sample score of the imbalance feature for every horizon.
R2 = []
success_ratios = []
for target in [f"target_{x}" for x in HORIZONS]:
    r2, success = stat(target, ["imb"])
    R2.append(r2)
    success_ratios.append(success)

# Horizons converted from ticks to seconds using the average tick duration.
horizon_seconds = [tick_val * horizon for horizon in HORIZONS]

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=("R2", "Success Ratio"),
)
fig.add_trace(
    px.line(
        x=horizon_seconds,
        y=R2,
        labels={"x": "Horizon (s)", "y": "R2"},
    ).data[0],
    row=1,
    col=1,
)
# Reference line: an out-of-sample R2 of zero.
fig.add_hline(
    y=0,
    line_dash="dot",
    row=1,
    col=1,
)
# NOTE(review): the shortest horizon is left out of this panel — presumably
# because many very short-horizon returns are exactly zero, which distorts
# the sign-match ratio; confirm.
fig.add_trace(
    px.line(
        x=horizon_seconds[1:],
        y=success_ratios[1:],
        labels={"x": "Horizon (s)", "y": "Success Ratio"},
    ).data[0],
    row=2,
    col=1,
)
# Reference line: a success ratio of one half.
fig.add_hline(
    y=0.5,
    line_dash="dot",
    row=2,
    col=1,
)
# Only the trace is taken from each px.line figure, so its axis labels are
# not carried over; set them explicitly on the subplots.
fig.update_xaxes(title_text="Horizon (s)", row=1, col=1)
fig.update_yaxes(title_text="R2", row=1, col=1)
fig.update_xaxes(title_text="Horizon (s)", row=2, col=1)
fig.update_yaxes(title_text="Success Ratio", row=2, col=1)
fig.update_layout(showlegend=False, title_text="Book Imbalance", template="plotly_dark")


### Spread
The spread is the difference between the best ask price and the best bid price. The spread is a measure of the liquidity of the market. A low spread indicates that the market is liquid, while a high spread indicates that the market is illiquid.

$$
\text{Spread} = \text{Best Ask Price} - \text{Best Bid Price}
$$

In [None]:
# Spread: best ask price minus best bid price (level 0 of the book).
best_ask_price = books["ask_0_price"]
best_bid_price = books["bid_0_price"]
books["spread"] = best_ask_price - best_bid_price
books["spread"].iplot()

### Volatility

In [None]:
# NOTE(review): `volatility` is neither imported nor defined in any cell visible
# above this one, so this cell raises NameError on a fresh kernel unless the
# helper is provided elsewhere — confirm where it comes from and what it expects
# (it is called here with the raw mid-price array).
books["volatility"] = volatility(books["mid_price"].values)

### RSI (Relative Strength Index)
The Relative Strength Index (RSI) is a momentum oscillator that measures the speed and change of price movements. The RSI oscillates between 0 and 100. Traditionally, and according to Wilder, RSI is considered overbought when above 70 and oversold when below 30.

$$
\text{RSI} = 100 - \frac{100}{1 + \text{RS}}
$$
where RS (relative strength) is the ratio of the exponential moving average (EMA) of the upward price changes to the EMA of the absolute downward price changes over the last $n$ periods:
$$
\text{RS} = \frac{\text{EMA}(n, \text{Close Up})}{\text{EMA}(n, \text{Close Down})}
$$
