In [2]:
import plotly.io as pio
# Make "browser" the default Plotly renderer used by the figure.show() calls in this notebook.
# NOTE(review): presumably each figure then opens in a web-browser tab rather than inline — confirm.
pio.renderers.default = "browser"

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy import stats

In [3]:
# Read the full training set, then preview a single feature column (X853).
train_data = pd.read_parquet("../data/train.parquet")

prices = train_data.loc[:, "X853"]
print(prices)

timestamp
2023-03-01 00:00:00    1.914062
2023-03-01 00:01:00    1.914019
2023-03-01 00:02:00    1.913976
2023-03-01 00:03:00    1.913932
2023-03-01 00:04:00    1.913889
                         ...   
2024-02-29 23:55:00    1.726780
2024-02-29 23:56:00    1.726736
2024-02-29 23:57:00    1.726693
2024-02-29 23:58:00    1.726649
2024-02-29 23:59:00    1.726606
Name: X853, Length: 525887, dtype: float64


In [4]:
# Draw one line chart per feature column, X877 through X883 (range end is exclusive).
for i in range(877, 884):
    column = f"X{i}"
    prices = train_data[column]

    # Build the trace first, then hand it to the figure constructor.
    line_trace = go.Scatter(
        x=prices.index,
        y=prices.values,
        mode="lines",
        name="Price",
    )
    figure = go.Figure(data=[line_trace])

    # The column name doubles as the chart title.
    figure.update_layout(
        title=column,
        xaxis_title="Time",
        yaxis_title="Price",
    )

    figure.show()

In [13]:
# Weighted blend of two feature columns: 90% X883 plus 10% X877.
weight_x883, weight_x877 = 0.9, 0.1
prices = weight_x883 * train_data["X883"] + weight_x877 * train_data["X877"]

In [14]:
# Time series plot of price movement
price_trace = go.Scatter(
    x=prices.index,
    y=prices.values,
    mode="lines",
    name="Price",
)

figure = go.Figure(data=[price_trace])
figure.update_layout(
    title="Price movement",
    xaxis_title="Time",
    yaxis_title="Price",
)

figure.show()

In [37]:
# Frequency histogram of the values in `prices`
histogram_trace = go.Histogram(x=prices.values)

figure = go.Figure(data=[histogram_trace])
figure.update_layout(title="Price Frequency Histogram")

figure.show()

In [7]:
# Summary statistics of `prices`, grouped into three sections:
# moments, extremes, and quartiles. Sections are separated by a dashed rule.
summary_sections = [
    [
        ("mean", np.mean(prices)),
        ("std", np.std(prices)),
        ("skewness", stats.skew(prices)),
        ("kurtosis", stats.kurtosis(prices)),
    ],
    [
        ("max", np.max(prices)),
        ("min", np.min(prices)),
    ],
    [
        ("25th percentile", np.percentile(prices, 25)),
        ("50th percentile", np.median(prices)),
        ("75th percentile", np.percentile(prices, 75)),
    ],
]

for position, section in enumerate(summary_sections):
    # Print the rule between sections only, never before the first one.
    if position > 0:
        print("--------------------------------")
    for label, value in section:
        print(f"{label}: {value}")


mean: -0.13586094430964898
std: 0.7357398346979444
skewness: -0.006358737099418635
kurtosis: 1.3418617832963164
--------------------------------
max: 4.055632369316769
min: -5.4865978254640995
--------------------------------
25th percentile: -0.5763062761602153
50th percentile: -0.13346607758880757
75th percentile: 0.3012955901995651


In [5]:
# Write the last `frac_to_save` share of the training rows to a smaller parquet file.
frac_to_save = 0.4
data = pd.read_parquet("../data/train.parquet")

# int() truncates, so the subset never holds more rows than the requested share.
n_to_save = int(len(data) * frac_to_save)

# Use round() rather than int() for the percentage in the file name: a product such as
# 100 * 0.29 evaluates to 28.999999999999996, which int() would truncate to 28 and
# mislabel the output file. For the current 0.4 the label is still 40.
percent_label = round(100 * frac_to_save)

# NOTE(review): index=False leaves the frame's index out of the written file — confirm
# that consumers of the reduced file do not need it.
data.tail(n_to_save).to_parquet(f"../data/train_reduced_{percent_label}.parquet", index=False)

In [None]:
# Reload the training set and append four ratio features built from the
# quantity and volume columns.
data = pd.read_parquet("../data/train.parquet")

# Small constant added to every denominator to avoid dividing by exactly zero.
eps = 1e-10
book_qty = data["ask_qty"] + data["bid_qty"]
safe_volume = data["volume"] + eps

# Signed ask-minus-bid quantity difference, normalised by the combined quantity.
# NOTE(review): stored as "bid_ask_spread" although it is computed from quantities.
data["bid_ask_spread"] = (data["ask_qty"] - data["bid_qty"]) / (book_qty + eps)

# Buy and sell quantities, each as a share of the volume column.
data["buy_pressure"] = data["buy_qty"] / safe_volume
data["sell_pressure"] = data["sell_qty"] / safe_volume

# Combined bid and ask quantity relative to the volume column.
data["liquidity_ratio"] = book_qty / safe_volume