In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
from tqdm import tqdm

In [2]:
from cal_regre_engine import CalRegreEngine

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
window_len = 50

In [5]:
mlofi_df1, mid_price_diff1 = pd.DataFrame(), pd.DataFrame()
mlofi_df2, mid_price_diff2 = pd.DataFrame(), pd.DataFrame()
for day1 in [24, 25, 26, 27, 28]:
    mlofi1 = pd.read_parquet("data/mlofi/mlofi_01" + str(day1) + ".parquet")
    diff1 = pd.read_parquet("data/mid_price_diff/mid_price_diff_01" + str(day1) + ".parquet")
    mlofi_df1 = pd.concat([mlofi_df1, mlofi1], axis=0)
    mid_price_diff1 = pd.concat([mid_price_diff1, diff1], axis=0)
    
for day2 in [29, 30]:
    mlofi2 = pd.read_parquet("data/mlofi/mlofi_01" + str(day2) + ".parquet")
    diff2 = pd.read_parquet("data/mid_price_diff/mid_price_diff_01" + str(day2) + ".parquet")
    mlofi_df2 = pd.concat([mlofi_df2, mlofi2], axis=0)
    mid_price_diff2 = pd.concat([mid_price_diff2, diff2], axis=0)

In [6]:
mlofi_df1.index = np.arange(mlofi_df1.shape[0])
mlofi_df2.index = np.arange(mlofi_df2.shape[0])
mid_price_diff1.index = np.arange(mid_price_diff1.shape[0])
mid_price_diff2.index = np.arange(mid_price_diff2.shape[0])

In [7]:
mlofi_df1.shape, mid_price_diff1.shape

((85477, 5), (85477, 1))

In [8]:
mlofi_df2.shape, mid_price_diff2.shape

((34302, 5), (34302, 1))

## Data Description


In [9]:
mid_price_diff1.describe()

Unnamed: 0,mid_price_diff
count,85477.0
mean,0.035807
std,5.551645
min,-120.36
25%,0.0
50%,0.0
75%,0.0
max,159.62


In [10]:
(mid_price_diff1 < -10).sum() / mid_price_diff1.shape[0], (mid_price_diff1 > 10).sum() / mid_price_diff1.shape[0]

(mid_price_diff    0.033073
 dtype: float64,
 mid_price_diff    0.035132
 dtype: float64)

In [11]:
mid_price_diff2.describe()

Unnamed: 0,mid_price_diff
count,34302.0
mean,0.012902
std,6.014275
min,-50.42
25%,0.0
50%,0.0
75%,0.0
max,101.98


In [12]:
(mid_price_diff2 < -10).sum() / mid_price_diff2.shape[0], (mid_price_diff2 > 10).sum() / mid_price_diff2.shape[0]

(mid_price_diff    0.039531
 dtype: float64,
 mid_price_diff    0.041689
 dtype: float64)

## Ridge Classifier


In [13]:
first_bar, second_bar = 10, 5

In [14]:
best_alpha_r1, ridge_r1, metrics_r1 = CalRegreEngine.ridge_classifier_train(mid_price_diff=mid_price_diff1, 
                                                            mlofi=mlofi_df1, 
                                                            first_bar=first_bar, 
                                                            second_bar=second_bar, 
                                                            lag=100, balance=True)

In [15]:
pd.DataFrame(metrics_r1["report"])

Unnamed: 0,-1.0,0.0,1.0,accuracy,macro avg,weighted avg
precision,0.282051,0.365609,0.356077,0.359312,0.334579,0.334763
recall,0.038128,0.756477,0.283531,0.359312,0.359379,0.359312
f1-score,0.067176,0.492966,0.31569,0.359312,0.291944,0.292337
support,577.0,579.0,589.0,0.359312,1745.0,1745.0


In [16]:
metrics_r2, y_pred, y_test = CalRegreEngine.ridge_classifier_test(mid_price_diff=mid_price_diff2, 
                                                  mlofi=mlofi_df2, 
                                                  first_bar=first_bar, 
                                                  second_bar=second_bar, 
                                                  ridge=ridge_r1, 
                                                  lag=100)

In [17]:
pd.DataFrame(metrics_r2["report"])

Unnamed: 0,-1.0,0.0,1.0,accuracy,macro avg,weighted avg
precision,0.044791,0.927926,0.056746,0.721858,0.343154,0.857541
recall,0.034535,0.770671,0.283086,0.721858,0.362764,0.721858
f1-score,0.039,0.842019,0.09454,0.721858,0.325186,0.779865
support,1332.0,31457.0,1413.0,0.721858,34202.0,34202.0


In [18]:
test_pred_df = pd.DataFrame({"y_pred": y_pred.astype(int).tolist(), "y_test": y_test.tolist()})

### Labelling rules
e.g.    y_test=-1. y_pred=1 -> "-1_to_1"


In [28]:
test_pred_df["label"] = np.zeros((test_pred_df.shape[0], 1))

In [29]:
for i in tqdm(range(test_pred_df.shape[0])):
    if test_pred_df["y_pred"].iloc[i] == -1 and test_pred_df["y_test"].iloc[i] == -1:
        test_pred_df["label"].iloc[i] = "-1_to_-1"
    elif test_pred_df["y_pred"].iloc[i] == -1 and test_pred_df["y_test"].iloc[i] == 0:
        test_pred_df["label"].iloc[i] = "-1_to_0"
    elif test_pred_df["y_pred"].iloc[i] == -1 and test_pred_df["y_test"].iloc[i] == 1:
        test_pred_df["label"].iloc[i] = "-1_to_1"
    elif test_pred_df["y_pred"].iloc[i] == 0 and test_pred_df["y_test"].iloc[i] == -1:
        test_pred_df["label"].iloc[i] = "0_to_-1"
    elif test_pred_df["y_pred"].iloc[i] == 0 and test_pred_df["y_test"].iloc[i] == 0:
        test_pred_df["label"].iloc[i] = "0_to_0"
    elif test_pred_df["y_pred"].iloc[i] == 0 and test_pred_df["y_test"].iloc[i] == 1:
        test_pred_df["label"].iloc[i] = "0_to_1"
    elif test_pred_df["y_pred"].iloc[i] == 1 and test_pred_df["y_test"].iloc[i] == -1:
        test_pred_df["label"].iloc[i] = "1_to_-1"
    elif test_pred_df["y_pred"].iloc[i] == 1 and test_pred_df["y_test"].iloc[i] == 0:
        test_pred_df["label"].iloc[i] = "1_to_0"
    elif test_pred_df["y_pred"].iloc[i] == 1 and test_pred_df["y_test"].iloc[i] == 1:
        test_pred_df["label"].iloc[i] = "1_to_1"

100%|██████████| 34202/34202 [00:05<00:00, 6376.55it/s]


In [30]:
test_pred_df["label"].value_counts()

label
0_to_0      24243
1_to_0       6283
0_to_1        963
-1_to_0       931
0_to_-1       920
1_to_1        400
1_to_-1       366
-1_to_1        50
-1_to_-1       46
Name: count, dtype: int64