In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
from tqdm import tqdm

In [2]:
from cal_regre_engine import CalRegreEngine

In [3]:
import warnings
warnings.filterwarnings("ignore")

### 1. Generate the Variables


- window_len = 50


In [4]:
# Window length forwarded to both CalRegreEngine.cal_mid_price_diff and
# CalRegreEngine.cal_mlofi in the generation cell below.
# NOTE(review): the unit (rows vs. time) is defined inside CalRegreEngine — confirm.
window_len = 50

##### Calculate the mid-price difference and MLOFI


In [5]:
# Set to True to rebuild the per-day feature files from the raw depth data.
# Left False so that a normal run only reads the parquet files already on disk.
generator_option = False
if generator_option:
    # NOTE(review): day 24 (the first day loaded further down) is not in this
    # list — confirm whether it should be regenerated here as well.
    for i in [25, 26, 27, 28, 30]:
        depth_path = f"data/btc_usdt/depth_202401{i}.parquet"
        depth_data = pd.read_parquet(depth_path)

        # Persist each result; previously both frames were computed and then
        # dropped at the end of the iteration. The targets are the locations
        # the loading cells of this notebook read from.
        diff = CalRegreEngine.cal_mid_price_diff(depth_data, window_len=window_len, test_len=None)
        diff.to_parquet(f"data/mid_price_diff/mid_price_diff_01{i}.parquet")

        mlofi = CalRegreEngine.cal_mlofi(depth_data, M=5, window_len=window_len, test_len=None)
        mlofi.to_parquet(f"data/mlofi/mlofi_01{i}.parquet")

        print(str(i) + ' done')

In [7]:
# First day of the study; the following day is derived from `day` further down.
day = 24
mlofi_path = f"data/mlofi/mlofi_01{day}.parquet"
mid_price_diff_path = f"data/mid_price_diff/mid_price_diff_01{day}.parquet"

mlofi_df1 = pd.read_parquet(mlofi_path)                    # regressors
mid_price_diff1 = pd.read_parquet(mid_price_diff_path)    # regression target

In [8]:
# Compare the shapes of the first day's MLOFI features and mid-price-diff target.
mlofi_df1.shape, mid_price_diff1.shape

((17182, 5), (17182, 1))

In [11]:
# Preview the first rows of the first day's MLOFI features.
mlofi_df1.head()

Unnamed: 0,mlofi1,mlofi2,mlofi3,mlofi4,mlofi5
0,-1.99673,-0.31191,0.3568,-1.75933,0.42191
1,59.6549,3.82402,13.38811,4.45591,3.23606
2,13.26167,0.39202,0.73904,1.20042,0.81627
3,-0.90033,-0.08089,0.08742,-1.10026,-0.56421
4,-1.35211,-0.0624,-0.68546,-0.01189,1.01046


In [12]:
# Preview the first rows of the first day's mid-price differences.
mid_price_diff1.head()

Unnamed: 0,mid_price_diff
0,-3.33
1,9.96
2,1.24
3,-1.24
4,-4.65


In [9]:
# Same two files for the day after `day`.
next_day = day + 1
mlofi_df2 = pd.read_parquet(f"data/mlofi/mlofi_01{next_day}.parquet")
mid_price_diff2 = pd.read_parquet(f"data/mid_price_diff/mid_price_diff_01{next_day}.parquet")

In [10]:
# Compare the shapes of the second day's MLOFI features and mid-price-diff target.
mlofi_df2.shape, mid_price_diff2.shape

((17151, 5), (17151, 1))

In [13]:
# Preview the first rows of the second day's MLOFI features.
mlofi_df2.head()

Unnamed: 0,mlofi1,mlofi2,mlofi3,mlofi4,mlofi5
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.49595,0.88347,0.55579
2,-8.87899,-0.5004,-0.37653,-0.33007,-0.80494
3,-4.78709,-0.01023,-0.16396,0.98243,-0.88727
4,36.0146,1.87416,2.38905,2.55124,0.69431


In [14]:
# Preview the first rows of the second day's mid-price differences.
mid_price_diff2.head()

Unnamed: 0,mid_price_diff
0,0.0
1,0.0
2,-13.41
3,-14.63
4,-10.48


### 2. Test for multicollinearity

##### Method 1: Correlation Matrix


In [None]:
# Pairwise correlation coefficients between the MLOFI columns
# (rowvar=False: each column is a variable, each row an observation).
correlation_matrix = np.corrcoef(mlofi_df1, rowvar=False)

# Show the matrix with the feature names on both axes instead of bare 0..4
# positions; `correlation_matrix` itself stays a plain ndarray.
pd.DataFrame(correlation_matrix, index=mlofi_df1.columns, columns=mlofi_df1.columns)

##### Method 2: Eigenvalues


In [None]:
# A correlation matrix is symmetric, so use the symmetric solver: it always
# returns real eigenvalues, in ascending order. The general np.linalg.eig
# gives no ordering guarantee and can return a complex dtype.
eigenvalues, eigenvectors = np.linalg.eigh(correlation_matrix)
sorted_eigenvalues = eigenvalues[::-1]  # largest first

# Plot the eigenvalues as a line chart
plt.plot(range(1, len(sorted_eigenvalues) + 1), sorted_eigenvalues, marker='o', color='r', linestyle='-')
plt.xlabel('Number of Eigenvalues')
plt.ylabel('Eigenvalue Size')
plt.grid(False)
plt.show()

##### Method 3: VIF (Variance Inflation Factor)


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One variance inflation factor per MLOFI column, computed on the raw feature
# matrix. NOTE(review): no constant column is added before computing the VIFs —
# confirm that this matches the intercept-free regressions used later.
feature_matrix = mlofi_df1.values
vif = pd.DataFrame({
    "features": mlofi_df1.columns,
    "VIF Factor": [
        variance_inflation_factor(feature_matrix, col_idx)
        for col_idx in range(feature_matrix.shape[1])
    ],
})
vif

### 3. OLS regression


In [None]:
# Give both first-day frames a fresh 0..n-1 integer index (assigned in place).
for frame in (mid_price_diff1, mlofi_df1):
    frame.index = np.arange(frame.shape[0])

In [None]:
# Fit one OLS model per feature-set size: the first 1, 2, ..., 5 MLOFI columns.
ols_res = []
for M in tqdm(range(5)):
    level_features = mlofi_df1.iloc[:, :M + 1]
    fitted = CalRegreEngine.ols_regre(
        mid_price_diff1,
        level_features,
        lag=100,
        with_const=False,
    )
    ols_res.append(fitted)

In [None]:
# x-axis: number of MLOFI columns in each fit (1-based); y-axis: its adjusted R².
M_ls = list(range(1, len(ols_res) + 1))
rsquared_adj_ls = [fit.rsquared_adj for fit in ols_res]

In [None]:
# Adjusted R² of the OLS fits as more MLOFI columns are included.
# Axis labels and a title are set so the figure can be read on its own.
plt.plot(M_ls, rsquared_adj_ls, marker='o', color='r', linestyle='-')
plt.xlabel('M (number of MLOFI columns)')
plt.ylabel('Adjusted R-squared')
plt.title('OLS: adjusted R-squared vs. M')
plt.xticks(M_ls)
plt.show()

In [None]:
# Regression summary of the last OLS fit (index 4), i.e. the one that uses all
# five MLOFI columns.
ols_res[4].summary()

### 4. Ridge Regression


In [None]:
# Ridge regression for each feature-set size M = 1..5 on the first day's data;
# keep the selected alpha and the metrics of every run.
best_alpha_train, metrics_train, M_ = [], [], []
for M in tqdm(range(1, 6)):
    ridge_features = mlofi_df1.iloc[:, :M]
    best_alpha, metrics = CalRegreEngine.ridge_regre(
        mid_price_diff1,
        ridge_features,
        lag=100,
        alphas=np.linspace(0.01, 1000),  # candidate alphas between 0.01 and 1000
    )
    M_.append(M)
    metrics_train.append(metrics)
    best_alpha_train.append(best_alpha)

In [None]:
# Adjusted R² of the ridge fits against the number of MLOFI columns M.
# Axis labels and a title are set so the figure can be read on its own.
plt.plot(M_, [run_metrics["adj-r2"] for run_metrics in metrics_train], marker='o', color='y', linestyle='-')
plt.xlabel('M (number of MLOFI columns)')
plt.ylabel('Adjusted R-squared')
plt.title('Ridge: adjusted R-squared vs. M')
plt.xticks(M_)
plt.show()

In [None]:
# RMSE of the ridge fits against the number of MLOFI columns M.
# (The previous trailing comment said "adj-r2", but this cell plots "rmse".)
plt.plot(M_, [run_metrics["rmse"] for run_metrics in metrics_train], marker='o', color='y', linestyle='-')
plt.xlabel('M (number of MLOFI columns)')
plt.ylabel('RMSE')
plt.title('Ridge: RMSE vs. M')
plt.xticks(M_)
plt.show()

In [None]:
# Freeze the configuration of the third ridge run (list position 2).
# NOTE(review): the position is hard-coded — confirm it is the intended pick
# from the two plots above.
selected_run = 2
params = dict(M=M_[selected_run], alpha=best_alpha_train[selected_run])

##### Examine on test data


In [None]:
# Evaluate the selected configuration on the held-out day (day + 1).
# The previous version passed mid_price_diff1 / mlofi_df1 again, so the "test"
# metrics were computed on the same data used to pick M and alpha.
# reset_index(drop=True) gives the frames a 0..n-1 index, as was done for the
# first-day frames before their regressions, without mutating the originals.
# NOTE(review): alphas receives a single value here rather than a grid —
# confirm that CalRegreEngine.ridge_regre accepts a scalar.
alpha_test, metrics_test = CalRegreEngine.ridge_regre(mid_price_diff2.reset_index(drop=True),
                                        mlofi_df2.reset_index(drop=True).iloc[:, :params["M"]],
                                        lag=100,
                                        alphas=params["alpha"])

In [None]:
# Display the metrics returned by the ridge run in the previous cell.
metrics_test