In [30]:
import polars as pl
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from io import StringIO
from linearmodels.panel import PanelOLS

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [21]:
df = pl.read_parquet(r'C:\Users\310\Desktop\Progects_Py\data\microstructure_price_prediction_data\cross_section\df_cross_section_V0.1_.parquet')

Note that here we are going to use our precomputed features and some raw inputs. We are going to drop "target" (use target_one_step_ahead instead).

In [None]:
df = df.select(pl.exclude("target"))


In [23]:
train_test_ratio = 0.5


df_train = pl.DataFrame()
df_test = pl.DataFrame()

for currency_pair in df["symbol"].unique():

    curr_df = df.filter(pl.col('symbol') == currency_pair)
    split_indx = int(len(curr_df) * train_test_ratio)
    
    train = curr_df[:split_indx]
    test = curr_df[split_indx:]

    print(f'Train len for {currency_pair} is {len(train)}')
    print(f'Test len for {currency_pair} is {len(test)}')

    df_train = df_train.vstack(train) if not df_train.is_empty() else train
    df_test = df_test.vstack(test) if not df_test.is_empty() else test



Train len for AVAX-USDT is 676828
Test len for AVAX-USDT is 676829
Train len for DOGE-USDT is 1624660
Test len for DOGE-USDT is 1624660


### Mixed effects (Fixed effects with both time entity effects and time effects).

In [24]:
# Data preparation 
df_train_pd = df_train.to_pandas()
df_test_pd = df_test.to_pandas()

df_train_pd = df_train_pd.set_index(['symbol', 'trade_time'])
df_test_pd = df_test_pd.set_index(['symbol', 'trade_time'])

df_train_pd = df_train_pd.drop(columns=["is_buyer_maker", "date"]).dropna()
df_test_pd = df_test_pd.drop(columns=["is_buyer_maker", "date"]).dropna()

In [None]:
# Feature selection 
target_column = "target_one_step_ahead"
assert target_column in df_train_pd.columns, f"{target_column} not found in train data columns"

scaler = StandardScaler()
X_train = scaler.fit_transform(df_train_pd.drop(target_column, axis=1))

# Feature selection using LASSO
lasso = LassoCV(cv=2).fit(X_train, df_train_pd[target_column])
selector = SelectFromModel(lasso, prefit=True)

# Select important features
selected_features = list(df_train_pd.drop(target_column, axis=1).columns[selector.get_support()])

print(f" features selected by Lasso: {selected_features}")

train_df = df_train_pd[selected_features + [target_column]]
test_df = df_test_pd[selected_features + [target_column]]

['last_ask', 'target_one_step_ahead']


In [33]:
# FE fit 
# Prepare training data
y_train = train_df[target_column]
X_train = train_df[selected_features].assign(const=1)

# Fit fixed-effects model
fe_model = PanelOLS(y_train, X_train, entity_effects=True, time_effects=True)
fe_results = fe_model.fit()
print(fe_results.summary)

# Prepare test data
X_test = test_df[selected_features].assign(const=1)
y_test = test_df[target_column]

# Predict on test data
predictions = fe_results.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, predictions)
mape = mean_absolute_percentage_error(y_test, predictions)

print(f"Evaluation Metrics:\nMSE: {mse}\nMAPE: {mape}")

                            PanelOLS Estimation Summary                            
Dep. Variable:     target_one_step_ahead   R-squared:                        1.0000
Estimator:                      PanelOLS   R-squared (Between):              1.0000
No. Observations:                2301472   R-squared (Within):               1.0000
Date:                   Sat, Nov 23 2024   R-squared (Overall):              1.0000
Time:                           17:13:40   Log-likelihood                 1.439e+07
Cov. Estimator:               Unadjusted                                           
                                           F-statistic:                   4.206e+08
Entities:                              2   P-value                           0.0000
Avg Obs:                       1.151e+06   Distribution:                 F(1,12539)
Min Obs:                       6.768e+05                                           
Max Obs:                       1.625e+06   F-statistic (robust):          4.

In [28]:
progress = tqdm(total=1, desc="Training Random Forest")

# Initialize Random Forest with hyperparameters
rf = RandomForestRegressor(
    n_estimators=100,  # Default, adjust if needed
    criterion='squared_error',
    max_depth=10,      # Maximum depth of random tree
    min_samples_split=50,  # Minimum number of samples required to split an internal node
    min_samples_leaf=20, #Minimum number of samples required to be at a leaf node
    min_weight_fraction_leaf=0.2, #Minimum weighted fraction of the sum of weights required at a leaf node
    #max_features='auto', #Number of features considered when looking for the best split
    max_leaf_nodes=100, #Maximum number of leaf nodes in a tree
    min_impurity_decrease=0.01,
    random_state=42,
    n_jobs=-1,         # Use all cores for faster training
    verbose=2
)

cv_scores = cross_val_score(rf, X.head(1000), y.head(1000), cv=5, scoring="neg_mean_squared_error")

rf.fit(X.head(1000), y.head(1000))
progress.update(1)
progress.close()

# Results
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV R²: {np.mean(cv_scores)}")

Training Random Forest:   0%|          | 0/1 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    0.1s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 1 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100

building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    0.0s


building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    0.0s


building tree 1 of 100building tree 2 of 100
building tree 3 of 100

building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
Training Random Forest: 100%|██████████| 1/1 [00:01<00:00,  1.67s/it]

building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b


