In [None]:
# =============================================================================
# following is the model selection process for "Classic Time Series Models":
# =============================================================================
# Example: Uniqueness is stationary, has a long decaying autocorrelation(cutoff at lag >10) and partial autocorrelation(cutoff at lag 2).

# Start
#  |
#  |-- Is the series stationary? (ADF test, KPSS test, plot)
#  |     | ADF (Augmented Dickey-Fuller) test
#  |     | KPSS (Kwiatkowski–Phillips–Schmidt–Shin) test
#  |     |-- No --> Apply differencing (log/first/second diff)
#  |     |            |
#  |     |            |-- Recheck stationarity
#  |     |                   |
#  |     |                   |-- Still non-stationary? --> Consider advanced models (e.g., trend modeling, transformations)
#  |     |
#  |     |-- Yes
#  |
#  |-- Examine ACF and PACF plots (on stationary series)
#  |     |
#  |     |-- ACF cuts off at lag q, PACF tails off --> MA(q)
#  |     |
#  |     |-- PACF cuts off at lag p, ACF tails off --> AR(p)
#  |     |
#  |     |-- Both ACF and PACF tail off --> ARMA(p, q)
#  |     |
#  |     |-- ACF and PACF have pattern after differencing --> ARIMA(p, d, q)
#  |
#  |-- For Seasonal Patterns? (ACF spikes at seasonal lags like 12, 24)
#  |     |
#  |     |-- Yes --> Use SARIMA(p,d,q)(P,D,Q,s)
#  |     |          |
#  |     |          |-- Use seasonal ACF/PACF for seasonal P and Q
#  |
#  |-- Fit candidate models (e.g., ARIMA, MA, AR, SARIMA)
#  |     |
#  |     |-- Evaluate using AIC, BIC
#  |     |     |
#  |     |     |-- Select models with lowest AIC/BIC
#  |
#  |-- Perform residual diagnostics on selected models
#  |     |
#  |     |-- Are residuals uncorrelated? (Ljung-Box test)
#  |     |-- Are residuals homoscedastic and normal?
#  |     |
#  |     |-- No --> Model misspecification → Refine (adjust p/q, add seasonal terms)
#  |     |
#  |     |-- Yes
#  |
#  |-- (Optional) Compare forecast accuracy on validation data
#  |     |
#  |     |-- Use RMSE, MAE, MAPE
#  |
#  |-- Are multiple models close in performance?
#  |     |
#  |     |-- Yes --> Choose simpler (lower-order) model
#  |     |
#  |     |-- No --> Choose best-performing model
#  |
#  ✅ Final Model Selected
#

In [None]:
# =============================================================================
# Modern(ML) Models advantages over classic models:
#   1. Automatic feature extraction (optional)
#   2. representation learning (optional) (generalize to lead to other more advanced techniques that is unimaginable in the past)
#   3. More complex(e.g. Non-linear) relationships

# Consider family of random forest as baseline "Feature-Based Modern(ML) Time Series Models" here:

# =============================================================================

# | Model                    | Year  | Core Method                  | Key Features & Tricks                          | Training Style         | Functional Modules                                     | Strengths                        | Weaknesses                                       |

# | ------------------------ | ----- | ---------------------------- | ---------------------------------------------- | ---------------------- | ----------------------------------------------------------------------------------------- | ------------------------------------------------ |

# | **CART** (Decision Tree) | 1986  | Greedy splits                | Gini impurity / MSE; max depth; pruning        | Recursive partitioning | - Split criteria                                       | Simple, interpretable            | High variance, overfitting                       |

# |                          |       |                              |                                                |                        | - Tree structure                                       |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Pruning                                              |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Impurity computation                                 |                                  |                                                  |

# | **Bagging**              | 1996  | Bootstrapped trees           | Multiple trees on random samples               | Parallel training      | - Bootstrap sampler                                    | Reduces variance                 | Still sensitive to overfitting on noisy features |

# |                          |       |                              |                                                |                        | - Aggregation (voting/averaging)                       |                                  |                                                  |

# | **Random Forest**        | 2001  | Bagging + feature randomness | Random feature subset at each split            | Parallel training      | - Bootstrap sampler                                    | Robust, handles high dimensions  | Slow for large datasets                          |

# |                          |       |                              |                                                |                        | - Feature subspace sampler                             |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Tree ensemble                                        |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Majority voting                                      |                                  |                                                  |

# | **ExtraTrees**           | 2006  | Fully randomized trees       | Random feature **and** threshold selection     | Parallel training      | - Random threshold selector                            | Very fast, low variance          | Slightly higher bias                             |

# |                          |       |                              |                                                |                        | - Feature subspace                                     |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Ensemble aggregator                                  |                                  |                                                  |

# | **XGBoost**              | 2014  | Gradient Boosting            | Regularization, shrinkage, weighted splits     | Sequential boosting    | - Gradient calculator                                  | High accuracy, scalable          | Sensitive to hyperparams                         |

# |                          |       |                              |                                                |                        | - Loss function                                        |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Tree pruner                                          |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Column block optimization                            |                                  |                                                  |

# | **LightGBM**             | 2017  | Gradient Boosting            | Leaf-wise growth, histogram bins               | Sequential boosting    | - Histogram binning                                    | Fast, efficient memory use       | Overfits small data if not regularized           |

# |                          |       |                              |                                                |                        | - Leaf-wise tree builder                               |                                  |                                                  |

# |                          |       |                              |                                                |                        | - GPU training                                         |                                  |                                                  |

# | **CatBoost**             | 2017  | Ordered Boosting             | Categorical encoding (ordered target encoding) | Sequential boosting    | - Ordered target encoder                               | Best with categorical features   | Slower on numeric-only datasets                  |

# |                          |       |                              |                                                |                        | - Symmetric trees                                      |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Bayesian averaging                                   |                                  |                                                  |

# | **gcForest**             | 2017  | Layered Forests              | Deep cascade of forests, auto ensemble         | Layer-wise cascading   | - Multi-grain scanning                                 | Handles small data well          | Complex to tune and understand                   |

# |                          |       |                              |                                                |                        | - Cascaded forests                                     |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Auto model selection                                 |                                  |                                                  |

# | **Neural Forests**       | 2020s | Soft/diff. splits            | Differentiable nodes, hybrid with neural nets  | Backpropagation + SGD  | - Soft split function (sigmoid)                        | Can be trained end-to-end        | Less interpretable, newer technique              |

# |                          |       |                              |                                                |                        | - Neural layers                                        |                                  |                                                  |

# |                          |       |                              |                                                |                        | - Loss-based gradient optimization                     |                                  |                                                  |

# =============================================================================

In [None]:
# Long term models (macro/micro(fundamentals)-economics indicators):
#   - can have good overall accuracy
#   - features are relatively strong, high covariance to predictable target
# models: ridge, lasso, penalized regression models(elastic nets), classic ensembles like RandomForest(XGBoost) type

# Short term models (market micro-structures):
#   - signal is sparse, noise is constantly present
#   - prioritize Precision/Recall/F1 over overall accuracy
#   - clean signals(trading opportunities) only appear after certain feature combination with certain values
# models: most simply doesn't work

# there are 2 paths for ML4F:
#   - Machine Learning
#     - model learns combinations from thousands of weak features
#     - Model quality is key, most models can't do this well(this even include bootstrap ensembles with boosting like XGBoost, LightGBM, CatBoost)
#     - performance decay is slower, because this is a more general approach, doesn't require large number of high-quality alphas(which decay fast)
#     - once decay, simply refit and you will have a good model again
#     - work well with mined alphas
#   - Alpha Mining:
#     - model relies on high quality mined alphas, which will have good covariance with predicting target
#     - simpler models can work
#     - need very sophisticated alpha mining algorithm(maybe with RL), very hard(technically) to eliminate overfitting
#     - performance decay fast

# standard models like RandomForest/Boosting/NN/ResNet is unlikely to work well on sparse-signals/weak-features:
#     - these are all MLE-alike models(trees are discriminative, NN can be either discriminative or generative(VAE/GAN)), when you cannot fit a good model, you should probably lower expectation, only trade when opportunities are more certain
#     - it tries to optimize overall accuracy, which is practically impossible for low SNR environment
#     - these model make inherent assumptions like: target classes are continuous in feature space (thus the greedy split)
#     - its relatively simple features selection will include the weak(high noise) features to grow trees, which will not have good performance
#     - even worse, boosting/ResNet requires high quality fitted base models(which will not happen/converge) to yield real/meaningful loss values, to be fitted again later
#     - forcefully fitting under low SNR with weak features will result in terrible overfitting (both in based model, and more so in residue models)

# we introduce Confidence-Based (Ensemble) Models for market micro-structure models:
# BNN has the best theoretical foundation, and accurate confidence estimation, but is hard to scale.
# others are usually approximation of BNN under some assumptions, or using ensemble methods to lower variance (may or may not be Bayesian).
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Model / Approach       | Key Idea / Mechanism                       | Confidence Estimation        | Output Selection / Aggregation| Strengths                      | Weaknesses                    | Paper / Origin      |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Deep Ensembles         | Train multiple NNs independently           | Predictive variance across   | Mean prediction; variance     | Captures epistemic uncertainty;| Expensive to train and store  | Lakshminarayanan    |
# |                        | with different inits and data shuffles     | models with different inits  | indicates uncertainty         | strong calibration             | multiple large models         | et al., 2017        |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | MC-Dropout             | Use dropout at test time to sample         | Variance across dropout      | Mean and variance of          | Lightweight Bayesian inference;| Dropout tuning is critical;   | Gal & Ghahramani,   |
# |                        | outputs (Bayesian approximation)           | outputs from same network    | predictions                   | works on existing models       | not always reliable           | 2016                |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Bootstrap Ensembles    | Train NNs on bootstrapped datasets         | Output spread across         | Voting or averaging +         | Handles label noise well;      | Can underperform in data-     | Classic bagging     |
# |                        | to capture diverse noise patterns          | independently trained models | confidence threshold          | diverse feature exploration    | scarce regimes                | with NNs            |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | SelectiveNet           | Jointly learns to predict and abstain      | Trainable confidence         | Predict only when             | High precision when selective; | Needs abstention-aware        | Geifman & El-Yaniv, |
# |                        | using an auxiliary confidence head         | head output                  | confidence exceeds threshold  | end-to-end trainable           | loss; may skip hard cases     | 2019                |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Conf.-Aware KD         | Student learns from confident              | Entropy or margin of         | Filter teacher targets        | Robust to noisy labels;        | Requires strong teacher;      | Hinton-style KD     |
# |                        | teacher predictions in KD setup            | teacher predictions          | by confidence                 | inherits reliable signal       | depends on confident signal   | + confidence gating |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | DeepGAM                | Ensemble with soft feature-gating          | Softmax gate activation      | Weighted expert output        | Interpretable gating;          | Gates may confuse;            | Chang et al.,       |
# |                        | where each expert activates conditionally  | controls expert firing       | by gating network             | local specialization           | model may not abstain         | 2021 (ICML)         |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Bayesian NNs (BNNs)    | Treat weights as distributions             | Posterior variance from      | Mean prediction + credible    | Captures epistemic +           | Hard to scale; variational    | Blundell et al.,    |
# |                        | and output as expectation                  | sampling posterior weights   | interval for confidence       | aleatoric uncertainty          | methods often poor approx.    | 2015                |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Ensemble Temp. Scaling | Calibrate softmax with temperature         | Softmax confidence           | Apply confidence thresholds   | Improves probabilistic         | Doesn’t affect predictions;   | Guo et al.,         |
# |                        | post-training                              | after scaling                | post calibration              | calibration of models          | only recalibrates scores      | 2017 (NIPS)         |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | DUQ                    | Distance to class prototypes               | Distance from embedding      | Predict only if within        | Simple and fast;               | Limited to clear class        | van Amersfoort      |
# |                        | gives predictive confidence                | to known class centroids     | decision boundary             | no sampling required           | boundaries; low flexibility   | et al., 2020        |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Focal Loss Ensembles   | Use focal loss to prioritize               | Internal confidence          | Aggregate only when           | Improves recall on rare        | Sensitive to tuning of        | Lin et al.,         |
# |                        | hard/rare samples in ensemble              | based on loss weighting      | model is confident            | events; reduces overfitting    | focal loss params             | 2017 (RetinaNet)    |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Trust Score Ensemble   | Post-hoc distance ratio                    | Ratio of distances to        | Select models where           | Interpretable; simple          | Needs external scoring        |                     |
# |                        | for estimating prediction trust            | nearest labeled samples      | trust score exceeds threshold | and model-agnostic             | set; limited to clean regions |                     |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+

In [20]:
# Visualize our dataset

# | Method                | Class Separability  | Ground Truth Alignment | Generalization        |
# | --------------------- | ------------------- | ---------------------- | --------------------- |
# | PCA                   | Low (linear)        | Weak                   | Good                  |
# | t-SNE                 | High (local)        | Moderate               | Poor (non-parametric) |
# | UMAP (unsupervised)   | High (local/global) | Moderate               | Good                  |
# | **UMAP (supervised)** | **Very High**       | **Strong**             | Good (but biased)     |

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

import os
import umap
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go


datas = [
    'bar_and_label(calmar)',
    # 'bar_and_label(trend_1to8)',
    # 'bar_and_label(trend_3to24)',
    'features_and_label(candlestick_calmar)',
    'features_and_label(candlestick_trend_3to24)',
    'features_and_label(tsfresh_calmar)',
    'features_and_label(tsfresh_trend_3to24)',
]

def viz(data: str):
    print(f"Visualizing {data} dataset...")
    raw_data = pd.read_parquet(os.path.join(os.getcwd(), f'{data}.parquet'))
    df = raw_data[-(12 * 24 * 7 * 4):].copy()  # Last 1 day

    # Classify labels (optional, here just use existing label)
    y = df['label']
    X = df.drop(columns=['label'])

    # Scale and split
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.01, shuffle=False
    )

    # # 3D t-SNE
    # tsne = TSNE(n_components=3, perplexity=30, learning_rate=200, max_iter=500, random_state=42)
    # X_projection = tsne.fit_transform(X_train)
    
    # unsupervised UMAP
    reducer = umap.UMAP(n_components=3, n_neighbors=40, min_dist=0.1)
    X_projection = np.array(reducer.fit_transform(X_train))

    # Plotly 3D scatter plot
    fig = go.Figure(data=go.Scatter3d(
        x=X_projection[:, 0],
        y=X_projection[:, 1],
        z=X_projection[:, 2],
        mode='markers',
        marker=dict(
            size=3,
            color=y_train,  # Color by label
            colorscale='Viridis',
            opacity=0.8,
            colorbar=dict(title="Label")
        ),
        text=[f"Label: {label}" for label in y_train]
    ))

    fig.update_layout(
        title=f"3D t-SNE/UMAP Visualization of {data}",
        scene=dict(
            xaxis_title='Component 1',
            yaxis_title='Component 2',
            zaxis_title='Component 3'
        ),
        width=400,   # Set plot width (in pixels)
        height=300,  # Set plot height (in pixels)
        margin=dict(l=0, r=0, b=0, t=40)
    )

    fig.show()

for data in datas:
    viz(data)


Visualizing bar_and_label(calmar) dataset...


Visualizing features_and_label(candlestick_calmar) dataset...


Visualizing features_and_label(candlestick_trend_3to24) dataset...


Visualizing features_and_label(tsfresh_calmar) dataset...


Visualizing features_and_label(tsfresh_trend_3to24) dataset...


#### ❌ None of the previous datasets are "easy" to model 

In [65]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.utils import resample, shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, accuracy_score
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# raw_data = pd.read_parquet(os.path.join(os.getcwd(), 'bar_and_label.parquet'))
# raw_data = pd.read_parquet(os.path.join(os.getcwd(), 'bar_and_label(trend_1to8).parquet'))
# raw_data = pd.read_parquet(os.path.join(os.getcwd(), 'bar_and_label(trend_3to24).parquet'))
raw_data = pd.read_parquet(os.path.join(os.getcwd(), 'features_and_label(candlestick_calmar).parquet'))

df = raw_data[-(60*24*7):].copy()
# df = raw_data.copy()

# H_PER_D = 23                                # trade hours per day
# P_PER_B = 5                                 # equivalent bar period
# EMA_VOL_SPAN = int(60/P_PER_B * H_PER_D)    # span for EMA volatility (daily)
# CUSUM_FACTOR = 0.6                          # multiplier for CUSUM threshold
#
# # --- VOLATILITY AND FILTER ---
# label = df[['label']].copy()
# label['ref'] = np.log((df['close'] / df['close'].iloc[0]).fillna(1))
# # EMA-based volatility on log returns
# label['return'] = df['close'].pct_change().fillna(0)
# # daily vol (volume/run bars has more constant volatility for their homoscedasticity)
# label['pos_return'] = label['return'].where(label['return'] > 0, 0.0001)
# label['neg_return'] = label['return'].where(label['return'] < 0, 0.0001)
# label['pos_vol'] = label['pos_return'].ewm(span=EMA_VOL_SPAN, adjust=False).std().replace(0, np.nan).ffill().bfill()
# label['neg_vol'] = label['neg_return'].ewm(span=EMA_VOL_SPAN, adjust=False).std().replace(0, np.nan).ffill().bfill()
#
# # CUSUM to mark breakout events and direction
# s_pos, s_neg = 0.0, 0.0
# label['event'] = 0.0 # np.nan
# for i in range(1, len(label)):
#     # note that for time i, the label/prediction is calculated after all info of that time is known
#     pos_threshold = label['pos_vol'].iloc[i] * CUSUM_FACTOR
#     neg_threshold = label['neg_vol'].iloc[i] * CUSUM_FACTOR
#     diff = label['return'].iloc[i]
#     s_pos = max(0, s_pos + diff)
#     s_neg = min(0, s_neg + diff)
#     index = label.index[i]
#     if s_pos > pos_threshold:
#         label.loc[index, 'event'] = 1
#         s_pos = 0.0
#     elif s_neg < -neg_threshold:
#         label.loc[index, 'event'] = -1
#         s_neg = 0.0
#
# index = label.index[label['event'] != 0]
#
# print(df.shape)
# df.drop(columns=['open', 'high', 'low', 'close', 'uniqueness'], axis=1, inplace=True)
# df = df.loc[index]
# print(df.shape)

# | σ    | Cumulative Probability | Approx. % within ±σ range          |
# |------|------------------------|------------------------------------|
# | 0.1  | 0.0797                 | ~7.97% within ±0.1σ                |
# | 0.2  | 0.1587                 | ~15.87% within ±0.2σ               |
# | 0.3  | 0.2266                 | ~22.66% within ±0.3σ               |
# | 0.4  | 0.3108                 | ~31.08% within ±0.4σ               |
# | 0.5  | 0.3829                 | ~38.29% within ±0.5σ               |
# | 1.0  | 0.6827                 | ~68.27% within ±1σ (1-sigma rule)  |
# | 1.5  | 0.8664                 | ~86.64% within ±1.5σ               |
# | 2.0  | 0.9545                 | ~95.45% within ±2σ (2-sigma rule)  |
# | 2.5  | 0.9876                 | ~98.76% within ±2.5σ               |

label = df['label']
sigma = label.std()*0.5  # assume normal distribution (actually scaled version)


def classify(x):
    if x < -sigma:
        return -1
    elif x > sigma:
        return 1
    else:
        return 0


y = df['label'].apply(classify)
X = df.drop(columns=['label'], axis=1)

print(y.value_counts())

# Split data into training, validation and test sets
X_training_test = X
y_training_test = y
X_train, X_test, y_train, y_test = train_test_split(X_training_test, y_training_test, test_size=0.2, shuffle=False)

n_estimator = 100
depth = 5
c_random_state = 42

# Random Forest Model
rf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator, oob_score=True,
                            criterion='entropy', random_state=c_random_state)
rf.fit(X_train, y_train.values.ravel())
print("Out-of-bag Training Accuracy (OOB Score): {:.6f}".format(rf.oob_score_))

print(f"\n{"\033[91m"}Hot Garbage!! (even worst than random guess on training set, ridiculous){"\033[0m"}\n")

label
 0    4593
 1    2764
-1    2723
Name: count, dtype: int64
Out-of-bag Training Accuracy (OOB Score): 0.451017

[91mHot Garbage!! (even worst than random guess on training set, ridiculous)[0m



In [66]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# K-fold
no_of_folds = 5
kfold = KFold(shuffle=True, random_state=1, n_splits=no_of_folds)

accuracy_array_train = np.zeros(no_of_folds)
accuracy_array_test = np.zeros(no_of_folds)

# High-confidence thresholds
hierarchy_thresholds = [0.5, 0.6, 0.7, 0.8]
hierarchy_stats_train = {t: [] for t in hierarchy_thresholds}
hierarchy_stats_test = {t: [] for t in hierarchy_thresholds}

i = 0
for train_index, test_index in kfold.split(X_training_test.values):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    rf.fit(X_train, np.array(y_train).ravel())

    # --- TRAIN ---
    P_y_train_pred = rf.predict_proba(X_train)[:, 1]
    y_train_pred = rf.predict(X_train)
    accuracy_array_train[i] = accuracy_score(y_train, y_train_pred)

    for thd in hierarchy_thresholds:
        mask = (P_y_train_pred > thd) | (P_y_train_pred < (1 - thd))
        if np.sum(mask) > 0:
            acc = accuracy_score(y_train[mask], y_train_pred[mask])
            ratio = np.mean(mask)
            hierarchy_stats_train[thd].append((acc, ratio))
        else:
            hierarchy_stats_train[thd].append((np.nan, 0))

    # --- TEST ---
    P_y_test_pred = rf.predict_proba(X_test)[:, 1]
    y_test_pred = rf.predict(X_test)
    accuracy_array_test[i] = accuracy_score(y_test, y_test_pred)

    for thd in hierarchy_thresholds:
        mask = (P_y_test_pred > thd) | (P_y_test_pred < (1 - thd))
        if np.sum(mask) > 0:
            acc = accuracy_score(y_test[mask], y_test_pred[mask])
            ratio = np.mean(mask)
            hierarchy_stats_test[thd].append((acc, ratio))
        else:
            hierarchy_stats_test[thd].append((np.nan, 0))

    i += 1

# --- Summary ---
print(f"Mean KFold train accuracy: {np.mean(accuracy_array_train):.3f}")
print(f"Mean KFold test accuracy: {np.mean(accuracy_array_test):.3f}")

print("\n--- High Confidence Hierarchy Stats ---")
for thd in hierarchy_thresholds:
    accs_train = [x[0] for x in hierarchy_stats_train[thd] if not np.isnan(x[0])]
    ratios_train = [x[1] for x in hierarchy_stats_train[thd] if x[1] > 0]
    accs_test = [x[0] for x in hierarchy_stats_test[thd] if not np.isnan(x[0])]
    ratios_test = [x[1] for x in hierarchy_stats_test[thd] if x[1] > 0]

    print(f"\n> Threshold: >{thd*100:.1f}% or <{(1-thd)*100:.1f}% confidence")

    if accs_train:
        print(f"  Train - Accuracy: {np.mean(accs_train):.3f}, Coverage: {np.mean(ratios_train)*100:.2f}%")
    else:
        print(f"  Train - No high-confidence samples")

    if accs_test:
        print(f"  Test  - Accuracy: {np.mean(accs_test):.3f}, Coverage: {np.mean(ratios_test)*100:.2f}%")
    else:
        print(f"  Test  - No high-confidence samples")

print(f"\n\033[91mK-fold acc ~= OOB acc on both train and test, consistent, but still Garbage!!\033[0m\n")

Mean KFold train accuracy: 0.459
Mean KFold test accuracy: 0.455

--- High Confidence Hierarchy Stats ---

> Threshold: >50.0% or <50.0% confidence
  Train - Accuracy: 0.459, Coverage: 100.00%
  Test  - Accuracy: 0.455, Coverage: 100.00%

> Threshold: >60.0% or <40.0% confidence
  Train - Accuracy: 0.439, Coverage: 0.93%
  Test  - Accuracy: 0.416, Coverage: 0.57%

> Threshold: >70.0% or <30.0% confidence
  Train - Accuracy: 0.972, Coverage: 0.06%
  Test  - No high-confidence samples

> Threshold: >80.0% or <20.0% confidence
  Train - No high-confidence samples
  Test  - No high-confidence samples

[91mK-fold acc ~= OOB acc on both train and test, consistent, but still Garbage!![0m



In [None]:
# RF can't even fit the training data 
# because of bagging and inherent assumption of continuity/linear-separability of labels in feature space

In [45]:
# what if we force over-fit using NN?
# =============================================================================
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np


class DeepMLPWithNorm(nn.Module):
    def __init__(self, X_train_np, y_train_np, task="classification", hidden_dim=256, num_layers=5):
        super().__init__()
        assert task in {"classification", "regression"}, "Invalid task"
        self.task = task
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        input_dim = X_train_np.shape[1]

        # Infer output dimension
        if task == "classification":
            y_flat = y_train_np.ravel()  # works whether y is shape (N,) or (N,1)
            self.output_dim = int(np.max(y_flat)) + 1
        else:
            y_reshaped = y_train_np if y_train_np.ndim == 2 else y_train_np[:, np.newaxis]
            self.output_dim = y_reshaped.shape[1]

        # Standardization parameters
        mean = X_train_np.mean(axis=0)
        std = X_train_np.std(axis=0) + 1e-6
        self.register_buffer("mean", torch.tensor(mean, dtype=torch.float32))
        self.register_buffer("std", torch.tensor(std, dtype=torch.float32))

        # Build deep MLP with BatchNorm and ReLU
        layers = [nn.Linear(input_dim, hidden_dim)]
        for _ in range(num_layers - 1):
            layers += [nn.BatchNorm1d(hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim)]
        layers += [nn.BatchNorm1d(hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, self.output_dim)]
        self.net = nn.Sequential(*layers)
        # if batch-size is not defined, the whole dataset will be a single batch

        self.to(self.device)

    def forward(self, x):  # process input in both fit and evaluate
        x = (x - self.mean) / self.std
        return self.net(x)

    # for GPU accelerations to happen on fit() and evaluate():
    #   1. X,y tensors are on GPU
    #   2. model is on GPU

    def fit(self, X_np, y_np, epochs=50, lr=1e-3, verbose=True):
        X_tensor = torch.tensor(X_np, dtype=torch.float32).to(self.device)

        if self.task == "classification":
            y_tensor = torch.tensor(y_np, dtype=torch.long).to(self.device)
            criterion = nn.CrossEntropyLoss()
        else:
            if y_np.ndim == 1:
                y_np = y_np[:, np.newaxis]
            y_tensor = torch.tensor(y_np, dtype=torch.float32).to(self.device)
            criterion = nn.MSELoss()

        optimizer = optim.Adam(self.parameters(), lr=lr)

        for epoch in range(epochs):
            self.train()
            optimizer.zero_grad()
            output = self(X_tensor)
            loss = criterion(output, y_tensor)
            loss.backward()
            optimizer.step()

            if verbose and epoch % 5 == 0:
                print(f"[Epoch {epoch}] Loss: {loss.item():.4f}")

    def evaluate(self, X_np, y_np):
        self.eval()
        X_tensor = torch.tensor(X_np, dtype=torch.float32).to(self.device)

        with torch.no_grad():
            output = self(X_tensor).cpu().numpy()

        if self.task == "classification":
            preds = np.argmax(output, axis=1)
            return accuracy_score(y_np, preds)
        else:
            if y_np.ndim == 1:
                y_np = y_np[:, np.newaxis]
            return mean_squared_error(y_np, output)


raw_data = pd.read_parquet(os.path.join(os.getcwd(), 'features_and_label(candlestick_calmar).parquet'))
df = raw_data[-(12*24*24*1):].copy()
# | σ    | Cumulative Probability | Approx. % within ±σ range          |
# |------|------------------------|------------------------------------|
# | 0.1  | 0.0797                 | ~7.97% within ±0.1σ                |
# | 0.2  | 0.1587                 | ~15.87% within ±0.2σ               |
# | 0.5  | 0.3829                 | ~38.29% within ±0.5σ               |
# | 1.0  | 0.6827                 | ~68.27% within ±1σ (1-sigma rule)  |
# | 2.0  | 0.9545                 | ~95.45% within ±2σ (2-sigma rule)  |
label = df['label']
sigma = label.std()*0.0  # assume normal distribution (actually scaled version)


def classify(x):
    if x < -sigma:
        return 0
    elif x > sigma:
        return 2
    else:
        return 1


y = df['label'].apply(classify)
X = df.drop(columns=['label'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

X_train_np = X_train.to_numpy().astype("float32")
X_test_np = X_test.to_numpy().astype("float32")
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()

# multi-classification
model_cls = DeepMLPWithNorm(X_train_np, y_train_np, task="classification")
model_cls.fit(X_train_np, y_train_np)
print("Train Acc:", model_cls.evaluate(X_train_np, y_train_np))
print("Test  Acc:", model_cls.evaluate(X_test_np, y_test_np))

# # multi-regression
# model_reg = DeepMLPWithNorm(X_train_np, y_train_np, task="regression")
# model_reg.fit(X_train_np, y_train_np)
# print("Train MSE:", model_reg.evaluate(X_train_np, y_train_np))
# print("Test  MSE:", model_reg.evaluate(X_test_np, y_test_np))

print(f"\n\033[91mAgain, Over-fit Garbage!!\033[0m\n")

[Epoch 0] Loss: 1.0387
[Epoch 5] Loss: 0.9692
[Epoch 10] Loss: 0.9398
[Epoch 15] Loss: 0.9018
[Epoch 20] Loss: 0.8513
[Epoch 25] Loss: 0.7964
[Epoch 30] Loss: 0.7498
[Epoch 35] Loss: 0.6893
[Epoch 40] Loss: 0.6191
[Epoch 45] Loss: 0.5730
Train Acc: 0.7258093687827817
Test  Acc: 0.450469992769342

[91mAgain, Over-fit Garbage!![0m



In [1]:
# your model need to be able to fit(or at least identify high certainty sample on) these simple features
# before that, all effort into engineering more features is pointless
# expect difficulty: under extremely high noise at minute level with calmar-ish labels

# NOTE: models(MLE-style) like XGBoost, Neural Networks with standard loss functions are destined to fail

### Look in my eyes!
### What are we missing?