In [None]:
# =============================================================================
# following is the model selection process for "Classic Pure Time Series Models":
# =============================================================================
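# Example: the `uniqueness` series is stationary, with a slowly decaying autocorrelation (it only cuts off beyond lag 10) and a partial autocorrelation that cuts off at lag 2; per the chart below (PACF cuts off, ACF tails off) this points to an AR(2) candidate.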

# Start
#  |
#  |-- Is the series stationary? (ADF test, KPSS test, plot)
#  |     | ADF (Augmented Dickey-Fuller) test
#  |     | KPSS (Kwiatkowski–Phillips–Schmidt–Shin) test
#  |     |-- No --> Apply differencing (log/first/second diff)
#  |     |            |
#  |     |            |-- Recheck stationarity
#  |     |                   |
#  |     |                   |-- Still non-stationary? --> Consider advanced models (e.g., trend modeling, transformations)
#  |     |
#  |     |-- Yes
#  |
#  |-- Examine ACF and PACF plots (on stationary series)
#  |     |
#  |     |-- ACF cuts off at lag q, PACF tails off --> MA(q)
#  |     |
#  |     |-- PACF cuts off at lag p, ACF tails off --> AR(p)
#  |     |
#  |     |-- Both ACF and PACF tail off --> ARMA(p, q)
#  |     |
#  |     |-- ACF and PACF have pattern after differencing --> ARIMA(p, d, q)
#  |
#  |-- For Seasonal Patterns? (ACF spikes at seasonal lags like 12, 24)
#  |     |
#  |     |-- Yes --> Use SARIMA(p,d,q)(P,D,Q,s)
#  |     |          |
#  |     |          |-- Use seasonal ACF/PACF for seasonal P and Q
#  |
#  |-- Fit candidate models (e.g., ARIMA, MA, AR, SARIMA)
#  |     |
#  |     |-- Evaluate using AIC, BIC
#  |     |     |
#  |     |     |-- Select models with lowest AIC/BIC
#  |
#  |-- Perform residual diagnostics on selected models
#  |     |
#  |     |-- Are residuals uncorrelated? (Ljung-Box test)
#  |     |-- Are residuals homoscedastic and normal?
#  |     |
#  |     |-- No --> Model misspecification → Refine (adjust p/q, add seasonal terms)
#  |     |
#  |     |-- Yes
#  |
#  |-- (Optional) Compare forecast accuracy on validation data
#  |     |
#  |     |-- Use RMSE, MAE, MAPE
#  |
#  |-- Are multiple models close in performance?
#  |     |
#  |     |-- Yes --> Choose simpler (lower-order) model
#  |     |
#  |     |-- No --> Choose best-performing model
#  |
#  ✅ Final Model Selected
# 

In [None]:
# =============================================================================
# We use the random-forest family (tree ensembles) as the baseline for "Feature-Based Time Series (ML) Models":
# =============================================================================
# | Model                    | Year  | Core Method                  | Key Features & Tricks                          | Training Style         | Functional Modules                                     | Strengths                        | Weaknesses                                       |
# | ------------------------ | ----- | ---------------------------- | ---------------------------------------------- | ---------------------- | ------------------------------------------------------ | -------------------------------- | ------------------------------------------------ |
# | **CART** (Decision Tree) | 1984  | Greedy splits                | Gini impurity / MSE; max depth; pruning        | Recursive partitioning | - Split criteria                                       | Simple, interpretable            | High variance, overfitting                       |
# |                          |       |                              |                                                |                        | - Tree structure                                       |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Pruning                                              |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Impurity computation                                 |                                  |                                                  |
# | **Bagging**              | 1996  | Bootstrapped trees           | Multiple trees on random samples               | Parallel training      | - Bootstrap sampler                                    | Reduces variance                 | Still sensitive to overfitting on noisy features |
# |                          |       |                              |                                                |                        | - Aggregation (voting/averaging)                       |                                  |                                                  |
# | **Random Forest**        | 2001  | Bagging + feature randomness | Random feature subset at each split            | Parallel training      | - Bootstrap sampler                                    | Robust, handles high dimensions  | Slow for large datasets                          |
# |                          |       |                              |                                                |                        | - Feature subspace sampler                             |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Tree ensemble                                        |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Majority voting                                      |                                  |                                                  |
# | **ExtraTrees**           | 2006  | Fully randomized trees       | Random feature **and** threshold selection     | Parallel training      | - Random threshold selector                            | Very fast, low variance          | Slightly higher bias                             |
# |                          |       |                              |                                                |                        | - Feature subspace                                     |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Ensemble aggregator                                  |                                  |                                                  |
# | **XGBoost**              | 2014  | Gradient Boosting            | Regularization, shrinkage, weighted splits     | Sequential boosting    | - Gradient calculator                                  | High accuracy, scalable          | Sensitive to hyperparams                         |
# |                          |       |                              |                                                |                        | - Loss function                                        |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Tree pruner                                          |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Column block optimization                            |                                  |                                                  |
# | **LightGBM**             | 2017  | Gradient Boosting            | Leaf-wise growth, histogram bins               | Sequential boosting    | - Histogram binning                                    | Fast, efficient memory use       | Overfits small data if not regularized           |
# |                          |       |                              |                                                |                        | - Leaf-wise tree builder                               |                                  |                                                  |
# |                          |       |                              |                                                |                        | - GPU training                                         |                                  |                                                  |
# | **CatBoost**             | 2017  | Ordered Boosting             | Categorical encoding (ordered target encoding) | Sequential boosting    | - Ordered target encoder                               | Best with categorical features   | Slower on numeric-only datasets                  |
# |                          |       |                              |                                                |                        | - Symmetric trees                                      |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Bayesian averaging                                   |                                  |                                                  |
# | **gcForest**             | 2017  | Layered Forests              | Deep cascade of forests, auto ensemble         | Layer-wise cascading   | - Multi-grain scanning                                 | Handles small data well          | Complex to tune and understand                   |
# |                          |       |                              |                                                |                        | - Cascaded forests                                     |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Auto model selection                                 |                                  |                                                  |
# | **Neural Forests**       | 2020s | Soft/diff. splits            | Differentiable nodes, hybrid with neural nets  | Backpropagation + SGD  | - Soft split function (sigmoid)                        | Can be trained end-to-end        | Less interpretable, newer technique              |
# |                          |       |                              |                                                |                        | - Neural layers                                        |                                  |                                                  |
# |                          |       |                              |                                                |                        | - Loss-based gradient optimization                     |                                  |                                                  |
# =============================================================================


In [None]:
# Long-term models (macro-/micro-economic (fundamental) indicators):
#   - can have good overall accuracy
#   - features are relatively strong, with high covariance with the prediction target
# models: ridge, lasso, other penalized regressions (elastic net), classic tree ensembles (RandomForest / XGBoost type)

# Short-term models (market microstructure):
#   - signal is sparse, noise is constantly present
#   - prioritize Precision/Recall/F1 over overall accuracy
#   - clean signals (trading opportunities) only appear when certain features take certain values in combination
# models: most standard models simply don't work

# there are 2 paths for ML4F:
#   - Machine Learning:
#     - the model learns combinations from thousands of weak features
#     - model quality is key; most models can't do this well (this even includes boosted ensembles like XGBoost, LightGBM, CatBoost)
#     - performance decays more slowly, because this is a more general approach that doesn't require a large number of high-quality alphas (which decay fast)
#     - once it decays, simply refit and you will have a good model again
#     - works well with mined alphas
#   - Alpha Mining:
#     - the model relies on high-quality mined alphas, which covary well with the prediction target
#     - simpler models can work
#     - needs a very sophisticated alpha-mining algorithm (maybe with RL); it is technically very hard to eliminate overfitting
#     - performance decays fast

# standard models like RandomForest/Boosting/NN/ResNet are unlikely to work well on sparse signals / weak features:
#     - these are all MLE-like models (trees are discriminative; NNs can be either discriminative or generative (VAE/GAN)); when you cannot fit a good model, you should probably lower expectations and only trade when opportunities are more certain
#     - they try to optimize overall accuracy, which is practically impossible in a low-SNR environment
#     - these models make inherent assumptions, e.g. that target classes are continuous in feature space (hence the greedy split)
#     - their relatively simple feature selection will include weak (high-noise) features when growing trees, which hurts performance
#     - even worse, boosting/ResNet require well-fitted base models (which will not happen/converge here) to yield real/meaningful loss values for the later stages to fit
#     - forcefully fitting under low SNR with weak features will result in terrible overfitting (both in the base model and, even more so, in the residual models)

# we introduce Confidence-Based Ensemble Models for market micro-structure models:
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Model / Approach       | Key Idea / Mechanism                       | Confidence Estimation        | Output Selection / Aggregation| Strengths                      | Weaknesses                    | Paper / Origin      |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Deep Ensembles         | Train multiple NNs independently           | Predictive variance across   | Mean prediction; variance     | Captures epistemic uncertainty;| Expensive to train and store  | Lakshminarayanan    |
# |                        | with different inits and data shuffles     | models with different inits  | indicates uncertainty         | strong calibration             | multiple large models         | et al., 2017        |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | MC-Dropout             | Use dropout at test time to sample         | Variance across dropout      | Mean and variance of          | Lightweight Bayesian inference;| Dropout tuning is critical;   | Gal & Ghahramani,   |
# |                        | outputs (Bayesian approximation)           | outputs from same network    | predictions                   | works on existing models       | not always reliable           | 2016                |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Bootstrap Ensembles    | Train NNs on bootstrapped datasets         | Output spread across         | Voting or averaging +         | Handles label noise well;      | Can underperform in data-     | Classic bagging     |
# |                        | to capture diverse noise patterns          | independently trained models | confidence threshold          | diverse feature exploration    | scarce regimes                | with NNs            |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | SelectiveNet           | Jointly learns to predict and abstain      | Trainable confidence         | Predict only when             | High precision when selective; | Needs abstention-aware        | Geifman & El-Yaniv, |
# |                        | using an auxiliary confidence head         | head output                  | confidence exceeds threshold  | end-to-end trainable           | loss; may skip hard cases     | 2019                |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Conf.-Aware KD         | Student learns from confident              | Entropy or margin of         | Filter teacher targets        | Robust to noisy labels;        | Requires strong teacher;      | Hinton-style KD     |
# |                        | teacher predictions in KD setup            | teacher predictions          | by confidence                 | inherits reliable signal       | depends on confident signal   | + confidence gating |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | DeepGAM                | Ensemble with soft feature-gating          | Softmax gate activation      | Weighted expert output        | Interpretable gating;          | Gates may confuse;            | Chang et al.,       |
# |                        | where each expert activates conditionally  | controls expert firing       | by gating network             | local specialization           | model may not abstain         | 2021 (ICML)         |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Bayesian NNs (BNNs)    | Treat weights as distributions             | Posterior variance from      | Mean prediction + credible    | Captures epistemic +           | Hard to scale; variational    | Blundell et al.,    |
# |                        | and output as expectation                  | sampling posterior weights   | interval for confidence       | aleatoric uncertainty          | methods often poor approx.    | 2015                |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Ensemble Temp. Scaling | Calibrate softmax with temperature         | Softmax confidence           | Apply confidence thresholds   | Improves probabilistic         | Doesn’t affect predictions;   | Guo et al.,         |
# |                        | post-training                              | after scaling                | post calibration              | calibration of models          | only recalibrates scores      | 2017 (ICML)         |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | DUQ                    | Distance to class prototypes               | Distance from embedding      | Predict only if within        | Simple and fast;               | Limited to clear class        | van Amersfoort      |
# |                        | gives predictive confidence                | to known class centroids     | decision boundary             | no sampling required           | boundaries; low flexibility   | et al., 2020        |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Focal Loss Ensembles   | Use focal loss to prioritize               | Internal confidence          | Aggregate only when           | Improves recall on rare        | Sensitive to tuning of        | Lin et al.,         |
# |                        | hard/rare samples in ensemble              | based on loss weighting      | model is confident            | events; reduces overfitting    | focal loss params             | 2017 (RetinaNet)    |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+
# | Trust Score Ensemble   | Post-hoc distance ratio                    | Ratio of distances to        | Select models where           | Interpretable; simple          | Needs external scoring        | Jiang et al.,       |
# |                        | for estimating prediction trust            | nearest labeled samples      | trust score exceeds threshold | and model-agnostic             | set; limited to clean regions | 2018 (NeurIPS)      |
# +------------------------+--------------------------------------------+------------------------------+-------------------------------+--------------------------------+-------------------------------+---------------------+



<div style="font-size:10px">

| **Aspect** | **MLE (Maximum Likelihood Estimation) 最大似然估计** | **MAP (Maximum A Posteriori Estimation) 最大后验估计** |
|------------|------------------------------------------|--------------------------------------------|
| **Objective** | Estimate parameter $\theta$ that maximizes the likelihood of the observed data. | Estimate parameter $\theta$ that maximizes the posterior probability given the data. |
| **Optimization Goal** | $\displaystyle \hat{\theta}_{\text{MLE}} = \arg\max_{\theta} P(D \mid \theta)$ | $\displaystyle \hat{\theta}_{\text{MAP}} = \arg\max_{\theta} P(\theta \mid D)$ |
| **Formula Derivation** | Maximize the likelihood: <br> $\displaystyle \mathcal{L}(\theta) = \prod_{i=1}^{n} P(x_i \mid \theta)$ <br> Take the log: <br> $\displaystyle \log \mathcal{L}(\theta) = \sum_{i=1}^{n} \log P(x_i \mid \theta)$ <br> Then: <br> $\displaystyle \hat{\theta}_{\text{MLE}} = \arg\max_{\theta} \log P(D \mid \theta)$ | Use Bayes’ Theorem: <br> $\displaystyle P(\theta \mid D) = \frac{P(D \mid \theta) P(\theta)}{P(D)}$ <br> Ignore constant $P(D)$: <br> $\displaystyle \hat{\theta}_{\text{MAP}} = \arg\max_{\theta} P(D \mid \theta) P(\theta)$ <br> or log-form: <br> $\displaystyle \hat{\theta}_{\text{MAP}} = \arg\max_{\theta} \left[ \log P(D \mid \theta) + \log P(\theta) \right]$ |
| **Includes Prior?** | ❌ No | ✅ Yes |
| **Sensitivity to Prior** | Not sensitive (no prior used) | Sensitive to prior choice |
| **Overfitting Risk** | Higher, especially for small data | Lower, prior acts as regularizer |
| **Asymptotic Behavior** | As $n \to \infty$, MLE is consistent | As $n \to \infty$, MAP $\to$ MLE |
| **Computational Complexity** | Lower (no prior term) | Higher (includes prior) |
| **Interpretation** | Frequentist — parameters are fixed | Bayesian — parameters are random variables |
| **Uniform Prior Case** | Coincides with MAP when the prior $P(\theta)$ is uniform | Reduces to MLE when $P(\theta)$ is uniform (flat prior) |
| **Regularization View** | No regularization | Prior acts like regularization <br> Gaussian prior $\Rightarrow L_2$ <br> Laplace prior $\Rightarrow L_1$ |
| **Example: Gaussian Likelihood** | $x_i \sim \mathcal{N}(\mu, \sigma^2)$ <br> $\displaystyle \hat{\mu}_{\text{MLE}} = \frac{1}{n} \sum x_i$ | Prior: $\mu \sim \mathcal{N}(\mu_0, \tau^2)$ <br> $\displaystyle \hat{\mu}_{\text{MAP}} = \frac{n\sigma^{-2}}{n\sigma^{-2} + \tau^{-2}} \bar{x} + \frac{\tau^{-2}}{n\sigma^{-2} + \tau^{-2}} \mu_0$ |

</div>


In [None]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.utils import resample, shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, accuracy_score
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Input dataset, read from the current working directory (the file name
# suggests bars plus labels). Alternative files kept for quick switching:
#   'bar_and_label.parquet'
#   'bar_and_label_trend_3to24.parquet'
parquet_path = os.path.join(os.getcwd(), 'bar_and_label_trend_1to8.parquet')
raw_data = pd.read_parquet(parquet_path)

# Work on a copy so raw_data itself is never modified by later cells.
# To experiment on the trailing 60*24*7 rows only, use instead:
#   df = raw_data[-(60*24*7):].copy()
df = raw_data.copy()

# H_PER_D = 23                                # trade hours per day
# P_PER_B = 5                                 # equivalent bar period
# EMA_VOL_SPAN = int(60/P_PER_B * H_PER_D)    # span for EMA volatility (daily)
# CUSUM_FACTOR = 0.6                          # multiplier for CUSUM threshold
# 
# # --- VOLATILITY AND FILTER ---
# label = df[['label']].copy()
# label['ref'] = np.log((df['close'] / df['close'].iloc[0]).fillna(1))
# # EMA-based volatility on log returns
# label['return'] = df['close'].pct_change().fillna(0)
# # daily vol (volume/run bars has more constant volatility for their homoscedasticity)
# label['pos_return'] = label['return'].where(label['return'] > 0, 0.0001)
# label['neg_return'] = label['return'].where(label['return'] < 0, 0.0001)
# label['pos_vol'] = label['pos_return'].ewm(span=EMA_VOL_SPAN, adjust=False).std().replace(0, np.nan).ffill().bfill()
# label['neg_vol'] = label['neg_return'].ewm(span=EMA_VOL_SPAN, adjust=False).std().replace(0, np.nan).ffill().bfill()
# 
# # CUSUM to mark breakout events and direction
# s_pos, s_neg = 0.0, 0.0
# label['event'] = 0.0 # np.nan
# for i in range(1, len(label)):
#     # note that for time i, the label/prediction is calculated after all info of that time is known
#     pos_threshold = label['pos_vol'].iloc[i] * CUSUM_FACTOR
#     neg_threshold = label['neg_vol'].iloc[i] * CUSUM_FACTOR
#     diff = label['return'].iloc[i]
#     s_pos = max(0, s_pos + diff)
#     s_neg = min(0, s_neg + diff)
#     index = label.index[i]
#     if s_pos > pos_threshold:
#         label.loc[index, 'event'] = 1
#         s_pos = 0.0
#     elif s_neg < -neg_threshold:
#         label.loc[index, 'event'] = -1
#         s_neg = 0.0
# 
# index = label.index[label['event'] != 0]
# 
# print(df.shape)
# df.drop(columns=['open', 'high', 'low', 'close', 'uniqueness'], axis=1, inplace=True)
# df = df.loc[index]
# print(df.shape)

# | k    | P(-k ≤ Z ≤ k)          | Approx. % within ±kσ range         |
# |------|------------------------|------------------------------------|
# | 0.1  | 0.0797                 | ~7.97% within ±0.1σ                |
# | 0.2  | 0.1585                 | ~15.85% within ±0.2σ               |
# | 0.3  | 0.2358                 | ~23.58% within ±0.3σ               |
# | 0.4  | 0.3108                 | ~31.08% within ±0.4σ               |
# | 0.5  | 0.3829                 | ~38.29% within ±0.5σ               |
# | 1.0  | 0.6827                 | ~68.27% within ±1σ (1-sigma rule)  |
# | 1.5  | 0.8664                 | ~86.64% within ±1.5σ               |
# | 2.0  | 0.9545                 | ~95.45% within ±2σ (2-sigma rule)  |
# | 2.5  | 0.9876                 | ~98.76% within ±2.5σ               |

# Continuous target column; the class bands used by classify() come from its spread.
label = df['label']
# Neutral-band half-width: half of the label's sample standard deviation.
# Assumes the label is (a scaled version of) a normal distribution, in which
# case roughly 38% of the rows fall inside +/-0.5 sigma.
sigma = 0.5 * label.std()
def classify(x, threshold=None):
    """Map a continuous label value onto a ternary class.

    Parameters
    ----------
    x : float
        Label value to classify.
    threshold : float, optional
        Half-width of the neutral band. When omitted, the notebook-level
        ``sigma`` is read at call time, which is the original behaviour and
        keeps ``series.apply(classify)`` working unchanged.

    Returns
    -------
    int
        -1 if ``x < -threshold``, 1 if ``x > threshold``, otherwise 0.
        Values exactly on a boundary fall into class 0, and so does NaN
        (every comparison against NaN is False).
    """
    if threshold is None:
        # Fall back to the global so existing single-argument callers still work.
        threshold = sigma
    if x < -threshold:
        return -1
    if x > threshold:
        return 1
    return 0
# Discretise the continuous label into the ternary target (-1 / 0 / 1).
y = df['label'].apply(classify)

# Everything except the label column becomes the feature matrix.
X = df.drop(columns=['label'])

# Class balance of the discretised target.
class_counts = y.value_counts()
print(class_counts)

# Hold-out split into a training part and a test part (no separate validation
# set is created here). shuffle=False keeps the original row order, so the
# last 20% of the rows form the test set.
X_training_test = X
y_training_test = y
X_train, X_test, y_train, y_test = train_test_split(
    X_training_test,
    y_training_test,
    test_size=0.2,
    shuffle=False,
)

# Random-forest hyper-parameters.
n_estimator = 100
depth = 5
c_random_state = 42

# Random Forest Model: entropy split criterion, fixed seed, and out-of-bag
# scoring enabled so an accuracy estimate is available without extra data.
rf = RandomForestClassifier(
    n_estimators=n_estimator,
    max_depth=depth,
    criterion='entropy',
    oob_score=True,
    random_state=c_random_state,
)
rf.fit(X_train, y_train.to_numpy().ravel())
oob_message = "Out-of-bag Accuracy (OOB Score): {:.6f}".format(rf.oob_score_)
print(oob_message)

In [None]:
# K-fold cross-validation of the random forest: refit on each training fold
# and record the accuracy on the matching held-out fold.
no_of_folds = 5
# NOTE(review): shuffle=True mixes rows from all positions into every fold.
# If the rows are time-ordered (the hold-out split earlier in this notebook
# uses shuffle=False), neighbouring rows end up on both sides of the split and
# the accuracy below is likely optimistic -- consider TimeSeriesSplit or a
# purged K-fold before trusting this number.
kfold = KFold(n_splits=no_of_folds, shuffle=True, random_state=1)
print(kfold)

accuracy_array = np.zeros(no_of_folds)
# Split on the same frame that is indexed below; only the row count matters
# for KFold, so this yields the same folds as splitting on the raw values.
for fold_idx, (train_index, test_index) in enumerate(kfold.split(X)):
    # NOTE: these names overwrite the hold-out split variables, and rf is
    # refit in place, so after this cell they all refer to the last fold.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    rf.fit(X_train, np.array(y_train).ravel())

    # predict_proba columns follow rf.classes_; with a -1/0/1 target, index 1
    # is presumably the probability of class 0 -- TODO confirm before using
    # it as a score. Not used inside this loop; kept for later cells.
    y_pred_rf = rf.predict_proba(X_test)[:, 1]
    y_pred = rf.predict(X_test)
    accuracy_array[fold_idx] = accuracy_score(y_test, y_pred)

print(accuracy_array)
print("Mean KFold accuracy: {:.6f}".format(np.mean(accuracy_array)))
