In [2]:
import pandas as pd
import numpy as np
import scipy.stats
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.ar_model import AutoReg
from scipy.stats import theilslopes
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import logging
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 1. Read dataset
# Load the two training parquet files; the relative paths resolve against the
# notebook's current working directory.
print("Loading data...")
# X_train: slices of it (X_train.loc[id]) are later passed to the feature
# functions, which read its 'value' and 'period' columns.
X_train = pd.read_parquet('../data/X_train.parquet')
# y_train: looked up with the same id (y_train.loc[id]) next to the features.
y_train = pd.read_parquet('../data/y_train.parquet')
print("Data loaded.")

Loading data...
Data loaded.


In [4]:
import antropy

In [None]:
# --- 8. Entropy features ---
# @feature
def entropy_features(u: pd.DataFrame) -> dict:
    """Compute entropy/complexity features for both periods of one series.

    The rows of ``u`` are split by ``u['period']`` into the values with
    period == 0 and the values with period == 1. Every measure is evaluated
    on each part and stored as ``<name>_0``, ``<name>_1`` and
    ``<name>_diff`` (period 1 minus period 0).

    Measures, in output order: histogram Shannon entropy, then the
    ``antropy`` functions ``perm_entropy``, ``spectral_entropy``,
    ``svd_entropy``, ``app_entropy``, ``sample_entropy``, ``hjorth_params``
    (mobility and complexity), ``num_zerocross``, ``lziv_complexity`` (on
    the series binarised around its median), and finally a binned lag-1
    conditional entropy.

    Parameters
    ----------
    u : pd.DataFrame
        Frame with a numeric ``'value'`` column and a ``'period'`` column
        whose entries are 0 or 1.

    Returns
    -------
    dict
        Feature name -> float. Any non-finite result (NaN, +inf, -inf) is
        replaced by 0.0 so downstream consumers only ever see finite floats.
    """
    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()
    feats = {}

    def add_pair(name, fn):
        """Store fn(s1), fn(s2) and their difference under name_0/_1/_diff."""
        feats[f'{name}_0'] = fn(s1)
        feats[f'{name}_1'] = fn(s2)
        feats[f'{name}_diff'] = feats[f'{name}_1'] - feats[f'{name}_0']

    def compute_entropy(x):
        """Shannon entropy of the automatically binned value histogram."""
        hist, _ = np.histogram(x, bins='auto', density=True)
        hist = hist[hist > 0]
        # scipy.stats.entropy renormalises its input to sum to 1; bins are
        # equal-width, so this equals the entropy of the bin probabilities.
        return scipy.stats.entropy(hist)

    def binarize_around_median(x):
        """Encode x as a '0'/'1' string: '1' where the value exceeds the median."""
        # Other symbolisations (quantiles, multi-symbol alphabets) could be
        # added here if needed.
        return ''.join(np.where(x > np.median(x), '1', '0'))

    def plugin_entropy(p):
        """Entropy (in nats) of a probability table, ignoring empty cells."""
        p = p[p > 0]
        return -np.sum(p * np.log(p))

    def estimate_cond_entropy(x, lag=1, bins=10):
        """Binned estimate of H(x[t] | x[t-lag]) in nats.

        Uses probability masses (counts / total), not histogram densities:
        densities are not probabilities, and plugging them into
        -sum(p * log p) gives a value that changes with the scale of x.
        The marginal is taken from the joint table so both entropies share
        the same bins, which keeps the result within [0, log(bins)].
        """
        if x.size <= lag:
            return np.nan  # no (lagged, current) pairs to count
        x = x - np.mean(x)
        x_lag, x_now = x[:-lag], x[lag:]
        counts, _, _ = np.histogram2d(x_lag, x_now, bins=bins)
        p_joint = counts / counts.sum()
        p_lag = p_joint.sum(axis=1)
        return plugin_entropy(p_joint) - plugin_entropy(p_lag)

    # Shannon entropy
    add_pair('shannon_entropy', compute_entropy)

    # Permutation entropy
    add_pair('perm_entropy', lambda x: antropy.perm_entropy(x, normalize=True))

    # Spectral entropy
    add_pair('spectral_entropy',
             lambda x: antropy.spectral_entropy(x, sf=1.0, normalize=True))

    # SVD entropy
    add_pair('svd_entropy', lambda x: antropy.svd_entropy(x, normalize=True))

    # Approximate entropy
    add_pair('approx_entropy', antropy.app_entropy)

    # Sample entropy
    add_pair('sample_entropy', antropy.sample_entropy)

    # Hjorth mobility and complexity (one call returns both values)
    feats['hjorth_mobility_0'], feats['hjorth_complexity_0'] = antropy.hjorth_params(s1)
    feats['hjorth_mobility_1'], feats['hjorth_complexity_1'] = antropy.hjorth_params(s2)
    for part in ('mobility', 'complexity'):
        feats[f'hjorth_{part}_diff'] = feats[f'hjorth_{part}_1'] - feats[f'hjorth_{part}_0']

    # Number of zero-crossings
    add_pair('num_zerocross', antropy.num_zerocross)

    # Lempel-Ziv complexity of the median-binarised series
    add_pair('lziv_complexity',
             lambda x: antropy.lziv_complexity(binarize_around_median(x), normalize=True))

    # Lag-1 conditional entropy
    add_pair('cond_entropy', estimate_cond_entropy)

    # np.isfinite rejects NaN and +/-inf alike (e.g. inf - inf in a diff).
    return {k: float(v) if np.isfinite(v) else 0.0 for k, v in feats.items()}

In [None]:
# --- 9. Fractal dimension ---
# @feature
def fractal_dimension_features(u: pd.DataFrame) -> dict:
    """Compute fractal-dimension style features for both periods of one series.

    The rows of ``u`` are split by ``u['period']`` into the values with
    period == 0 and the values with period == 1. Each of the ``antropy``
    functions ``petrosian_fd``, ``katz_fd``, ``higuchi_fd`` and
    ``detrended_fluctuation`` is called with its default arguments on both
    parts; results are stored as ``<name>_0``, ``<name>_1`` and
    ``<name>_diff`` (period 1 minus period 0).

    Parameters
    ----------
    u : pd.DataFrame
        Frame with a numeric ``'value'`` column and a ``'period'`` column
        whose entries are 0 or 1.

    Returns
    -------
    dict
        Feature name -> float. Any non-finite result (NaN, +inf, -inf) is
        replaced by 0.0 so downstream consumers only ever see finite floats.
    """
    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()

    # (feature prefix, estimator) in the order the keys must appear.
    estimators = (
        ('petrosian_fd', antropy.petrosian_fd),
        ('katz_fd', antropy.katz_fd),
        ('higuchi_fd', antropy.higuchi_fd),
        ('detrended_fluctuation', antropy.detrended_fluctuation),
    )

    feats = {}
    for name, estimator in estimators:
        feats[f'{name}_0'] = estimator(s1)
        feats[f'{name}_1'] = estimator(s2)
        feats[f'{name}_diff'] = feats[f'{name}_1'] - feats[f'{name}_0']

    # np.isfinite rejects NaN and +/-inf alike (e.g. inf - inf in a diff).
    return {k: float(v) if np.isfinite(v) else 0.0 for k, v in feats.items()}

In [7]:
# Sanity check: for index labels 0..14, print the fractal-dimension features
# followed by the matching y_train entry so the two can be compared by eye.
for idx in range(15):
    # X_train.loc[idx] selects the rows stored under index label idx.
    feats = fractal_dimension_features(X_train.loc[idx])
    print(feats)
    print(y_train.loc[idx])

{'petrosian_fd_0': 1.034193067044591, 'petrosian_fd_1': 1.0417460055878138, 'petrosian_fd_diff': 0.00755293854322292, 'katz_fd_0': 5.202100660768645, 'katz_fd_1': 3.677508644821598, 'katz_fd_diff': -1.524592015947047, 'higuchi_fd_0': 2.030685773219249, 'higuchi_fd_1': 2.016132987187854, 'higuchi_fd_diff': -0.014552786031395204, 'detrended_fluctuation_0': 0.25121816818237125, 'detrended_fluctuation_1': 0.46007167285528444, 'detrended_fluctuation_diff': 0.2088535046729132}
structural_breakpoint    False
Name: 0, dtype: bool
{'petrosian_fd_0': 1.031171873816162, 'petrosian_fd_1': 1.0424309224064583, 'petrosian_fd_diff': 0.011259048590296361, 'katz_fd_0': 3.575005827046129, 'katz_fd_1': 3.1589246511622058, 'katz_fd_diff': -0.4160811758839231, 'higuchi_fd_0': 1.9924505970346986, 'higuchi_fd_1': 2.0253631394946767, 'higuchi_fd_diff': 0.0329125424599781, 'detrended_fluctuation_0': 0.5035297142746277, 'detrended_fluctuation_1': 0.4346302145825713, 'detrended_fluctuation_diff': -0.0688994996920