In [1]:
import pandas as pd
import numpy as np
import scipy.stats

import os
import re
import sys
import json
import time
import logging
import inspect
import typing
import joblib
from pathlib import Path
from tqdm.auto import tqdm
from datetime import datetime
from joblib import Parallel, delayed
from typing import List, Dict, Tuple, Optional
import warnings
from statsmodels.tools.sm_exceptions import InterpolationWarning

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_logger(name: str, log_dir: Path):
    """
    Build a logger that writes a detailed, timestamped log file and echoes
    concise messages to stdout.

    Args:
        name: Base name. It is lower-cased for the log file name and combined
            with the timestamp to form the logger name.
        log_dir: Directory for the log file; created (with parents) if missing.

    Returns:
        tuple: ``(logger, detail_log_file)`` - the configured
        ``logging.Logger`` and the ``Path`` of the file it appends to.
    """
    # Make sure the log directory exists.
    log_dir.mkdir(exist_ok=True, parents=True)

    # 1. Second-resolution timestamp shared by the file name and the logger name.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    detail_log_file = log_dir / f'{name.lower()}_{timestamp}.log'

    # 2. Timestamped logger name so separate runs do not share a logger.
    logger = logging.getLogger(f"{name}-{timestamp}")
    logger.setLevel(logging.INFO)

    # Keep records from propagating to the root logger.
    logger.propagate = False

    # Two calls within the same second return the same logger object. Replace
    # its handlers, closing the old ones so the previously opened log file
    # handle is released instead of leaking.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # 3. File handler with the detailed format.
    detail_handler = logging.FileHandler(detail_log_file, mode='a', encoding='utf-8')
    detail_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
    detail_handler.setFormatter(detail_formatter)
    logger.addHandler(detail_handler)

    # 4. Console handler that prints only the bare message.
    console_handler = logging.StreamHandler(sys.stdout)
    console_formatter = logging.Formatter('%(message)s')
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    return logger, detail_log_file

# Module-level placeholders, initialised to None in this cell.
# NOTE(review): presumably filled later from get_logger()'s return value - confirm.
logger = None
log_file_path = None

In [3]:
import crunch

# Load the Crunch tooling for this notebook.
# Note: this rebinds the name `crunch` from the imported module to the object
# returned by load_notebook(), so the module is no longer reachable by that name.
crunch = crunch.load_notebook()

loaded inline runner with module: <module '__main__'>

cli version: 6.6.1
available ram: 15.73 gb
available cpu: 16 core
----


In [4]:
# Names flagged as experimental.
# NOTE(review): presumably names of registered feature functions - confirm against the consumer.
EXPERIMENTAL_FEATURES = [
    "wavelet_features",
] 
# Feature column names to keep.
# NOTE(review): the "RAW_<id>_" prefix looks like it maps to a feature function's
# func_id; how this list is applied is not visible in this part of the notebook - confirm.
REMAIN_FEATURES = ['RAW_1_stats_mean_right', 'RAW_1_stats_mean_whole', 'RAW_1_stats_median_right', 'RAW_1_stats_median_ratio', 'RAW_1_stats_min_whole', 'RAW_1_stats_range_ratio', 'RAW_1_stats_std_whole', 'RAW_1_stats_kurt_left', 'RAW_1_stats_kurt_whole', 'RAW_1_stats_cv_whole', 'RAW_1_stats_mean_of_rolling_std_whole', 'RAW_1_stats_mean_of_rolling_std_ratio', 'RAW_8_perm_entropy_0', 'RAW_8_sample_entropy_0', 'RAW_10_ratio_value_number_to_time_series_length', 'RAW_10_ratio_beyond_r_sigma_1_5', 'RAW_10_percentage_of_reoccurring_values_to_all_values', 'RAW_10_agg_linear_trend_attr_rvalue_chunk_len_50_f_agg_mean_', 'RAW_10_agg_linear_trend_attr_rvalue_chunk_len_5_f_agg_max_', 'RAW_2_ks_stat', 'RAW_2_ks_pvalue', 'RAW_2_ad_stat', 'RAW_2_ad_pvalue', 'RAW_2_levene_stat', 'RAW_2_levene_pvalue', 'RAW_2_bartlett_stat', 'RAW_2_bartlett_pvalue', 'RAW_2_shapiro_pvalue_left', 'RAW_2_jb_pvalue_left', 'RAW_2_jb_pvalue_ratio', 'RAW_3_sum_ratio', 'RAW_4_autocorr_lag1_diff', 'RAW_4_autocorr_lag1_ratio', 'RAW_8_sample_entropy_whole', 'RAW_8_sample_entropy_diff', 'RAW_8_hjorth_complexity_diff', 'RAW_10_ratio_value_number_to_time_series_length_left', 'RAW_10_ratio_value_number_to_time_series_length_whole', 'RAW_10_ratio_value_number_to_time_series_length_ratio', 'RAW_10_quantile_0_4_right', 'RAW_10_quantile_0_4_whole', 'RAW_10_quantile_0_4_ratio', 'RAW_10_quantile_0_1_ratio', 'RAW_10_percentage_of_reoccurring_values_to_all_values_left', 'RAW_10_percentage_of_reoccurring_values_to_all_values_ratio', 'RAW_10_percentage_of_reoccurring_datapoints_to_all_datapoints_ratio', 'RAW_10_linear_trend_attr_rvalue_ratio', 'RAW_10_linear_trend_attr_intercept_ratio', 'RAW_10_fft_coefficient_attr_imag_coeff_1_whole', 'RAW_10_change_quantiles_f_agg_var_isabs_True_qh_0_6_ql_0_4_ratio', 'RAW_10_change_quantiles_f_agg_var_isabs_False_qh_0_8_ql_0_4_whole', 'RAW_10_change_quantiles_f_agg_var_isabs_False_qh_0_8_ql_0_0_whole', 'RAW_10_change_quantiles_f_agg_var_isabs_False_qh_0_6_ql_0_2_whole', 
'RAW_10_agg_linear_trend_attr_rvalue_chunk_len_5_f_agg_max_left', 'RAW_10_agg_linear_trend_attr_rvalue_chunk_len_5_f_agg_max_right', 'RAW_11_param_0_whole', 'RAW_11_param_0_ratio', 'RAW_11_param_3_right', 'RAW_11_param_5_diff', 'RAW_12_rpt_cost_rank_left', 'RAW_12_rpt_cost_cosine_whole']

In [5]:
# --- 特征函数注册表 ---
FEATURE_REGISTRY = {}

def register_feature(_func=None, *, parallelizable=True, func_id=""):
    """
    Decorator that records a feature function in ``FEATURE_REGISTRY``.

    Works both bare (``@register_feature``) and with keyword options
    (``@register_feature(parallelizable=False, func_id="3")``). The registry
    entry is keyed by the function's ``__name__`` and stores the function, the
    ``parallelizable`` flag and the ``func_id`` string. The decorated function
    itself is returned unchanged.
    """
    def _record(target):
        entry = dict(func=target, parallelizable=parallelizable, func_id=func_id)
        FEATURE_REGISTRY[target.__name__] = entry
        return target

    # Bare usage passes the function straight in; keyword usage needs the
    # decorator handed back so Python can apply it to the function next.
    return _record if _func is None else _record(_func)

# --- 1. 分布统计特征 ---
def safe_cv(s):
    """
    Coefficient of variation (sample std / mean) of ``s``.

    Returns 0.0 when the mean's magnitude is not above 1e-6 (this also covers a
    NaN mean), so the ratio is never formed with a near-zero denominator.
    """
    series = pd.Series(s)
    center, spread = series.mean(), series.std()
    if abs(center) > 1e-6:
        return spread / center
    return 0.0

def rolling_std_mean(s, window=5):
    """
    Mean of the rolling standard deviation of ``s`` over ``window`` samples.

    Returns 0.0 when the input is shorter than ``window``.
    """
    series = pd.Series(s)
    if len(series) >= window:
        windowed_std = series.rolling(window=window).std()
        return windowed_std.dropna().mean()
    return 0.0

def slope_theil_sen(s):
    """
    Theil-Sen slope of ``s`` against its sample index (0, 1, 2, ...).

    Returns 0.0 for fewer than two samples or if the estimator raises.
    """
    series = pd.Series(s)
    n_points = len(series)
    if n_points < 2:
        return 0.0
    try:
        fit = scipy.stats.theilslopes(series.values, np.arange(n_points))
        slope, _intercept, _low, _high = fit
    except Exception:
        return 0.0
    return slope

class STATSFeatureExtractor:
    """
    Computes a fixed set of summary statistics on the left part, the right part
    and the whole of a 1-D signal split at a boundary index.

    ``func_classes`` maps a statistic name to a callable that takes a 1-D array
    and returns a scalar.
    """

    def __init__(self):
        import scipy.stats
        # Every available statistic, keyed by the name used in the output.
        self.func_classes = {
            'mean': np.mean,
            'median': np.median,
            'max': np.max,
            'min': np.min,
            'range': lambda x: np.max(x) - np.min(x),
            'std': np.std,
            'skew': scipy.stats.skew,
            'kurt': scipy.stats.kurtosis,
            'cv': safe_cv,
            'mean_of_rolling_std': rolling_std_mean,
            'theil_sen_slope': slope_theil_sen
        }

    def fit(self, signal):
        """Store ``signal`` as a NumPy array together with its length."""
        self.signal = np.asarray(signal)
        self.n = len(self.signal)

    def calculate(self, func, start, end):
        """
        Apply ``func`` to ``self.signal[start:end]`` and return a scalar.

        ``float``/``int`` results (including subclasses such as ``np.float64``)
        are returned as they are; anything else is unwrapped with ``.item()``.
        """
        result = func(self.signal[start:end])
        if isinstance(result, (float, int)):
            return result
        return result.item()

    def extract(self, signal, boundary):
        """
        Compute every statistic on both sides of ``boundary`` and on the whole signal.

        Args:
            signal: 1-D array-like, a univariate time series. It is (re)fitted
                here, so a prior ``fit()`` call is optional. Passing ``None``
                reuses the signal from the last ``fit()``.
            boundary: int, index of the first sample of the right segment.

        Returns:
            dict: ``{name: {'left', 'right', 'whole', 'diff', 'ratio'}}`` where
            ``diff = right - left`` and ``ratio = right / (left + 1e-6)``.
            If anything fails for a statistic, all five of its values are ``None``.
        """
        # Previously the `signal` argument was ignored and a missing fit()
        # caused an AttributeError; fitting here makes extract() self-contained.
        if signal is not None:
            self.fit(signal)

        n = self.n
        result = {}
        for name, func in self.func_classes.items():
            try:
                left = self.calculate(func, 0, boundary)
                right = self.calculate(func, boundary, n)
                whole = self.calculate(func, 0, n)
                diff = right - left
                ratio = right / (left + 1e-6)
            except Exception:
                # Best-effort: one failing statistic must not abort the others.
                left = right = whole = diff = ratio = None
            result[name] = {'left': left, 'right': right, 'whole': whole, 'diff': diff, 'ratio': ratio}
        return result

@register_feature(func_id="1")
def distribution_stats_features(u: pd.DataFrame) -> dict:
    """
    Per-segment summary statistics plus their diff and ratio.

    Args:
        u: Frame with a ``value`` column and a ``period`` column. ``period``
            must change value exactly once along the rows.

    Returns:
        dict: ``stats_<name>_<left|right|whole|diff|ratio>`` -> float. Entries
        that are ``None`` (the extractor's failure marker) or NaN become 0.

    Raises:
        ValueError: if ``period`` does not contain exactly one change point.
    """
    signal = u['value'].values.astype(np.float32)
    period = u['period'].values.astype(np.float32)

    change_points = np.flatnonzero(np.diff(period) != 0)
    if change_points.size != 1:
        raise ValueError(
            f"expected exactly one period change point, found {change_points.size}"
        )
    # np.diff index i marks a change between samples i and i + 1, so the right
    # segment starts at i + 1. Using i itself put the last sample of the first
    # period into the right segment.
    boundary = int(change_points[0]) + 1

    extractor = STATSFeatureExtractor()
    extractor.fit(signal)
    features = extractor.extract(signal, boundary)

    feats = {}
    for stat_name, by_segment in features.items():
        for seg, seg_value in by_segment.items():
            feats[f'stats_{stat_name}_{seg}'] = seg_value

    # The extractor reports a failed statistic as None; np.isnan(None) would
    # raise TypeError, so None is mapped to 0 just like NaN.
    return {
        k: float(v) if v is not None and not np.isnan(v) else 0
        for k, v in feats.items()
    }
    
# --- 2. 假设检验统计量特征 ---
@register_feature(func_id="2")
def test_stats_features(u: pd.DataFrame) -> dict:
    """
    Hypothesis-test statistics comparing period 0 (left) with period 1 (right).

    Two groups of features are produced:
      * two-sample tests between the segments (KS, Welch t, Anderson-Darling
        k-sample, Mann-Whitney U, rank-sum, Levene, Bartlett);
      * single-sample tests (Shapiro-Wilk, Jarque-Bera, KPSS, ADF) evaluated on
        left / right / whole, plus right-left diff and right/left ratio.

    Two-sample p-values are stored negated; where a NaN fallback exists for a
    p-value it is the literal 1. Any NaN remaining at the end is replaced by 0.

    Args:
        u: Frame with ``value`` and ``period`` columns (period 0 / period 1).

    Returns:
        dict: feature name -> float (or 0 where the value was NaN).
    """
    import scipy.stats
    import statsmodels.tsa.api as tsa

    s1 = u['value'][u['period'] == 0]
    s2 = u['value'][u['period'] == 1]
    s_whole = u['value']
    feats = {}

    # ---- Two-sample test statistics ----
    # Kolmogorov-Smirnov test
    ks_stat, ks_pvalue = scipy.stats.ks_2samp(s1, s2)
    feats['ks_stat'] = ks_stat
    feats['ks_pvalue'] = -ks_pvalue

    # Welch's t-test
    _, ttest_pvalue = scipy.stats.ttest_ind(s1, s2, equal_var=False)
    feats['ttest_pvalue'] = -ttest_pvalue if not np.isnan(ttest_pvalue) else 1

    # Anderson-Darling k-sample test
    ad_stat, _, ad_pvalue = scipy.stats.anderson_ksamp([s1.to_numpy(), s2.to_numpy()])
    feats['ad_stat'] = ad_stat
    feats['ad_pvalue'] = -ad_pvalue

    # Mann-Whitney U test (non-parametric, no distributional assumption)
    mw_stat, mw_pvalue = scipy.stats.mannwhitneyu(s1, s2, alternative='two-sided')
    feats['mannwhitney_stat'] = mw_stat if not np.isnan(mw_stat) else 0
    feats['mannwhitney_pvalue'] = -mw_pvalue if not np.isnan(mw_pvalue) else 1

    # Wilcoxon rank-sum test
    try:
        w_stat, w_pvalue = scipy.stats.ranksums(s1, s2)
        feats['wilcoxon_stat'] = w_stat if not np.isnan(w_stat) else 0
        feats['wilcoxon_pvalue'] = -w_pvalue if not np.isnan(w_pvalue) else 1
    except ValueError:
        feats['wilcoxon_stat'] = 0
        feats['wilcoxon_pvalue'] = 1

    # Levene test
    levene_stat, levene_pvalue = scipy.stats.levene(s1, s2)
    feats['levene_stat'] = levene_stat if not np.isnan(levene_stat) else 0
    feats['levene_pvalue'] = -levene_pvalue if not np.isnan(levene_pvalue) else 1

    # Bartlett test
    bartlett_stat, bartlett_pvalue = scipy.stats.bartlett(s1, s2)
    feats['bartlett_stat'] = bartlett_stat if not np.isnan(bartlett_stat) else 0
    feats['bartlett_pvalue'] = -bartlett_pvalue if not np.isnan(bartlett_pvalue) else 1

    # ---- Per-segment tests: segment values, diff and ratio ----
    # Shapiro-Wilk test; only run for sample sizes in (2, 5000], otherwise NaN.
    sw1_pvalue = sw2_pvalue = sw_whole_pvalue = np.nan
    if len(s1) <= 5000 and len(s1) > 2:
        _, sw1_pvalue = scipy.stats.shapiro(s1)
    if len(s2) <= 5000 and len(s2) > 2:
        _, sw2_pvalue = scipy.stats.shapiro(s2)
    if len(s_whole) <= 5000 and len(s_whole) > 2:
        _, sw_whole_pvalue = scipy.stats.shapiro(s_whole)

    feats['shapiro_pvalue_left'] = sw1_pvalue
    feats['shapiro_pvalue_right'] = sw2_pvalue
    feats['shapiro_pvalue_whole'] = sw_whole_pvalue
    feats['shapiro_pvalue_diff'] = sw2_pvalue - sw1_pvalue if not (np.isnan(sw1_pvalue) or np.isnan(sw2_pvalue)) else 0
    feats['shapiro_pvalue_ratio'] = sw2_pvalue / (sw1_pvalue + 1e-6) if not (np.isnan(sw1_pvalue) or np.isnan(sw2_pvalue)) else 0

    # Jarque-Bera test
    jb1_pvalue = jb2_pvalue = jb_whole_pvalue = np.nan
    try:
        if len(s1) > 2:
            _, jb1_pvalue = scipy.stats.jarque_bera(s1)
        if len(s2) > 2:
            _, jb2_pvalue = scipy.stats.jarque_bera(s2)
        if len(s_whole) > 2:
            _, jb_whole_pvalue = scipy.stats.jarque_bera(s_whole)
    except Exception:
        # Best-effort: p-values not computed before the failure stay NaN.
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit pass through.
        pass

    feats['jb_pvalue_left'] = jb1_pvalue
    feats['jb_pvalue_right'] = jb2_pvalue
    feats['jb_pvalue_whole'] = jb_whole_pvalue
    feats['jb_pvalue_diff'] = jb2_pvalue - jb1_pvalue if not (np.isnan(jb1_pvalue) or np.isnan(jb2_pvalue)) else 0
    feats['jb_pvalue_ratio'] = jb2_pvalue / (jb1_pvalue + 1e-6) if not (np.isnan(jb1_pvalue) or np.isnan(jb2_pvalue)) else 0

    # KPSS test
    def extract_kpss_features(s):
        # Segments of 12 samples or fewer get fixed neutral values instead of a test.
        if len(s) <= 12:
            return {'p': 0.1, 'stat': 0.0, 'lag': 0, 'crit_5pct': 0.0, 'reject_5pct': 0}
        kpss = tsa.stattools.kpss(s, regression='c', nlags='auto')
        stat, p, lag, crit = kpss
        crit_5pct = crit['5%']
        return {
            'p': p,
            'stat': stat,
            'lag': lag,
            'crit_5pct': crit_5pct,
            # KPSS null hypothesis is stationarity: stat above the critical value rejects it.
            'reject_5pct': int(stat > crit_5pct)
        }
    try:
        k1 = extract_kpss_features(s1)
        k2 = extract_kpss_features(s2)
        k_whole = extract_kpss_features(s_whole)

        feats['kpss_pvalue_left'] = k1['p']
        feats['kpss_pvalue_right'] = k2['p']
        feats['kpss_pvalue_whole'] = k_whole['p']
        feats['kpss_pvalue_diff'] = k2['p'] - k1['p']
        feats['kpss_pvalue_ratio'] = k2['p'] / (k1['p'] + 1e-6)

        feats['kpss_stat_left'] = k1['stat']
        feats['kpss_stat_right'] = k2['stat']
        feats['kpss_stat_whole'] = k_whole['stat']
        feats['kpss_stat_diff'] = k2['stat'] - k1['stat']
        feats['kpss_stat_ratio'] = k2['stat'] / (k1['stat'] + 1e-6)
    except Exception:
        feats.update({
            'kpss_pvalue_left': 1, 'kpss_pvalue_right': 1, 'kpss_pvalue_whole': 1, 'kpss_pvalue_diff': 0, 'kpss_pvalue_ratio': 0,
            'kpss_stat_left': 0, 'kpss_stat_right': 0, 'kpss_stat_whole': 0, 'kpss_stat_diff': 0, 'kpss_stat_ratio': 0
        })

    # Stationarity test (ADF)
    def extract_adf_features(s):
        # Segments of 12 samples or fewer get fixed neutral values instead of a test.
        if len(s) <= 12:
            return {'p': 1.0, 'stat': 0.0, 'lag': 0, 'ic': 0.0, 'crit_5pct': 0.0, 'reject_5pct': 0}
        adf = tsa.stattools.adfuller(s, autolag='AIC')
        stat, p, lag, _, crit, ic = adf
        crit_5pct = crit['5%']
        return {
            'p': p,
            'stat': stat,
            'lag': lag,
            'ic': ic,
            'crit_5pct': crit_5pct,
            'reject_5pct': int(stat < crit_5pct)
        }
    try:
        f1 = extract_adf_features(s1)
        f2 = extract_adf_features(s2)
        f_whole = extract_adf_features(s_whole)

        feats['adf_pvalue_left'] = f1['p']
        feats['adf_pvalue_right'] = f2['p']
        feats['adf_pvalue_whole'] = f_whole['p']
        feats['adf_pvalue_diff'] = f2['p'] - f1['p']
        feats['adf_pvalue_ratio'] = f2['p'] / (f1['p'] + 1e-6)

        feats['adf_stat_left'] = f1['stat']
        feats['adf_stat_right'] = f2['stat']
        feats['adf_stat_whole'] = f_whole['stat']
        feats['adf_stat_diff'] = f2['stat'] - f1['stat']
        feats['adf_stat_ratio'] = f2['stat'] / (f1['stat'] + 1e-6)

        feats['adf_icbest_left'] = f1['ic']
        feats['adf_icbest_right'] = f2['ic']
        feats['adf_icbest_whole'] = f_whole['ic']
        feats['adf_icbest_diff'] = f2['ic'] - f1['ic']
        feats['adf_icbest_ratio'] = f2['ic'] / (f1['ic'] + 1e-6)
    except Exception:
        feats.update({
            'adf_pvalue_left': 1, 'adf_pvalue_right': 1, 'adf_pvalue_whole': 1, 'adf_pvalue_diff': 0, 'adf_pvalue_ratio': 0,
            'adf_stat_left': 0, 'adf_stat_right': 0, 'adf_stat_whole': 0, 'adf_stat_diff': 0, 'adf_stat_ratio': 0,
            'adf_icbest_left': 0, 'adf_icbest_right': 0, 'adf_icbest_whole': 0, 'adf_icbest_diff': 0, 'adf_icbest_ratio': 0
        })

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 3. 累积和特征 ---
@register_feature(func_id="3")
def cumulative_features(u: pd.DataFrame) -> dict:
    """
    Sum and running-sum maximum for period 0 (left), period 1 (right) and the
    whole series, plus right-left diff and right/(left + 1e-6) ratio.

    Returns a dict of ``sum_*`` and ``cumsum_max_*`` floats; NaN becomes 0.
    """
    values = u['value']
    left = values[u['period'] == 0]
    right = values[u['period'] == 1]

    reducers = (
        ('sum', lambda seg: seg.sum()),
        ('cumsum_max', lambda seg: seg.cumsum().max()),
    )

    feats = {}
    for label, reduce_fn in reducers:
        v_left, v_right, v_whole = reduce_fn(left), reduce_fn(right), reduce_fn(values)
        feats[f'{label}_left'] = v_left
        feats[f'{label}_right'] = v_right
        feats[f'{label}_whole'] = v_whole
        feats[f'{label}_diff'] = v_right - v_left
        feats[f'{label}_ratio'] = v_right / (v_left + 1e-6)

    cleaned = {}
    for key, val in feats.items():
        cleaned[key] = 0 if np.isnan(val) else float(val)
    return cleaned

# --- 4. 振荡特征 ---
@register_feature(func_id="4")
def oscillation_features(u: pd.DataFrame) -> dict:
    """
    Oscillation descriptors for left / right / whole, plus diff and ratio:
    mean-crossing count, lag-1 autocorrelation and variance of the first
    difference.

    Returns a dict of ``zero_cross_*``, ``autocorr_lag1_*`` and ``diff_var_*``
    floats; NaN becomes 0.
    """
    values = u['value']
    left = values[u['period'] == 0].reset_index(drop=True)
    right = values[u['period'] == 1].reset_index(drop=True)
    whole = values.reset_index(drop=True)

    def count_zero_crossings(series: pd.Series):
        # Number of sign changes of the mean-centred series.
        if len(series) < 2:
            return 0
        centered = series - series.mean()
        if centered.eq(0).all():
            return 0
        sign_changes = np.diff(np.sign(centered)) != 0
        return np.sum(sign_changes)

    def autocorr_lag1(series):
        # Lag-1 autocorrelation, with NaN mapped to 0.0.
        if len(series) < 2:
            return 0.0
        rho = series.autocorr(lag=1)
        return 0.0 if np.isnan(rho) else rho

    def diff_variance(series):
        return series.diff().var()

    measures = (
        ('zero_cross', count_zero_crossings),
        ('autocorr_lag1', autocorr_lag1),
        ('diff_var', diff_variance),
    )

    feats = {}
    for label, measure in measures:
        v_left, v_right, v_whole = measure(left), measure(right), measure(whole)
        feats[f'{label}_left'] = v_left
        feats[f'{label}_right'] = v_right
        feats[f'{label}_whole'] = v_whole
        feats[f'{label}_diff'] = v_right - v_left
        feats[f'{label}_ratio'] = v_right / (v_left + 1e-6)

    cleaned = {}
    for key, val in feats.items():
        cleaned[key] = 0 if np.isnan(val) else float(val)
    return cleaned

# --- 5. 周期性特征 ---
@register_feature(func_id="5")
def cyclic_features(u: pd.DataFrame) -> dict:
    """
    Dominant FFT frequency and its power for left / right / whole, plus diff
    and ratio.

    Only bins ``1 .. N//2 - 1`` are inspected (bin 0 is skipped). Returns a
    dict of ``dominant_freq_*`` and ``max_power_*`` floats; NaN becomes 0.
    """
    values = u['value']
    segments = (values[u['period'] == 0], values[u['period'] == 1], values)

    def get_fft_props(series):
        n_samples = len(series)
        if n_samples < 2:
            return 0.0, 0.0

        half = n_samples // 2
        spectrum = np.fft.fft(series.values)
        power = np.abs(spectrum[1:half]) ** 2
        freqs = np.fft.fftfreq(n_samples, 1)[1:half]

        # Very short inputs leave no bins to inspect.
        if len(power) == 0:
            return 0.0, 0.0

        peak = np.argmax(power)
        return freqs[peak], np.max(power)

    props = [get_fft_props(seg) for seg in segments]

    feats = {}
    for idx, label in enumerate(('dominant_freq', 'max_power')):
        v_left, v_right, v_whole = (p[idx] for p in props)
        feats[f'{label}_left'] = v_left
        feats[f'{label}_right'] = v_right
        feats[f'{label}_whole'] = v_whole
        feats[f'{label}_diff'] = v_right - v_left
        feats[f'{label}_ratio'] = v_right / (v_left + 1e-6)

    cleaned = {}
    for key, val in feats.items():
        cleaned[key] = 0 if np.isnan(val) else float(val)
    return cleaned

# --- 6. 振幅特征 ---
@register_feature(func_id="6")
def amplitude_features(u: pd.DataFrame) -> dict:
    """
    Peak-to-peak range and inter-quartile range for left / right / whole, plus
    diff and ratio.

    Returns a dict of ``ptp_*`` and ``iqr_*`` floats; NaN becomes 0.
    """
    import scipy.stats
    values = u['value']
    left = values[u['period'] == 0]
    right = values[u['period'] == 1]

    feats = {}
    for label, spread_fn in (('ptp', np.ptp), ('iqr', scipy.stats.iqr)):
        v_left, v_right, v_whole = spread_fn(left), spread_fn(right), spread_fn(values)
        feats[f'{label}_left'] = v_left
        feats[f'{label}_right'] = v_right
        feats[f'{label}_whole'] = v_whole
        feats[f'{label}_diff'] = v_right - v_left
        feats[f'{label}_ratio'] = v_right / (v_left + 1e-6)

    cleaned = {}
    for key, val in feats.items():
        cleaned[key] = 0 if np.isnan(val) else float(val)
    return cleaned

# --- 7. 波动性的波动性特征 ---
@register_feature(func_id="7")
def volatility_of_volatility_features(u: pd.DataFrame) -> dict:
    """
    Mean of the rolling standard deviation, to capture changes in the
    "volatility of volatility".

    A rolling std with a window of 50 samples is computed inside period 0,
    inside period 1 and over the whole series; the mean of each rolling-std
    series is reported together with the right-left difference and the
    right/(left + 1e-6) ratio (five features in total). A segment shorter than
    the window contributes 0.0.
    """
    values = u['value']
    left = values[u['period'] == 0]
    right = values[u['period'] == 1]
    window = 50

    def get_rolling_std_mean(series, w):
        if len(series) >= w:
            stds = series.rolling(window=w).std().dropna()
            if not stds.empty:
                return stds.mean()
        return 0.0

    mean_left = get_rolling_std_mean(left, window)
    mean_right = get_rolling_std_mean(right, window)
    mean_whole = get_rolling_std_mean(values, window)

    prefix = f'rolling_std_w{window}_mean'
    feats = {
        f'{prefix}_left': mean_left,
        f'{prefix}_right': mean_right,
        f'{prefix}_whole': mean_whole,
        f'{prefix}_diff': mean_right - mean_left,
        f'{prefix}_ratio': mean_right / (mean_left + 1e-6),
    }

    cleaned = {}
    for key, val in feats.items():
        cleaned[key] = 0 if np.isnan(val) else float(val)
    return cleaned

# --- 8. 熵信息 ---
@register_feature(func_id="8")
def entropy_features(u: pd.DataFrame) -> dict:
    """
    Entropy and complexity measures for left / right / whole, plus diff and ratio.

    Covers histogram Shannon entropy, the antropy entropy and fractal measures,
    Hjorth mobility/complexity, Lempel-Ziv complexity of the median-binarised
    series and a histogram-based lag-1 conditional-entropy estimate. Each
    measure is best-effort: if it raises, its five features are set to 0.
    NaN values become 0 in the returned dict.
    """
    import scipy.stats
    import antropy
    left = u['value'][u['period'] == 0].to_numpy()
    right = u['value'][u['period'] == 1].to_numpy()
    whole = u['value'].to_numpy()
    feats = {}

    def _five(prefix, a, b, w):
        # left / right / whole / diff / ratio for one measure, in that key order.
        return {
            f'{prefix}_left': a,
            f'{prefix}_right': b,
            f'{prefix}_whole': w,
            f'{prefix}_diff': b - a,
            f'{prefix}_ratio': b / (a + 1e-6),
        }

    def _zeros(prefix):
        return {f'{prefix}_{seg}': 0 for seg in ('left', 'right', 'whole', 'diff', 'ratio')}

    def compute_entropy(x):
        hist, _ = np.histogram(x, bins='auto', density=True)
        return scipy.stats.entropy(hist[hist > 0])

    entropy_funcs = {
        'shannon_entropy': compute_entropy,
        'perm_entropy': lambda x: antropy.perm_entropy(x, normalize=True),
        'spectral_entropy': lambda x: antropy.spectral_entropy(x, sf=1.0, normalize=True),
        'svd_entropy': lambda x: antropy.svd_entropy(x, normalize=True),
        'approx_entropy': antropy.app_entropy,
        'sample_entropy': antropy.sample_entropy,
        'petrosian_fd': antropy.petrosian_fd,
        'katz_fd': antropy.katz_fd,
        'higuchi_fd': antropy.higuchi_fd,
        'detrended_fluctuation': antropy.detrended_fluctuation,
    }

    for name, func in entropy_funcs.items():
        try:
            feats.update(_five(name, func(left), func(right), func(whole)))
        except Exception:
            feats.update(_zeros(name))

    # Hjorth parameters: each call returns a (mobility, complexity) pair.
    try:
        (m1, c1), (m2, c2), (m_whole, c_whole) = (
            antropy.hjorth_params(seg) for seg in (left, right, whole)
        )
        feats.update({
            **_five('hjorth_mobility', m1, m2, m_whole),
            **_five('hjorth_complexity', c1, c2, c_whole),
        })
    except Exception:
        feats.update({**_zeros('hjorth_mobility'), **_zeros('hjorth_complexity')})

    def series_to_binary_str(x, method='median'):
        # '1' where the sample exceeds the median, '0' otherwise.
        if method != 'median':
            return None
        threshold = np.median(x)
        return ''.join('1' if val > threshold else '0' for val in x)

    bin_left = series_to_binary_str(left)
    bin_right = series_to_binary_str(right)
    bin_whole = series_to_binary_str(whole)

    try:
        lz_left = antropy.lziv_complexity(bin_left, normalize=True)
        lz_right = antropy.lziv_complexity(bin_right, normalize=True)
        lz_whole = antropy.lziv_complexity(bin_whole, normalize=True)
        feats.update(_five('lziv_complexity', lz_left, lz_right, lz_whole))
    except Exception:
        feats.update(_zeros('lziv_complexity'))

    def estimate_cond_entropy(x, lag=1):
        # Joint-minus-marginal estimate built from 10-bin density histograms.
        # NOTE(review): the sums use density values directly (no bin-width
        # weighting) - confirm this is the intended estimator.
        bins = 10
        centered = x - np.mean(x)
        x_lag, x_now = centered[:-lag], centered[lag:]
        joint_hist, _, _ = np.histogram2d(x_lag, x_now, bins=bins, density=True)
        joint_hist = joint_hist[joint_hist > 0]
        H_xy = -np.sum(joint_hist * np.log(joint_hist))
        marginal = np.histogram(x_lag, bins=bins, density=True)[0]
        H_x = -np.sum(marginal * np.log(marginal + 1e-12))
        return H_xy - H_x

    try:
        ce_left = estimate_cond_entropy(left)
        ce_right = estimate_cond_entropy(right)
        ce_whole = estimate_cond_entropy(whole)
        feats.update(_five('cond_entropy', ce_left, ce_right, ce_whole))
    except Exception:
        feats.update(_zeros('cond_entropy'))

    cleaned = {}
    for key, val in feats.items():
        cleaned[key] = 0 if np.isnan(val) else float(val)
    return cleaned

# --- 9. 分形 ---
@register_feature(func_id="9")
def fractal_dimension_features(u: pd.DataFrame) -> dict:
    """
    Fractal-dimension measures (Petrosian, Katz, Higuchi, detrended
    fluctuation) for left / right / whole, plus diff and ratio.

    A measure that raises has its five features set to 0; NaN becomes 0.
    """
    import antropy
    left = u['value'][u['period'] == 0].to_numpy()
    right = u['value'][u['period'] == 1].to_numpy()
    whole = u['value'].to_numpy()

    fractal_funcs = {
        'petrosian_fd': antropy.petrosian_fd,
        'katz_fd': antropy.katz_fd,
        'higuchi_fd': antropy.higuchi_fd,
        'detrended_fluctuation': antropy.detrended_fluctuation,
    }

    feats = {}
    for name, func in fractal_funcs.items():
        try:
            v_left, v_right, v_whole = func(left), func(right), func(whole)
            feats.update({
                f'{name}_left': v_left,
                f'{name}_right': v_right,
                f'{name}_whole': v_whole,
                f'{name}_diff': v_right - v_left,
                f'{name}_ratio': v_right / (v_left + 1e-6),
            })
        except Exception:
            feats.update({f'{name}_{seg}': 0 for seg in ('left', 'right', 'whole', 'diff', 'ratio')})

    cleaned = {}
    for key, val in feats.items():
        cleaned[key] = 0 if np.isnan(val) else float(val)
    return cleaned

# --- 10. tsfresh --- 
@register_feature(func_id="10")
def tsfresh_features(u: pd.DataFrame) -> dict:
    """
    Feature engineering based on tsfresh feature calculators.

    Every calculator/parameter pair in the ``funcs`` table is evaluated on
    period 0 (left), period 1 (right) and the whole series; right-left diff and
    right/(left + 1e-6) ratio are added for each.

    Calling conventions handled:
      * ``None`` in the table - ``func(x)``;
      * a non-dict parameter - ``func(x, param)``;
      * a dict parameter and a calculator that has a ``param`` argument
        ("combiner") - ``func(x, [param])``, which yields (key, value) pairs;
      * a dict parameter otherwise - ``func(x, **param)``.

    Returns:
        dict: feature name -> float; NaN becomes 0. When a combiner calculator
        raises, its features are omitted, because their key names are only
        known from a successful call.
    """
    from tsfresh.feature_extraction import feature_calculators as tsfresh_fe

    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()
    s_whole = u['value'].to_numpy()
    feats = {}

    funcs = {
        tsfresh_fe.ratio_value_number_to_time_series_length: None,
        tsfresh_fe.ratio_beyond_r_sigma: [6, 1.5],
        tsfresh_fe.quantile: [0.6, 0.4, 0.1],
        tsfresh_fe.percentage_of_reoccurring_values_to_all_values: None,
        tsfresh_fe.percentage_of_reoccurring_datapoints_to_all_datapoints: None,
        tsfresh_fe.last_location_of_maximum: None,
        tsfresh_fe.first_location_of_maximum: None,
        tsfresh_fe.partial_autocorrelation: [{"lag": 2}],
        tsfresh_fe.linear_trend: [{"attr": "slope"}, {"attr": "rvalue"}, {"attr": "intercept"}],
        tsfresh_fe.fft_coefficient: [{"coeff": 3, "attr": "imag"}, {"coeff": 2, "attr": "imag"}, {"coeff": 1, "attr": "imag"}],
        tsfresh_fe.change_quantiles: [
            {"f_agg": "var", "isabs": True,  "qh": 1.0, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 1.0, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.6},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.6, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 0.6, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.4, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 1.0, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 1.0, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.0},
            {"f_agg": "var", "isabs": False, "qh": 0.6, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 0.6, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.4, "ql": 0.2},
            {"f_agg": "mean","isabs": True,  "qh": 1.0, "ql": 0.4},
            {"f_agg": "mean","isabs": True,  "qh": 0.6, "ql": 0.4},
        ],
        tsfresh_fe.ar_coefficient: [{"coeff": 2, "k": 10}],
        tsfresh_fe.agg_linear_trend: [
            {"attr": "slope", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "slope", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "slope", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 50, "f_agg": "max"},
            {"attr": "rvalue", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 5,  "f_agg": "max"},
            {"attr": "rvalue", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 10, "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 50, "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 5,  "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 10, "f_agg": "max"},
        ]
    }

    segments = (s1, s2, s_whole)

    def param_to_str(param):
        # Dict parameters become "k1_v1_k2_v2"; anything else is str()-ed.
        if isinstance(param, dict):
            return '_'.join([f"{k}_{v}" for k, v in param.items()])
        else:
            return str(param)

    def calculate_stats_for_feature(func, param=None):
        # Evaluate one calculator/parameter pair on the three segments and
        # return its left/right/whole/diff/ratio features.
        results = {}
        base_name = func.__name__
        if param is not None:
            base_name += f"_{param_to_str(param)}"

        try:
            if isinstance(param, dict) and 'param' in inspect.signature(func).parameters:
                # Combiner calculator: takes a list of parameter dicts and
                # yields (key, value) pairs.
                v1_dict, v2_dict, v_whole_dict = (
                    {k: v for k, v in func(seg, [param])} for seg in segments
                )
                for key in v1_dict:
                    v1, v2, v_whole = v1_dict[key], v2_dict[key], v_whole_dict[key]
                    feat_name_base = f"{func.__name__}_{key}"
                    results[f'{feat_name_base}_left'] = v1
                    results[f'{feat_name_base}_right'] = v2
                    results[f'{feat_name_base}_whole'] = v_whole
                    results[f'{feat_name_base}_diff'] = v2 - v1
                    results[f'{feat_name_base}_ratio'] = v2 / (v1 + 1e-6)
                return results

            if param is None:
                v1, v2, v_whole = (func(seg) for seg in segments)
            elif isinstance(param, dict):
                v1, v2, v_whole = (func(seg, **param) for seg in segments)
            else:
                v1, v2, v_whole = (func(seg, param) for seg in segments)

            results[f'{base_name}_left'] = v1
            results[f'{base_name}_right'] = v2
            results[f'{base_name}_whole'] = v_whole
            results[f'{base_name}_diff'] = v2 - v1
            results[f'{base_name}_ratio'] = v2 / (v1 + 1e-6)

        except Exception:
            if inspect.isfunction(func) and 'param' in inspect.signature(func).parameters:
                # Combiner: the feature keys are only known from a successful
                # call, so nothing can be filled in here.
                pass
            else:
                results[f'{base_name}_left'] = np.nan
                results[f'{base_name}_right'] = np.nan
                results[f'{base_name}_whole'] = np.nan
                results[f'{base_name}_diff'] = np.nan
                results[f'{base_name}_ratio'] = np.nan

        return results

    for func, params in funcs.items():
        if params is None:
            feats.update(calculate_stats_for_feature(func))
        else:
            for param in params:
                feats.update(calculate_stats_for_feature(func, param))

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 11. 时间序列建模 ---
@register_feature(func_id="11")
def ar_model_features(u: pd.DataFrame) -> dict:
    """
    Derive features from fixed-order autoregressive (AR) models.

    Three feature groups are produced:
      1. Fit on period 0, forecast ``len(period 1)`` steps past the end of the
         training data and summarise ``period 1 - forecast``
         (``ar_residuals_s2_pred_*``).
      2. Fit on period 1, forecast ``len(period 0)`` steps past the end of the
         training data and summarise ``period 0 - forecast``
         (``ar_residuals_s1_pred_*``).
      3. Compare the models fitted on period 0, period 1 and the whole series:
         in-sample residual std, AIC, BIC and every coefficient, each emitted as
         ``_left`` / ``_right`` / ``_whole`` / ``_diff`` / ``_ratio``.

    Each segment is fitted exactly once and the fit is shared between the
    groups that need it.

    Args:
        u: Frame for a single series with a ``value`` column and a ``period``
            column whose values 0 / 1 select the two segments.

    Returns:
        dict mapping feature name to a number; NaN results (segment too short,
        failed fit) are reported as 0.
    """
    from statsmodels.tsa.ar_model import AutoReg

    lags = 5  # fixed order so that coefficients are comparable across segments
    stat_names = ('mean', 'std', 'skew', 'kurt')

    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()
    s_whole = u['value'].to_numpy()

    def _fit(series):
        """Fit AR(lags) on `series`; return None if it is too short or fitting fails."""
        if len(series) <= lags:
            return None
        try:
            return AutoReg(series, lags=lags).fit()
        except Exception:
            # Deliberately broad: numerical problems must not abort feature extraction.
            return None

    def _forecast_residual_stats(fit, train, target, prefix):
        """Summarise `target - forecast`; every statistic is 0 when it cannot be computed."""
        fallback = {f'{prefix}_{name}': 0 for name in stat_names}
        if fit is None or len(target) == 0:
            return fallback
        try:
            forecast = fit.predict(start=len(train), end=len(train) + len(target) - 1, dynamic=True)
            residuals = target - forecast
            return {
                f'{prefix}_mean': np.mean(residuals),
                f'{prefix}_std': np.std(residuals),
                f'{prefix}_skew': pd.Series(residuals).skew(),
                f'{prefix}_kurt': pd.Series(residuals).kurt(),
            }
        except Exception:
            # Same best-effort policy as fitting: fall back to zeros for the whole group.
            return fallback

    def _summary(fit):
        """Return (resid_std, params, aic, bic); entries stay NaN when unavailable."""
        resid_std, params = np.nan, np.full(lags + 1, np.nan)
        aic, bic = np.nan, np.nan
        if fit is not None:
            try:
                resid_std = np.std(fit.resid)
                params = fit.params
                aic = fit.aic
                bic = fit.bic
            except Exception:
                # Keep whatever was obtained before the failure; the rest stays NaN.
                pass
        return resid_std, params, aic, bic

    fit1, fit2, fit_whole = _fit(s1), _fit(s2), _fit(s_whole)

    feats = {}

    # --- Group 1: train on s1, forecast s2 ---
    feats.update(_forecast_residual_stats(fit1, s1, s2, 'ar_residuals_s2_pred'))
    # --- Group 2: train on s2, forecast s1 ---
    feats.update(_forecast_residual_stats(fit2, s2, s1, 'ar_residuals_s1_pred'))

    # --- Group 3: compare the independently fitted models ---
    s1_resid_std, s1_params, s1_aic, s1_bic = _summary(fit1)
    s2_resid_std, s2_params, s2_aic, s2_bic = _summary(fit2)
    swhole_resid_std, swhole_params, swhole_aic, swhole_bic = _summary(fit_whole)

    def _add_comparison(name, left, right, whole):
        """Store left/right/whole plus diff and ratio (0 when either side is NaN)."""
        comparable = not (np.isnan(left) or np.isnan(right))
        feats[f'{name}_left'] = left
        feats[f'{name}_right'] = right
        feats[f'{name}_whole'] = whole
        feats[f'{name}_diff'] = (right - left) if comparable else 0
        feats[f'{name}_ratio'] = (right / (left + 1e-6)) if comparable else 0

    _add_comparison('ar_resid_std', s1_resid_std, s2_resid_std, swhole_resid_std)
    _add_comparison('ar_aic', s1_aic, s2_aic, swhole_aic)
    _add_comparison('ar_bic', s1_bic, s2_bic, swhole_bic)

    # Coefficient-by-coefficient comparison; NaN placeholders are zeroed on return.
    for i in range(len(s1_params)):
        feats[f'param_{i}_left'] = s1_params[i]
        feats[f'param_{i}_right'] = s2_params[i]
        feats[f'param_{i}_whole'] = swhole_params[i]
        feats[f'param_{i}_diff'] = s2_params[i] - s1_params[i]
        feats[f'param_{i}_ratio'] = s2_params[i] / (s1_params[i] + 1e-6)

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 12. 分段损失 ---
class RPTFeatureExtractor:
    """Evaluate a set of `ruptures` cost functions on both sides of a known split point."""

    def __init__(self):
        import ruptures as rpt
        costs = rpt.costs
        # Cost name -> cost class; the trailing comments are the original author's labels.
        self.cost_classes = {
            'l1': costs.CostL1,               # median
            'l2': costs.CostL2,               # mean
            'clinear': costs.CostCLinear,     # linear covariance
            'rbf': costs.CostRbf,             # RBF kernel
            'normal': costs.CostNormal,       # covariance
            'ar': costs.CostAR,               # autoregressive
            'mahalanobis': costs.CostMl,      # Mahalanobis distance
            'rank': costs.CostRank,           # rank
            'cosine': costs.CostCosine,       # cosine distance
        }

    def calculate(self, cost, start, end):
        """Return ``cost.error(start, end)``, unwrapping one-element arrays/lists to a float."""
        raw = cost.error(start, end)
        if not isinstance(raw, (np.ndarray, list)):
            return raw
        if np.array(raw).size != 1:
            return raw
        return float(np.array(raw).squeeze())

    def extract(self, signal, boundary):
        """
        Compute every configured cost on the left segment, the right segment and
        the whole signal.

        Args:
            signal: 1-D array-like univariate time series.
            boundary: int split index; left is ``[0, boundary)``, right is
                ``[boundary, len(signal))``.

        Returns:
            dict of the form ``{cost_name: {'left', 'right', 'whole', 'diff', 'ratio'}}``.
            All five entries are None for a cost whose evaluation raised.
        """
        series = np.asarray(signal)
        length = len(series)
        segment_keys = ('left', 'right', 'whole', 'diff', 'ratio')
        costs_by_name = {}
        for cost_name, cost_cls in self.cost_classes.items():
            try:
                # The AR cost is the only one built with an explicit argument.
                cost = cost_cls(order=4) if cost_name == 'ar' else cost_cls()
                cost.fit(series)
                left = self.calculate(cost, 0, boundary)
                right = self.calculate(cost, boundary, length)
                whole = self.calculate(cost, 0, length)
                if left is None or right is None:
                    diff = None
                    ratio = None
                else:
                    diff = right - left
                    ratio = right / (left + 1e-6)
                entry = {'left': left, 'right': right, 'whole': whole, 'diff': diff, 'ratio': ratio}
            except Exception:
                # Any failure for this cost blanks all of its outputs.
                entry = dict.fromkeys(segment_keys)
            costs_by_name[cost_name] = entry
        return costs_by_name

@register_feature(func_id="12")
def rupture_cost_features(u: pd.DataFrame) -> dict:
    """
    Segment-cost features at the period boundary, computed with RPTFeatureExtractor.

    Args:
        u: Frame for a single series with ``value`` and ``period`` columns.

    Returns:
        dict ``{'rpt_cost_<cost>_<segment>': number}`` for every cost and every
        segment key reported by the extractor. Values that are NaN, or None
        because the cost could not be evaluated, are reported as 0.

    Raises:
        ValueError: from ``.item()`` when ``period`` does not change exactly once.
    """
    signal = u['value'].values.astype(np.float32)
    period = u['period'].values.astype(np.float32)
    # np.diff is non-zero at index i when period[i] != period[i + 1].
    # NOTE(review): this makes `boundary` the index of the last sample of the first
    # period, so that sample is evaluated as part of the right segment
    # [boundary, n). Confirm this matches how the models were trained before changing it.
    boundary = np.where(np.diff(period) != 0)[0].item()

    costs = RPTFeatureExtractor().extract(signal, boundary)

    feats = {}
    for cost_name, segments in costs.items():
        for seg, cost_value in segments.items():
            # The extractor reports None for a failed cost; np.isnan(None) raises
            # TypeError, so None is mapped to 0 explicitly, like NaN.
            if cost_value is None or np.isnan(cost_value):
                feats[f'rpt_cost_{cost_name}_{seg}'] = 0
            else:
                feats[f'rpt_cost_{cost_name}_{seg}'] = float(cost_value)

    return feats

# --- 时序变换函数注册表 ---
TRANSFORM_REGISTRY = {}

def register_transform(_func=None, *, output_mode_names=None):
    """
    Decorator that records a time-series transform in TRANSFORM_REGISTRY.

    Works both bare (``@register_transform``) and with arguments
    (``@register_transform(output_mode_names=[...])``). The decorated function
    is returned unchanged.

    Args:
        _func: The function being decorated when used without parentheses.
        output_mode_names: Names for the frames the transform returns, in
            order. Defaults to an empty list.

    Returns:
        The decorated function, or a decorator when called with keyword
        arguments only.
    """
    # Snapshot the names: a mutable default (or the caller's own list) must not
    # be shared between registry entries.
    mode_names = [] if output_mode_names is None else list(output_mode_names)

    def decorator_register(func):
        TRANSFORM_REGISTRY[func.__name__] = {
            "func": func,
            # Fresh list per registration, even if this decorator object is reused.
            "output_mode_names": list(mode_names),
        }
        return func

    if _func is None:
        # Used as @register_transform(output_mode_names=...)
        return decorator_register
    else:
        # Used as @register_transform
        return decorator_register(_func)

@register_transform(output_mode_names=['RAW'])
def no_transformation(X_df: pd.DataFrame) -> List[pd.DataFrame]:
    """
    Identity transform: expose the untouched input series as the single 'RAW' mode.

    Returns a one-element list holding the very same frame object (no copy).
    """
    return [X_df]

# @register_transform(output_mode_names=['MAde_trend', 'MAde_resid'])
def moving_average_decomposition(X_df: pd.DataFrame) -> List[pd.DataFrame]:
    """
    Split every series into a moving-average trend and a residual.

    The trend is a centred rolling mean (window 200, ``min_periods=1``). For
    series longer than half a window, the first and last half-window of the
    trend are overwritten with the value at the inner edge of that region, as
    before. Shorter series keep the plain rolling mean instead of raising
    IndexError.

    Args:
        X_df: Frame with a MultiIndex containing an ``id`` level and a
            ``value`` column; other columns are carried over unchanged.

    Returns:
        ``[trend_df, resid_df]``: two copies of the index-sorted input whose
        ``value`` column holds the trend and ``value - trend`` respectively.
    """
    window_size = 200
    half_window = window_size // 2

    def _edge_padded_trend(values: np.ndarray) -> np.ndarray:
        """Centred rolling mean with both edges flattened when the series is long enough."""
        trend = pd.Series(values).rolling(window=window_size, center=True, min_periods=1).mean()
        if len(trend) > half_window:
            # Order matters: the second assignment reads a value the first one may have set.
            trend.iloc[:half_window] = trend.iloc[half_window]
            trend.iloc[-half_window:] = trend.iloc[-half_window]
        return trend.to_numpy()

    X_df_sorted = X_df.sort_index()

    # After sorting, the rows of each id are contiguous, so per-id results can be
    # concatenated in group order and line up with the sorted frame row by row.
    values_by_id = X_df_sorted['value'].groupby(level='id', sort=False)
    trend_parts = [_edge_padded_trend(group.to_numpy()) for _, group in values_by_id]
    trend_values = np.concatenate(trend_parts) if trend_parts else np.empty(0, dtype=float)

    trend_df = X_df_sorted.copy()
    trend_df['value'] = trend_values

    resid_df = X_df_sorted.copy()
    resid_df['value'] = X_df_sorted['value'].to_numpy() - trend_values

    return [trend_df, resid_df]

In [6]:
def _apply_feature_func_sequential(func, X_df: pd.DataFrame) -> pd.DataFrame:
    """
    Run one feature function over every series id, one id at a time.

    Each call receives ``X_df.loc[id]`` and must return a dict of features.
    The result has one row per id, indexed by ``id``.
    """
    unique_ids = X_df.index.get_level_values("id").unique()
    progress = tqdm(unique_ids, desc=f"Running {func.__name__} (sequentially)")

    rows = []
    for series_id in progress:
        rows.append({'id': series_id, **func(X_df.loc[series_id])})

    return pd.DataFrame(rows).set_index('id')

def _apply_transform_func(func, X_df: pd.DataFrame) -> List[pd.DataFrame]:
    """执行变换函数"""
    return func(X_df)

def apply_transformation(X_df: pd.DataFrame, transform_funcs: List[str] = None) -> Dict[str, pd.DataFrame]:
    """
    Run registered time-series transforms and collect their output modes.

    Args:
        X_df: Input frame handed to every transform.
        transform_funcs: Names of the transforms to run; ``None`` runs every
            transform in TRANSFORM_REGISTRY. Unknown names are skipped with a
            warning.

    Returns:
        Dict[str, pd.DataFrame]: mode name -> frame. Mode names and outputs are
        paired positionally with ``zip``; a later transform overwrites an
        earlier one that used the same mode name.
    """
    requested = list(TRANSFORM_REGISTRY.keys()) if transform_funcs is None else transform_funcs

    # Keep only names that are actually registered, warning about the rest.
    runnable = []
    for name in requested:
        if name in TRANSFORM_REGISTRY:
            runnable.append(name)
            continue
        logger.warning(f"变换函数 {name} 未在注册表中找到，已跳过。")

    modes = {}
    for name in runnable:
        logger.info(f"--- 开始应用变换函数: {name} ---")
        started = time.time()

        entry = TRANSFORM_REGISTRY[name]
        transform = entry['func']
        mode_names = entry['output_mode_names']

        outputs = _apply_transform_func(transform, X_df)
        modes.update(zip(mode_names, outputs))

        elapsed = time.time() - started
        logger.info(f"'{name}' 变换完毕，耗时: {elapsed:.2f} 秒，生成模态: {mode_names}")

    return modes

def clean_feature_names(df: pd.DataFrame, prefix: str = "f") -> pd.DataFrame:
    """
    Normalise column names so they contain only word characters.

    Per column: every non-word character becomes ``_``; a name that starts with
    a digit gets ``<prefix>_`` prepended; runs of underscores collapse to one.
    The frame's columns are replaced in place and the same frame is returned.
    """
    def _sanitize(raw_name):
        name = re.sub(r'[^\w]', '_', raw_name)
        # A leading digit is treated as illegal (e.g. "123_feature").
        starts_with_digit = re.match(r'^\d', name) is not None
        if starts_with_digit:
            name = f"{prefix}_{name}"
        return re.sub(r'__+', '_', name)

    df.columns = [_sanitize(column) for column in df.columns]
    return df

In [7]:
def train(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    model_directory_path: str,
):
    """
    Training entry point; no model is fitted here.

    ``X_train`` and ``y_train`` are not used. A ``None`` placeholder is dumped
    to ``none.joblib`` inside ``model_directory_path``.
    """
    placeholder_path = os.path.join(model_directory_path, 'none.joblib')
    joblib.dump(None, placeholder_path)

In [8]:
def load_models(model_directory_path):
    """
    Load every ``*.pkl`` model file in a directory with joblib, for ensembling.

    Files that fail to load are logged and skipped, so the returned list may be
    shorter than the number of files found (or empty).

    Args:
        model_directory_path: Directory to search (non-recursive) for ``*.pkl``.

    Returns:
        list of loaded model objects, in sorted file-path order.
    """
    models = []
    # Path.glob yields entries in arbitrary, filesystem-dependent order; sorting
    # gives the ensemble a reproducible member order across runs and machines.
    model_files = sorted(Path(model_directory_path).glob('*.pkl'))

    if not model_files:
        logger.warning(f"Warning: No model files found under {model_directory_path}!")
        return models
    logger.info(f"Found a total of {len(model_files)} model files.")

    for model_path in model_files:
        try:
            logger.info(f"Loading model: {model_path}")
            # NOTE: joblib.load unpickles arbitrary objects and can execute code;
            # only point this at model directories you trust.
            models.append(joblib.load(model_path))
        except Exception as e:
            # Best effort: one unreadable file should not block the other models.
            logger.error(f"Error loading model {model_path}: {e}")

    return models

In [9]:
def infer(
    X_test: typing.Iterable[pd.DataFrame],
    model_directory_path: str,
):
    """
    Generator-based inference entry point.

    Protocol: the first ``yield`` (no value) signals readiness; afterwards one
    prediction array is yielded per dataset taken from ``X_test``. ``X_test``
    can only be iterated once, and each dataset must be predicted before the
    next one is requested.

    For every dataset: apply all registered transforms, run every
    non-experimental registered feature function on each resulting mode, keep
    the REMAIN_FEATURES columns, and average ``predict_proba(...)[:, 1]`` over
    all loaded models.

    Args:
        X_test: Iterable of frames whose index has an ``id`` level.
        model_directory_path: Directory holding the model files; also used as
            the log directory.

    Raises:
        RuntimeError: if no model could be loaded.
        KeyError: (from pandas) if a REMAIN_FEATURES column was not generated.
    """
    global logger, log_file_path
    if logger is None:  # avoid initialising the logger twice
        logger, log_file_path = get_logger('Inference', Path(model_directory_path))

    # Load models
    models = load_models(model_directory_path)
    if not models:
        # Averaging over zero models would only yield NaN; fail early and clearly.
        raise RuntimeError(
            f"No usable model could be loaded from {model_directory_path}; cannot run inference."
        )

    # Feature functions to run: everything registered except experimental ones.
    funcs_to_run = [
        f for f in FEATURE_REGISTRY.keys()
        if f not in EXPERIMENTAL_FEATURES
    ]
    trans_to_run = None  # None -> apply every registered transform
    logger.info(f"未指定特征函数，将运行所有 {len(funcs_to_run)} 个非实验性特征。")

    def ensemble_predict(models, X):
        """Mean positive-class probability across all models."""
        preds = [model.predict_proba(X)[:, 1] for model in models]
        return np.mean(preds, axis=0)

    yield  # Mark as ready

    # X_test can only be iterated once.
    # Before getting the next dataset, you must predict the current one.
    for X_df in X_test:
        logger.info("未找到基础特征文件，将创建全新的特征集。")
        feature_df = pd.DataFrame(index=X_df.index.get_level_values('id').unique())
        logger.info("=== 开始时序分解 ===")
        transformed_data = apply_transformation(X_df, trans_to_run)
        logger.info(f"分解完成，共生成 {len(transformed_data)} 个模态: {list(transformed_data.keys())}")

        for mode_name, mode_df in transformed_data.items():
            logger.info(f"=== 开始为模态 '{mode_name}' 生成特征 ===")
            for func_name in funcs_to_run:
                logger.info(f"--- 开始生成特征: {func_name} ---")
                start_time = time.time()

                feature_info = FEATURE_REGISTRY[func_name]
                func = feature_info['func']
                func_id = feature_info['func_id']
                # Parallel execution is deliberately disabled here, regardless of the
                # registry's 'parallelizable' flag, so every extractor runs sequentially.
                is_parallelizable = False

                if is_parallelizable:
                    new_features_df = _apply_feature_func_parallel(func, mode_df)
                else:
                    logger.info(f"函数 '{func_name}' 不可并行化，将顺序执行。")
                    new_features_df = _apply_feature_func_sequential(func, mode_df)
                new_features_df.columns = [f"{mode_name}_{func_id}_{col}" for col in new_features_df.columns]

                # Log timing and per-column null / zero ratios.
                duration = time.time() - start_time
                logger.info(f"'{func_name}' 生成完毕，耗时: {duration:.2f} 秒。")
                logger.info(f"  新生成特征列名: {new_features_df.columns.tolist()}")

                for col in new_features_df.columns:
                    null_ratio = new_features_df[col].isnull().sum() / len(new_features_df)
                    zero_ratio = (new_features_df[col] == 0).sum() / len(new_features_df)
                    logger.info(f"    - '{col}': 空值比例={null_ratio:.2%}, 零值比例={zero_ratio:.2%}")

                # Drop any older version of these columns, then merge the new ones in.
                feature_df = feature_df.drop(columns=new_features_df.columns, errors='ignore')
                feature_df = feature_df.merge(new_features_df, left_index=True, right_index=True, how='left')
                feature_df = clean_feature_names(feature_df)

        # Record which expected features are absent before pandas raises KeyError below,
        # so the log file shows the mismatch between generated and expected features.
        missing_features = [name for name in REMAIN_FEATURES if name not in feature_df.columns]
        if missing_features:
            logger.error(
                f"{len(missing_features)} feature(s) listed in REMAIN_FEATURES were not generated: {missing_features}"
            )

        feature_df = feature_df[REMAIN_FEATURES]
        # Previously referenced without ever being assigned (NameError).
        final_feature_count = len(feature_df.columns)
        logger.info("--- 生成后完整特征列表 ---")
        logger.info(f"{feature_df.columns.tolist()}")
        logger.info("-----------------------------")
        logger.info(f"生成/更新完成。总特征数: {final_feature_count}")

        prediction = ensemble_predict(models, feature_df)

        yield prediction  # Send the prediction for the current dataset

In [None]:
# Run the Crunch local test harness. Per the captured output below, it executes
# the `train` command and then the `infer` command against the downloaded data.
crunch.test(
    # Uncomment to skip the training step
    # force_first_train=False,

    # Uncomment to disable the determinism check
    # no_determinism_check=True,
)

[32m21:40:49[0m [33mno forbidden library found[0m
[32m21:40:49[0m [33m[0m
[32m21:40:52[0m started
[32m21:40:52[0m running local test
[32m21:40:52[0m [33minternet access isn't restricted, no check will be done[0m
[32m21:40:52[0m 
[32m21:40:55[0m starting unstructured loop...
[32m21:40:55[0m executing - command=train


data\X_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_train.parquet (204327238 bytes)
data\X_train.parquet: already exists, file length match
data\X_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_test.reduced.parquet (2380918 bytes)
data\X_test.reduced.parquet: already exists, file length match
data\y_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_train.parquet (61003 bytes)
data\y_train.parquet: already exists, file length match
data\y_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_test.reduced.parquet (2655 bytes)
data\y_test.reduced.parquet: already exists, file length match


[32m21:40:56[0m executing - command=infer


Found a total of 5 model files.
Loading model: resources\model_fold_1.pkl
Loading model: resources\model_fold_2.pkl
Loading model: resources\model_fold_3.pkl
Loading model: resources\model_fold_4.pkl
Loading model: resources\model_fold_5.pkl
未指定特征函数，将运行所有 12 个非实验性特征。
未找到基础特征文件，将创建全新的特征集。
=== 开始时序分解 ===
--- 开始应用变换函数: no_transformation ---
'no_transformation' 变换完毕，耗时: 0.00 秒，生成模态: ['RAW']
分解完成，共生成 1 个模态: ['RAW']
=== 开始为模态 'RAW' 生成特征 ===
--- 开始生成特征: distribution_stats_features ---
函数 'distribution_stats_features' 不可并行化，将顺序执行。


Running distribution_stats_features (sequentially): 100%|██████████| 1/1 [00:00<00:00,  4.36it/s]

'distribution_stats_features' 生成完毕，耗时: 0.23 秒。
  新生成特征列名: ['RAW_1_stats_mean_left', 'RAW_1_stats_mean_right', 'RAW_1_stats_mean_whole', 'RAW_1_stats_mean_diff', 'RAW_1_stats_mean_ratio', 'RAW_1_stats_median_left', 'RAW_1_stats_median_right', 'RAW_1_stats_median_whole', 'RAW_1_stats_median_diff', 'RAW_1_stats_median_ratio', 'RAW_1_stats_max_left', 'RAW_1_stats_max_right', 'RAW_1_stats_max_whole', 'RAW_1_stats_max_diff', 'RAW_1_stats_max_ratio', 'RAW_1_stats_min_left', 'RAW_1_stats_min_right', 'RAW_1_stats_min_whole', 'RAW_1_stats_min_diff', 'RAW_1_stats_min_ratio', 'RAW_1_stats_range_left', 'RAW_1_stats_range_right', 'RAW_1_stats_range_whole', 'RAW_1_stats_range_diff', 'RAW_1_stats_range_ratio', 'RAW_1_stats_std_left', 'RAW_1_stats_std_right', 'RAW_1_stats_std_whole', 'RAW_1_stats_std_diff', 'RAW_1_stats_std_ratio', 'RAW_1_stats_skew_left', 'RAW_1_stats_skew_right', 'RAW_1_stats_skew_whole', 'RAW_1_stats_skew_diff', 'RAW_1_stats_skew_ratio', 'RAW_1_stats_kurt_left', 'RAW_1_stats_kurt_ri


look-up table. The actual p-value is greater than the p-value returned.

  kpss = tsa.stattools.kpss(s, regression='c', nlags='auto')
look-up table. The actual p-value is greater than the p-value returned.

  kpss = tsa.stattools.kpss(s, regression='c', nlags='auto')
look-up table. The actual p-value is greater than the p-value returned.

  kpss = tsa.stattools.kpss(s, regression='c', nlags='auto')
Running test_stats_features (sequentially): 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]

'test_stats_features' 生成完毕，耗时: 1.18 秒。
  新生成特征列名: ['RAW_2_ks_stat', 'RAW_2_ks_pvalue', 'RAW_2_ttest_pvalue', 'RAW_2_ad_stat', 'RAW_2_ad_pvalue', 'RAW_2_mannwhitney_stat', 'RAW_2_mannwhitney_pvalue', 'RAW_2_wilcoxon_stat', 'RAW_2_wilcoxon_pvalue', 'RAW_2_levene_stat', 'RAW_2_levene_pvalue', 'RAW_2_bartlett_stat', 'RAW_2_bartlett_pvalue', 'RAW_2_shapiro_pvalue_left', 'RAW_2_shapiro_pvalue_right', 'RAW_2_shapiro_pvalue_whole', 'RAW_2_shapiro_pvalue_diff', 'RAW_2_shapiro_pvalue_ratio', 'RAW_2_jb_pvalue_left', 'RAW_2_jb_pvalue_right', 'RAW_2_jb_pvalue_whole', 'RAW_2_jb_pvalue_diff', 'RAW_2_jb_pvalue_ratio', 'RAW_2_kpss_pvalue_left', 'RAW_2_kpss_pvalue_right', 'RAW_2_kpss_pvalue_whole', 'RAW_2_kpss_pvalue_diff', 'RAW_2_kpss_pvalue_ratio', 'RAW_2_kpss_stat_left', 'RAW_2_kpss_stat_right', 'RAW_2_kpss_stat_whole', 'RAW_2_kpss_stat_diff', 'RAW_2_kpss_stat_ratio', 'RAW_2_adf_pvalue_left', 'RAW_2_adf_pvalue_right', 'RAW_2_adf_pvalue_whole', 'RAW_2_adf_pvalue_diff', 'RAW_2_adf_pvalue_ratio', 'RAW_2




    - 'RAW_2_adf_stat_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_2_adf_stat_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_2_adf_stat_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_2_adf_stat_ratio': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_2_adf_icbest_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_2_adf_icbest_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_2_adf_icbest_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_2_adf_icbest_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_2_adf_icbest_ratio': 空值比例=0.00%, 零值比例=0.00%
--- 开始生成特征: cumulative_features ---
函数 'cumulative_features' 不可并行化，将顺序执行。


Running cumulative_features (sequentially): 100%|██████████| 1/1 [00:00<00:00, 500.22it/s]

'cumulative_features' 生成完毕，耗时: 0.01 秒。
  新生成特征列名: ['RAW_3_sum_left', 'RAW_3_sum_right', 'RAW_3_sum_whole', 'RAW_3_sum_diff', 'RAW_3_sum_ratio', 'RAW_3_cumsum_max_left', 'RAW_3_cumsum_max_right', 'RAW_3_cumsum_max_whole', 'RAW_3_cumsum_max_diff', 'RAW_3_cumsum_max_ratio']
    - 'RAW_3_sum_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_sum_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_sum_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_sum_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_sum_ratio': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_cumsum_max_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_cumsum_max_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_cumsum_max_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_cumsum_max_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_3_cumsum_max_ratio': 空值比例=0.00%, 零值比例=0.00%
--- 开始生成特征: oscillation_features ---
函数 'oscillation_features' 不可并行化，将顺序执行。



Running oscillation_features (sequentially): 100%|██████████| 1/1 [00:00<00:00, 132.65it/s]

'oscillation_features' 生成完毕，耗时: 0.01 秒。
  新生成特征列名: ['RAW_4_zero_cross_left', 'RAW_4_zero_cross_right', 'RAW_4_zero_cross_whole', 'RAW_4_zero_cross_diff', 'RAW_4_zero_cross_ratio', 'RAW_4_autocorr_lag1_left', 'RAW_4_autocorr_lag1_right', 'RAW_4_autocorr_lag1_whole', 'RAW_4_autocorr_lag1_diff', 'RAW_4_autocorr_lag1_ratio', 'RAW_4_diff_var_left', 'RAW_4_diff_var_right', 'RAW_4_diff_var_whole', 'RAW_4_diff_var_diff', 'RAW_4_diff_var_ratio']
    - 'RAW_4_zero_cross_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_zero_cross_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_zero_cross_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_zero_cross_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_zero_cross_ratio': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_autocorr_lag1_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_autocorr_lag1_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_autocorr_lag1_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_autocorr_lag1_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_4_autocorr_lag1_ratio': 空值比例=0.00%, 零值比例=0.00


Running cyclic_features (sequentially): 100%|██████████| 1/1 [00:00<00:00, 132.97it/s]

'cyclic_features' 生成完毕，耗时: 0.01 秒。
  新生成特征列名: ['RAW_5_dominant_freq_left', 'RAW_5_dominant_freq_right', 'RAW_5_dominant_freq_whole', 'RAW_5_dominant_freq_diff', 'RAW_5_dominant_freq_ratio', 'RAW_5_max_power_left', 'RAW_5_max_power_right', 'RAW_5_max_power_whole', 'RAW_5_max_power_diff', 'RAW_5_max_power_ratio']
    - 'RAW_5_dominant_freq_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_dominant_freq_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_dominant_freq_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_dominant_freq_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_dominant_freq_ratio': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_max_power_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_max_power_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_max_power_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_max_power_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_5_max_power_ratio': 空值比例=0.00%, 零值比例=0.00%
--- 开始生成特征: amplitude_features ---
函数 'amplitude_features' 不可并行化，将顺序执行。



Running amplitude_features (sequentially): 100%|██████████| 1/1 [00:00<00:00, 332.04it/s]


'amplitude_features' 生成完毕，耗时: 0.01 秒。
  新生成特征列名: ['RAW_6_ptp_left', 'RAW_6_ptp_right', 'RAW_6_ptp_whole', 'RAW_6_ptp_diff', 'RAW_6_ptp_ratio', 'RAW_6_iqr_left', 'RAW_6_iqr_right', 'RAW_6_iqr_whole', 'RAW_6_iqr_diff', 'RAW_6_iqr_ratio']
    - 'RAW_6_ptp_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_ptp_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_ptp_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_ptp_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_ptp_ratio': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_iqr_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_iqr_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_iqr_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_iqr_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_6_iqr_ratio': 空值比例=0.00%, 零值比例=0.00%
--- 开始生成特征: volatility_of_volatility_features ---
函数 'volatility_of_volatility_features' 不可并行化，将顺序执行。


Running volatility_of_volatility_features (sequentially): 100%|██████████| 1/1 [00:00<00:00, 333.41it/s]

'volatility_of_volatility_features' 生成完毕，耗时: 0.01 秒。
  新生成特征列名: ['RAW_7_rolling_std_w50_mean_left', 'RAW_7_rolling_std_w50_mean_right', 'RAW_7_rolling_std_w50_mean_whole', 'RAW_7_rolling_std_w50_mean_diff', 'RAW_7_rolling_std_w50_mean_ratio']
    - 'RAW_7_rolling_std_w50_mean_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_7_rolling_std_w50_mean_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_7_rolling_std_w50_mean_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_7_rolling_std_w50_mean_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_7_rolling_std_w50_mean_ratio': 空值比例=0.00%, 零值比例=0.00%
--- 开始生成特征: entropy_features ---
函数 'entropy_features' 不可并行化，将顺序执行。



Running entropy_features (sequentially): 100%|██████████| 1/1 [00:08<00:00,  8.35s/it]

'entropy_features' 生成完毕，耗时: 8.35 秒。
  新生成特征列名: ['RAW_8_shannon_entropy_left', 'RAW_8_shannon_entropy_right', 'RAW_8_shannon_entropy_whole', 'RAW_8_shannon_entropy_diff', 'RAW_8_shannon_entropy_ratio', 'RAW_8_perm_entropy_left', 'RAW_8_perm_entropy_right', 'RAW_8_perm_entropy_whole', 'RAW_8_perm_entropy_diff', 'RAW_8_perm_entropy_ratio', 'RAW_8_spectral_entropy_left', 'RAW_8_spectral_entropy_right', 'RAW_8_spectral_entropy_whole', 'RAW_8_spectral_entropy_diff', 'RAW_8_spectral_entropy_ratio', 'RAW_8_svd_entropy_left', 'RAW_8_svd_entropy_right', 'RAW_8_svd_entropy_whole', 'RAW_8_svd_entropy_diff', 'RAW_8_svd_entropy_ratio', 'RAW_8_approx_entropy_left', 'RAW_8_approx_entropy_right', 'RAW_8_approx_entropy_whole', 'RAW_8_approx_entropy_diff', 'RAW_8_approx_entropy_ratio', 'RAW_8_sample_entropy_left', 'RAW_8_sample_entropy_right', 'RAW_8_sample_entropy_whole', 'RAW_8_sample_entropy_diff', 'RAW_8_sample_entropy_ratio', 'RAW_8_petrosian_fd_left', 'RAW_8_petrosian_fd_right', 'RAW_8_petrosian_fd


Running fractal_dimension_features (sequentially): 100%|██████████| 1/1 [00:00<00:00, 446.16it/s]

'fractal_dimension_features' 生成完毕，耗时: 0.01 秒。
  新生成特征列名: ['RAW_9_petrosian_fd_left', 'RAW_9_petrosian_fd_right', 'RAW_9_petrosian_fd_whole', 'RAW_9_petrosian_fd_diff', 'RAW_9_petrosian_fd_ratio', 'RAW_9_katz_fd_left', 'RAW_9_katz_fd_right', 'RAW_9_katz_fd_whole', 'RAW_9_katz_fd_diff', 'RAW_9_katz_fd_ratio', 'RAW_9_higuchi_fd_left', 'RAW_9_higuchi_fd_right', 'RAW_9_higuchi_fd_whole', 'RAW_9_higuchi_fd_diff', 'RAW_9_higuchi_fd_ratio', 'RAW_9_detrended_fluctuation_left', 'RAW_9_detrended_fluctuation_right', 'RAW_9_detrended_fluctuation_whole', 'RAW_9_detrended_fluctuation_diff', 'RAW_9_detrended_fluctuation_ratio']
    - 'RAW_9_petrosian_fd_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_9_petrosian_fd_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_9_petrosian_fd_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_9_petrosian_fd_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_9_petrosian_fd_ratio': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_9_katz_fd_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_9_katz_fd_right': 空值比例=0.00%, 零值比例=


Running tsfresh_features (sequentially): 100%|██████████| 1/1 [00:00<00:00,  2.06it/s]

'tsfresh_features' 生成完毕，耗时: 0.49 秒。
  新生成特征列名: ['RAW_10_ratio_value_number_to_time_series_length_left', 'RAW_10_ratio_value_number_to_time_series_length_right', 'RAW_10_ratio_value_number_to_time_series_length_whole', 'RAW_10_ratio_value_number_to_time_series_length_diff', 'RAW_10_ratio_value_number_to_time_series_length_ratio', 'RAW_10_ratio_beyond_r_sigma_6_left', 'RAW_10_ratio_beyond_r_sigma_6_right', 'RAW_10_ratio_beyond_r_sigma_6_whole', 'RAW_10_ratio_beyond_r_sigma_6_diff', 'RAW_10_ratio_beyond_r_sigma_6_ratio', 'RAW_10_ratio_beyond_r_sigma_1.5_left', 'RAW_10_ratio_beyond_r_sigma_1.5_right', 'RAW_10_ratio_beyond_r_sigma_1.5_whole', 'RAW_10_ratio_beyond_r_sigma_1.5_diff', 'RAW_10_ratio_beyond_r_sigma_1.5_ratio', 'RAW_10_quantile_0.6_left', 'RAW_10_quantile_0.6_right', 'RAW_10_quantile_0.6_whole', 'RAW_10_quantile_0.6_diff', 'RAW_10_quantile_0.6_ratio', 'RAW_10_quantile_0.4_left', 'RAW_10_quantile_0.4_right', 'RAW_10_quantile_0.4_whole', 'RAW_10_quantile_0.4_diff', 'RAW_10_quantile




    - 'RAW_10_change_quantiles_f_agg_var_isabs_True_qh_0.4_ql_0.2_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_True_qh_0.4_ql_0.2_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_True_qh_0.4_ql_0.2_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_True_qh_0.4_ql_0.2_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_True_qh_0.4_ql_0.2_ratio': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_False_qh_1.0_ql_0.4_left': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_False_qh_1.0_ql_0.4_right': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_False_qh_1.0_ql_0.4_whole': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_False_qh_1.0_ql_0.4_diff': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantiles_f_agg_var_isabs_False_qh_1.0_ql_0.4_ratio': 空值比例=0.00%, 零值比例=0.00%
    - 'RAW_10_change_quantile

Running ar_model_features (sequentially): 100%|██████████| 1/1 [00:00<00:00, 32.19it/s]

'ar_model_features' 生成完毕，耗时: 0.04 秒。
  新生成特征列名: ['RAW_11_ar_residuals_s2_pred_mean', 'RAW_11_ar_residuals_s2_pred_std', 'RAW_11_ar_residuals_s2_pred_skew', 'RAW_11_ar_residuals_s2_pred_kurt', 'RAW_11_ar_residuals_s1_pred_mean', 'RAW_11_ar_residuals_s1_pred_std', 'RAW_11_ar_residuals_s1_pred_skew', 'RAW_11_ar_residuals_s1_pred_kurt', 'RAW_11_ar_resid_std_left', 'RAW_11_ar_resid_std_right', 'RAW_11_ar_resid_std_whole', 'RAW_11_ar_resid_std_diff', 'RAW_11_ar_resid_std_ratio', 'RAW_11_ar_aic_left', 'RAW_11_ar_aic_right', 'RAW_11_ar_aic_whole', 'RAW_11_ar_aic_diff', 'RAW_11_ar_aic_ratio', 'RAW_11_ar_bic_left', 'RAW_11_ar_bic_right', 'RAW_11_ar_bic_whole', 'RAW_11_ar_bic_diff', 'RAW_11_ar_bic_ratio', 'RAW_11_param_0_left', 'RAW_11_param_0_right', 'RAW_11_param_0_whole', 'RAW_11_param_0_diff', 'RAW_11_param_0_ratio', 'RAW_11_param_1_left', 'RAW_11_param_1_right', 'RAW_11_param_1_whole', 'RAW_11_param_1_diff', 'RAW_11_param_1_ratio', 'RAW_11_param_2_left', 'RAW_11_param_2_right', 'RAW_11_param


Running rupture_cost_features (sequentially): 100%|██████████| 1/1 [00:00<00:00,  2.46it/s]

'rupture_cost_features' 生成完毕，耗时: 0.41 秒。
  新生成特征列名: ['RAW_12_rpt_cost_l1_left', 'RAW_12_rpt_cost_l1_right', 'RAW_12_rpt_cost_l1_whole', 'RAW_12_rpt_cost_l1_diff', 'RAW_12_rpt_cost_l1_ratio', 'RAW_12_rpt_cost_l2_left', 'RAW_12_rpt_cost_l2_right', 'RAW_12_rpt_cost_l2_whole', 'RAW_12_rpt_cost_l2_diff', 'RAW_12_rpt_cost_l2_ratio', 'RAW_12_rpt_cost_clinear_left', 'RAW_12_rpt_cost_clinear_right', 'RAW_12_rpt_cost_clinear_whole', 'RAW_12_rpt_cost_clinear_diff', 'RAW_12_rpt_cost_clinear_ratio', 'RAW_12_rpt_cost_rbf_left', 'RAW_12_rpt_cost_rbf_right', 'RAW_12_rpt_cost_rbf_whole', 'RAW_12_rpt_cost_rbf_diff', 'RAW_12_rpt_cost_rbf_ratio', 'RAW_12_rpt_cost_normal_left', 'RAW_12_rpt_cost_normal_right', 'RAW_12_rpt_cost_normal_whole', 'RAW_12_rpt_cost_normal_diff', 'RAW_12_rpt_cost_normal_ratio', 'RAW_12_rpt_cost_ar_left', 'RAW_12_rpt_cost_ar_right', 'RAW_12_rpt_cost_ar_whole', 'RAW_12_rpt_cost_ar_diff', 'RAW_12_rpt_cost_ar_ratio', 'RAW_12_rpt_cost_mahalanobis_left', 'RAW_12_rpt_cost_mahalanobis_righ


Traceback (most recent call last):
  File "https://github.com/crunchdao/competitions/raw/refs/heads/master/competitions/structural-break/scoring/runner.py", line 31, in run
  File "d:\anaconda3\envs\adia\Lib\site-packages\crunch\runner\local.py", line 559, in execute
    result = utils.smart_call(
             ^^^^^^^^^^^^^^^^^
  File "d:\anaconda3\envs\adia\Lib\site-packages\crunch\utils.py", line 275, in smart_call
    return function(**arguments)
           ^^^^^^^^^^^^^^^^^^^^^
  File "https://github.com/crunchdao/competitions/raw/refs/heads/master/competitions/structural-break/scoring/runner.py", line 127, in infer
  File "d:\anaconda3\envs\adia\Lib\site-packages\crunch\container.py", line 187, in collect
    y = next(iterator, sentinel)
        ^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\25105\AppData\Local\Temp\ipykernel_21876\4291079202.py", line 70, in infer
    feature_df = feature_df[REMAIN_FEATURES]
                 ~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "d:\anaconda3\envs\adia\

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Read the stored predictions back from the local data directory
# (presumably written by the local test run earlier in the notebook — verify
# the file is fresh if that run failed) and show the frame as the cell output.
prediction = pd.read_parquet(path="data/prediction.parquet")
prediction

In [None]:
# `sklearn.metrics` is imported in this cell because the notebook's opening
# import cell does not bring it in; unless some other cell imports it, the
# scoring call below would raise NameError on a fresh kernel. Importing it
# again is harmless when it is already loaded.
import sklearn.metrics

# Load the ground-truth labels: the "structural_breakpoint" column of the
# reduced test-target file.
target = pd.read_parquet("data/y_test.reduced.parquet")["structural_breakpoint"]

# Score the predictions against the targets with ROC AUC; the resulting value
# is displayed as the cell output.
# NOTE(review): roc_auc_score pairs rows by position, not by index label —
# confirm `prediction` rows are in the same order as `target`.
sklearn.metrics.roc_auc_score(
    y_true=target,
    y_score=prediction,
)