In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.tsa.api as tsa
from statsmodels.tsa.ar_model import AutoReg
import antropy
import sklearn
from tsfresh.feature_extraction import feature_calculators as tsfresh_fe
import ruptures as rpt

import lightgbm as lgb

import os
import re
import sys
import json
import time
import logging
import inspect
import typing
import joblib
from itertools import combinations
from pathlib import Path
from tqdm.auto import tqdm
from datetime import datetime
from joblib import Parallel, delayed
from typing import List, Dict, Tuple, Optional
import warnings
from statsmodels.tools.sm_exceptions import InterpolationWarning

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import crunch

# Load the Crunch Toolings
crunch = crunch.load_notebook()

loaded inline runner with module: <module '__main__'>

cli version: 6.6.1
available ram: 15.73 gb
available cpu: 16 core
----


In [3]:
# @crunch/keep:on
warnings.filterwarnings("ignore", category=UserWarning)
warnings.simplefilter("ignore", InterpolationWarning)
warnings.simplefilter("ignore", FutureWarning)

In [4]:
# @crunch/keep:on
def get_logger(name: str, log_dir: Path, verbose: bool = True):
    """
    获取一个配置好的 logger 实例，它会生成带时间戳的详细日志。
    """
    # 确保日志目录存在
    log_dir.mkdir(exist_ok=True, parents=True)
    
    # 1. 创建带时间戳的详细日志文件名
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    detail_log_file = log_dir / f'{name.lower()}_{timestamp}.log'

    # 2. 为 logger 设置一个唯一的名称（基于时间戳），避免冲突
    logger = logging.getLogger(f"{name}-{timestamp}")
    logger.setLevel(logging.INFO)

    # 防止将日志消息传播到根 logger
    logger.propagate = False

    # 如果已经有处理器，则不重复添加
    if logger.hasHandlers():
        logger.handlers.clear()

    # 3. 创建详细日志的文件处理器
    detail_handler = logging.FileHandler(detail_log_file, mode='a', encoding='utf-8')
    detail_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
    detail_handler.setFormatter(detail_formatter)
    logger.addHandler(detail_handler)
    
    # 4. 创建控制台处理器
    # 控制台 - INFO级别 (受verbose控制)
    if verbose:
        info_handler = logging.StreamHandler(sys.stdout)
        info_handler.setLevel(logging.INFO)
        info_handler.addFilter(lambda record: record.levelno == logging.INFO)
        info_formatter = logging.Formatter('%(message)s')
        info_handler.setFormatter(info_formatter)
        logger.addHandler(info_handler)

    # 控制台 - WARNING及以上 (始终输出)
    warn_handler = logging.StreamHandler(sys.stdout)
    warn_handler.setLevel(logging.WARNING)
    warn_formatter = logging.Formatter('%(levelname)s: %(message)s')
    warn_handler.setFormatter(warn_formatter)
    logger.addHandler(warn_handler)

    return logger, detail_log_file # 返回 logger 和日志文件路径 

logger = None
log_file_path = None

In [5]:
# @crunch/keep:on
# --- Exclude Features ---
EXPERIMENTAL_FEATURES = [
] 

# --- Top Features ---
TOP_FEATURES = [
    'RAW_1_stats_cv_whole',
    'RAW_8_percentage_of_reoccurring_datapoints_to_all_datapoints_contribution_left',
    'RAW_8_percentage_of_reoccurring_values_to_all_values_contribution_left',
    'RAW_3_cumsum_linear_trend_pvalue_whole',
    'RAW_8_index_mass_quantile_q_0_8_ratio_to_whole_left',
    'RAW_7_sample_entropy_left',
    'RAW_1_stats_std_whole',
    'RAW_8_quantile_0_4_whole',
    'RAW_8_index_mass_quantile_q_0_1_right',
    'RAW_8_percentage_of_reoccurring_values_to_all_values_ratio_to_whole_left',
]

# --- Remain Features ---
REMAIN_FEATURES = [
    'mul_RAW_1_stats_cv_whole_RAW_1_stats_std_whole',
    'add_RAW_8_index_mass_quantile_q_0_8_ratio_to_whole_left_RAW_8_index_mass_quantile_q_0_1_right',
    'sub_RAW_8_percentage_of_reoccurring_values_to_all_values_contribution_left_RAW_7_sample_entropy_left',
    'sub_RAW_8_percentage_of_reoccurring_datapoints_to_all_datapoints_contribution_left_RAW_7_sample_entropy_left',
    'div_RAW_7_sample_entropy_left_RAW_3_cumsum_linear_trend_pvalue_whole',
    'RAW_2_bartlett_stat',
    'RAW_2_bartlett_pvalue',
    'div_RAW_1_stats_std_whole_RAW_1_stats_cv_whole',
    'sqmul_RAW_8_index_mass_quantile_q_0_1_right_RAW_8_index_mass_quantile_q_0_8_ratio_to_whole_left',
    'RAW_8_ratio_value_number_to_time_series_length_whole',
    'RAW_8_energy_ratio_by_chunks_num_segments_10_segment_focus_9_left',
    'mul_RAW_1_stats_cv_whole_RAW_8_change_quantiles_f_agg_var_isabs_False_qh_0_8_ql_0_0_whole',
    'mul_RAW_1_stats_cv_whole_RAW_2_ad_stat',
    'RAW_2_ks_stat',
    'sqmul_RAW_1_stats_cv_whole_RAW_1_stats_std_whole',
    'div_RAW_8_percentage_of_reoccurring_datapoints_to_all_datapoints_contribution_left_RAW_8_percentage_of_reoccurring_values_to_all_values_ratio_to_whole_left',
    'RAW_3_cumsum_max_ratio_to_whole_left',
    'RAW_1_stats_median_ratio',
    'RAW_2_adf_icbest_left',
    'RAW_8_quantile_0_4_ratio_to_whole_left',
    'RAW_8_fft_coefficient_attr_imag_coeff_1_ratio_to_whole_left',
    'sqmul_RAW_1_stats_std_whole_RAW_1_stats_cv_whole',
    'div_RAW_1_stats_cv_whole_RAW_8_percentage_of_reoccurring_datapoints_to_all_datapoints_contribution_left',
    'RAW_4_diff_var_contribution_left',
    'RAW_1_stats_kurt_whole',
    'div_RAW_8_quantile_0_4_whole_RAW_1_stats_std_whole',
    'mul_RAW_1_stats_cv_whole_RAW_1_stats_min_whole',
    'RAW_8_agg_linear_trend_attr_rvalue_chunk_len_50_f_agg_max_ratio_to_whole_left',
    'RAW_2_ad_stat',
    'RAW_8_benford_correlation_whole',
    'RAW_7_katz_fd_whole',
    'RAW_2_ad_pvalue',
    'sub_RAW_8_index_mass_quantile_q_0_8_ratio_to_whole_left_RAW_7_sample_entropy_left',
    'add_RAW_8_percentage_of_reoccurring_datapoints_to_all_datapoints_contribution_left_RAW_1_stats_std_whole',
    'RAW_7_perm_entropy_left',
    'RAW_8_first_location_of_maximum_left',
    'div_RAW_1_stats_std_whole_RAW_8_quantile_0_4_whole',
    'RAW_2_adf_stat_left',
    'mul_RAW_1_stats_cv_whole_RAW_8_agg_linear_trend_attr_slope_chunk_len_10_f_agg_mean_contribution_left',
    'div_RAW_8_percentage_of_reoccurring_values_to_all_values_contribution_left_RAW_8_percentage_of_reoccurring_values_to_all_values_ratio_to_whole_left',
    'RAW_2_ks_pvalue',
    'RAW_10_rpt_cost_cosine_whole',
    'RAW_8_ratio_beyond_r_sigma_1_left',
    'RAW_7_higuchi_fd_ratio_to_whole_right',
    'RAW_8_index_mass_quantile_q_0_8_left',
    'RAW_8_change_quantiles_f_agg_var_isabs_True_qh_1_0_ql_0_4_contribution_left',
    'RAW_8_linear_trend_attr_rvalue_ratio',
    'div_RAW_1_stats_cv_whole_RAW_8_quantile_0_4_whole',
    'RAW_8_index_mass_quantile_q_0_6_ratio_to_whole_left',
    'RAW_8_ratio_beyond_r_sigma_3_diff',
    'RAW_5_dominant_freq_right',
    'RAW_9_ar_param_5_diff',
    'RAW_8_quantile_0_1_ratio_to_whole_right',
    'RAW_8_ratio_beyond_r_sigma_0_5_diff',
    'RAW_8_quantile_0_6_right',
    'RAW_4_diff_var_contribution_right',
    'RAW_8_agg_linear_trend_attr_intercept_chunk_len_10_f_agg_max_ratio_to_whole_right',
    'mul_RAW_1_stats_cv_whole_RAW_8_ar_coefficient_coeff_2_k_10_left',
    'RAW_9_ar_param_3_right',
    'RAW_8_fft_coefficient_attr_imag_coeff_1_left',
    'RAW_8_change_quantiles_f_agg_var_isabs_False_qh_1_0_ql_0_2_ratio_to_whole_right',
    'RAW_8_benford_correlation_ratio_to_whole_left',
    'RAW_7_svd_entropy_whole',
    'RAW_8_ratio_beyond_r_sigma_1_ratio_to_whole_left',
    'div_RAW_1_stats_std_whole_RAW_3_cumsum_linear_trend_pvalue_whole',
    'RAW_3_cumsum_detrend_volatility_normalized_whole',
    'RAW_4_autocorr_lag1_diff',
    'div_RAW_8_percentage_of_reoccurring_datapoints_to_all_datapoints_contribution_left_RAW_8_percentage_of_reoccurring_values_to_all_values_contribution_left',
    'RAW_8_ratio_beyond_r_sigma_1_5_diff',
    'RAW_8_index_mass_quantile_q_0_6_right',
    'mul_RAW_1_stats_cv_whole_RAW_8_fft_coefficient_attr_imag_coeff_1_ratio_to_whole_right',
    'RAW_1_stats_median_right',
    'mul_RAW_1_stats_cv_whole_RAW_3_cumsum_max_ratio_to_whole_left',
    'RAW_9_ar_param_3_ratio_to_whole_left',
    'RAW_8_ratio_beyond_r_sigma_3_left',
    'RAW_8_agg_linear_trend_attr_rvalue_chunk_len_50_f_agg_mean_diff',
    'mul_RAW_1_stats_cv_whole_RAW_2_kpss_stat_whole',
    'RAW_8_fft_coefficient_attr_imag_coeff_1_ratio_to_whole_right',
    'RAW_2_ttest_pvalue',
    'RAW_8_change_quantiles_f_agg_var_isabs_False_qh_0_8_ql_0_0_right',
    'RAW_8_friedrich_coefficients_coeff_3_m_3_r_30_ratio',
    'RAW_4_autocorr_lag1_ratio',
    'RAW_1_stats_median_whole',
    'mul_RAW_1_stats_cv_whole_RAW_1_stats_min_diff',
    'RAW_8_sum_of_reoccurring_data_points_ratio_to_whole_left',
    'RAW_1_stats_kurt_left',
    'RAW_8_ratio_value_number_to_time_series_length_ratio_to_whole_left',
    'sqmul_RAW_1_stats_std_whole_RAW_7_sample_entropy_left',
    'mul_RAW_1_stats_cv_whole_RAW_8_fft_coefficient_attr_imag_coeff_3_left',
    'RAW_2_kpss_stat_whole',
    'mul_RAW_7_sample_entropy_left_RAW_8_percentage_of_reoccurring_values_to_all_values_ratio_to_whole_left',
    'RAW_3_cumsum_detrend_volatility_normalized_left',
    'RAW_8_quantile_0_4_right',
    'RAW_7_approx_entropy_left',
    'div_RAW_7_sample_entropy_left_RAW_8_index_mass_quantile_q_0_8_ratio_to_whole_left',
    'add_RAW_8_percentage_of_reoccurring_values_to_all_values_contribution_left_RAW_3_cumsum_linear_trend_pvalue_whole',
    'RAW_8_change_quantiles_f_agg_var_isabs_True_qh_0_6_ql_0_4_whole',
    'div_RAW_8_quantile_0_4_whole_RAW_3_cumsum_linear_trend_pvalue_whole',
    'sqmul_RAW_8_quantile_0_4_whole_RAW_7_sample_entropy_left',
    'add_RAW_8_percentage_of_reoccurring_datapoints_to_all_datapoints_contribution_left_RAW_8_quantile_0_4_whole',
    'RAW_3_cumsum_detrend_volatility_left',
    'mul_RAW_1_stats_cv_whole_RAW_8_agg_linear_trend_attr_rvalue_chunk_len_50_f_agg_mean_diff',
    'RAW_8_energy_ratio_by_chunks_num_segments_10_segment_focus_9_whole',
    'mul_RAW_1_stats_cv_whole_RAW_8_agg_linear_trend_attr_rvalue_chunk_len_50_f_agg_max_ratio_to_whole_left',
    'RAW_7_sample_entropy_ratio_to_whole_right',
    'mul_RAW_8_index_mass_quantile_q_0_8_ratio_to_whole_left_RAW_8_index_mass_quantile_q_0_1_right',
]

In [6]:
# @crunch/keep:on
# --- 时序变换函数注册表 ---
TRANSFORM_REGISTRY = {}

def register_transform(_func=None, *, output_mode_names=[]):
    """一个用于注册时序变换函数的装饰器。"""
    def decorator_register(func):
        TRANSFORM_REGISTRY[func.__name__] = {
            "func": func, 
            "output_mode_names": output_mode_names
        }
        return func

    if _func is None:
        # Used as @register_transform(output_mode_names=...)
        return decorator_register
    else:
        # Used as @register_transform
        return decorator_register(_func)

@register_transform(output_mode_names=['RAW'])
def no_transformation(X_df: pd.DataFrame) -> List[pd.DataFrame]:
    """
    原始时序
    """
    result_dfs = []
    result_dfs.append(X_df)

    return result_dfs

In [7]:
# @crunch/keep:on
# --- 特征函数注册表 ---
FEATURE_REGISTRY = {}

def register_feature(_func=None, *, parallelizable=True, func_id=""):
    """一个用于注册特征函数的装饰器，可以标记特征是否可并行化。"""
    def decorator_register(func):
        FEATURE_REGISTRY[func.__name__] = {
            "func": func, 
            "parallelizable": parallelizable,
            "func_id": func_id
        }
        return func

    if _func is None:
        # Used as @register_feature(parallelizable=...)
        return decorator_register
    else:
        # Used as @register_feature
        return decorator_register(_func)

def _add_diff_ratio_feats(feats: dict, name: str, left, right):
    """
    一个辅助函数，用于向特征字典中添加差异和比例特征。

    Args:
        feats (dict): 要更新的特征字典。
        name (str): 特征的基础名称 (例如, 'stats_mean')。
        left (float): 左侧分段的特征值。
        right (float): 右侧分段的特征值。
    """
    # check nan/None 
    if np.isnan(left) or np.isnan(right) or left is None or right is None:
        feats[f'{name}_diff'] = 0.0
        feats[f'{name}_ratio'] = 0.0
        return
    # 做差
    feats[f'{name}_diff'] = right - left
    # 做比
    feats[f'{name}_ratio'] = right / (left + 1e-6)


def _add_contribution_ratio_feats(feats: dict, name: str, left, right, whole):
    """
    一个辅助函数，用于向特征字典中添加贡献度和与整体的比例特征。

    Args:
        feats (dict): 要更新的特征字典。
        name (str): 特征的基础名称 (例如, 'stats_mean')。
        left (float): 左侧分段的特征值。
        right (float): 右侧分段的特征值。
        whole (float): 整个序列的特征值。
    """
    # check nan/None 
    if np.isnan(left) or np.isnan(right) or np.isnan(whole) or left is None or right is None or whole is None :
        feats[f'{name}_contribution_left'] = 0.0
        feats[f'{name}_contribution_right'] = 0.0
        feats[f'{name}_ratio_to_whole_left'] = 0.0
        feats[f'{name}_ratio_to_whole_right'] = 0.0
        return
    # 特征贡献度
    feats[f'{name}_contribution_left'] = left / (left + right + 1e-6)
    feats[f'{name}_contribution_right'] = right / (left + right + 1e-6)
    # 与整体特征的比例
    feats[f'{name}_ratio_to_whole_left'] = left / (whole + 1e-6)
    feats[f'{name}_ratio_to_whole_right'] = right / (whole + 1e-6)

# --- 1. 分布统计特征 ---
def safe_cv(s):
    s = pd.Series(s)
    m = s.mean()
    std = s.std()
    return std / m if abs(m) > 1e-6 else 0.0

def rolling_std_mean(s, window=50):
    s = pd.Series(s)
    if len(s) < window:
        return 0.0
    return s.rolling(window=window).std().dropna().mean()

def slope_theil_sen(s):
    s = pd.Series(s)
    if len(s) < 2:
        return 0.0
    try:
        slope, intercept, _, _ = scipy.stats.theilslopes(s.values, np.arange(len(s)))
        return slope
    except Exception:
        return 0.0

class STATSFeatureExtractor:
    def __init__(self):
        # 所有可用的func类及其名称
        self.func_classes = {
            # 'mean': np.mean,
            'median': np.median,
            # 'max': np.max,
            'min': np.min,
            # 'range': lambda x: np.max(x) - np.min(x),
            'std': np.std,
            # 'skew': scipy.stats.skew,
            'kurt': scipy.stats.kurtosis,
            'cv': safe_cv,
            # 'mean_of_rolling_std': rolling_std_mean,
            # 'theil_sen_slope': slope_theil_sen
        }
    
    def fit(self, signal):
        self.signal = np.asarray(signal)
        self.n = len(signal)

    def calculate(self, func, start, end):
        result = func(self.signal[start:end])
        if isinstance(result, float) or isinstance(result, int):
            return result
        else:
            return result.item()

    def extract(self, signal, boundary):
        """
        输入：
            signal: 1D numpy array，单变量时间序列
            boundary: int，分割点
        输出：
            result: dict，格式为 {func_name: {'left': value, 'right': value}}
        """
        n = self.n
        result = {}
        for name, func in self.func_classes.items():
            try:
                left = self.calculate(func, 0, boundary)
                right = self.calculate(func, boundary, n)
                whole = self.calculate(func, 0, n)
                # diff = right - left
                # ratio = right / (left + 1e-6)
            except Exception:
                left = None
                right = None
                whole = None
                # diff = None
                # ratio = None
            # Move to _add_diff_ratio_feats, 'diff': diff, 'ratio': ratio
            result[name] = {'left': left, 'right': right, 'whole': whole}   
        return result

@register_feature(func_id="1")
def distribution_stats_features(u: pd.DataFrame) -> dict:
    """统计量的分段值、Diff值、Ratio值"""
    value = u['value'].values.astype(np.float32)
    period = u['period'].values.astype(np.float32)
    boundary = np.where(np.diff(period) != 0)[0].item()
    feats = {}

    extractor = STATSFeatureExtractor()
    extractor.fit(value)
    features = extractor.extract(value, boundary)

    feats = {}
    for k, v in features.items():
        for seg, value in v.items():
            feats[f'stats_{k}_{seg}'] = value
        _add_diff_ratio_feats(feats, f'stats_{k}', v['left'], v['right'])
        # _add_contribution_ratio_feats(feats, f'stats_{k}', v['left'], v['right'], v['whole'])

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}
    
# --- 2. 假设检验统计量特征 ---
@register_feature(func_id="2")
def test_stats_features(u: pd.DataFrame) -> dict:
    s1 = u['value'][u['period'] == 0]
    s2 = u['value'][u['period'] == 1]
    s_whole = u['value']
    feats = {}

    """假设检验统计量"""
    # KS检验
    ks_stat, ks_pvalue = scipy.stats.ks_2samp(s1, s2)
    feats['ks_stat'] = ks_stat
    feats['ks_pvalue'] = -ks_pvalue

    # T检验
    ttest_stat, ttest_pvalue = scipy.stats.ttest_ind(s1, s2, equal_var=False)
    feats['ttest_pvalue'] = -ttest_pvalue if not np.isnan(ttest_pvalue) else 1

    # AD检验
    ad_stat, _, ad_pvalue = scipy.stats.anderson_ksamp([s1.to_numpy(), s2.to_numpy()])
    feats['ad_stat'] = ad_stat
    feats['ad_pvalue'] = -ad_pvalue

    # # Mann-Whitney U检验 (非参数，不假设分布)
    # mw_stat, mw_pvalue = scipy.stats.mannwhitneyu(s1, s2, alternative='two-sided')
    # feats['mannwhitney_stat'] = mw_stat if not np.isnan(mw_stat) else 0
    # feats['mannwhitney_pvalue'] = -mw_pvalue if not np.isnan(mw_pvalue) else 1
    
    # # Wilcoxon秩和检验
    # w_stat, w_pvalue = scipy.stats.ranksums(s1, s2)
    # feats['wilcoxon_stat'] = w_stat if not np.isnan(w_stat) else 0
    # feats['wilcoxon_pvalue'] = -w_pvalue if not np.isnan(w_pvalue) else 1

    # # Levene检验
    # levene_stat, levene_pvalue = scipy.stats.levene(s1, s2)
    # feats['levene_stat'] = levene_stat if not np.isnan(levene_stat) else 0
    # feats['levene_pvalue'] = -levene_pvalue if not np.isnan(levene_pvalue) else 1
    
    # Bartlett检验
    bartlett_stat, bartlett_pvalue = scipy.stats.bartlett(s1, s2)
    feats['bartlett_stat'] = bartlett_stat if not np.isnan(bartlett_stat) else 0
    feats['bartlett_pvalue'] = -bartlett_pvalue if not np.isnan(bartlett_pvalue) else 1
    
    # """分段假设检验的分段值、Diff值、Ratio值"""
    # # Shapiro-Wilk检验
    # sw1_stat, sw1_pvalue, sw2_stat, sw2_pvalue, sw_whole_stat, sw_whole_pvalue = (np.nan,)*6
    # try:
    #     sw1_stat, sw1_pvalue = scipy.stats.shapiro(s1)
    #     sw2_stat, sw2_pvalue = scipy.stats.shapiro(s2)
    #     sw_whole_stat, sw_whole_pvalue = scipy.stats.shapiro(s_whole)
    # except:
    #     pass
    # feats['shapiro_pvalue_left'] = sw1_pvalue
    # feats['shapiro_pvalue_right'] = sw2_pvalue
    # feats['shapiro_pvalue_whole'] = sw_whole_pvalue
    # _add_diff_ratio_feats(feats, 'shapiro_pvalue', sw1_pvalue, sw2_pvalue)
    # _add_contribution_ratio_feats(feats, 'shapiro_pvalue', sw1_pvalue, sw2_pvalue, sw_whole_pvalue)

    # # Jarque-Bera检验差异
    # jb1_stat, jb1_pvalue, jb2_stat, jb2_pvalue, jb_whole_stat, jb_whole_pvalue = (np.nan,)*6
    # try:
    #     jb1_stat, jb1_pvalue = scipy.stats.jarque_bera(s1)
    #     jb2_stat, jb2_pvalue = scipy.stats.jarque_bera(s2)
    #     jb_whole_stat, jb_whole_pvalue = scipy.stats.jarque_bera(s_whole)
    # except:
    #     pass
    # feats['jb_pvalue_left'] = jb1_pvalue
    # feats['jb_pvalue_right'] = jb2_pvalue
    # feats['jb_pvalue_whole'] = jb_whole_pvalue
    # _add_diff_ratio_feats(feats, 'jb_pvalue', jb1_pvalue, jb2_pvalue)
    # _add_contribution_ratio_feats(feats, 'jb_pvalue', jb1_pvalue, jb2_pvalue, jb_whole_pvalue)

    # KPSS检验
    def extract_kpss_features(s):
        if len(s) <= 12:
            return {'p': 0.1, 'stat': 0.0, 'lag': 0, 'crit_5pct': 0.0, 'reject_5pct': 0}
        kpss = tsa.stattools.kpss(s, regression='c', nlags='auto')
        stat, p, lag, crit = kpss
        crit_5pct = crit['5%']
        return {
            'p': p,
            'stat': stat,
            'lag': lag,
            'crit_5pct': crit_5pct,
            'reject_5pct': int(stat > crit_5pct)  # KPSS原假设是"平稳"，所以 > 临界值 拒绝平稳
        }
    try:
        k1 = extract_kpss_features(s1)
        k2 = extract_kpss_features(s2)
        k_whole = extract_kpss_features(s_whole)

        # feats['kpss_pvalue_left'] = k1['p']
        # feats['kpss_pvalue_right'] = k2['p']
        # feats['kpss_pvalue_whole'] = k_whole['p']
        # _add_diff_ratio_feats(feats, 'kpss_pvalue', k1['p'], k2['p'])
        # _add_contribution_ratio_feats(feats, 'kpss_pvalue', k1['p'], k2['p'], k_whole['p'])

        feats['kpss_stat_left'] = k1['stat']
        feats['kpss_stat_right'] = k2['stat']
        feats['kpss_stat_whole'] = k_whole['stat']
        # _add_diff_ratio_feats(feats, 'kpss_stat', k1['stat'], k2['stat'])
        # _add_contribution_ratio_feats(feats, 'kpss_stat', k1['stat'], k2['stat'], k_whole['stat'])
    except:
        feats.update({
            'kpss_pvalue_left': 1, 'kpss_pvalue_right': 1, 'kpss_pvalue_whole': 1, 'kpss_pvalue_diff': 0, 'kpss_pvalue_ratio': 0,
            'kpss_stat_left': 0, 'kpss_stat_right': 0, 'kpss_stat_whole': 0, 'kpss_stat_diff': 0, 'kpss_stat_ratio': 0
        })

    # 平稳性检验 (ADF)
    def extract_adf_features(s):
        if len(s) <= 12:
            return {'p': 1.0, 'stat': 0.0, 'lag': 0, 'ic': 0.0, 'crit_5pct': 0.0, 'reject_5pct': 0}
        adf = tsa.stattools.adfuller(s, autolag='AIC')
        stat, p, lag, _, crit, ic = adf
        crit_5pct = crit['5%']
        return {
            'p': p,
            'stat': stat,
            'lag': lag,
            'ic': ic,
            'crit_5pct': crit_5pct,
            'reject_5pct': int(stat < crit_5pct)
        }
    try:
        f1 = extract_adf_features(s1)
        f2 = extract_adf_features(s2)
        f_whole = extract_adf_features(s_whole)

        # feats['adf_pvalue_left'] = f1['p']
        # feats['adf_pvalue_right'] = f2['p']
        # feats['adf_pvalue_whole'] = f_whole['p']
        # _add_diff_ratio_feats(feats, 'adf_pvalue', f1['p'], f2['p'])
        # _add_contribution_ratio_feats(feats, 'adf_pvalue', f1['p'], f2['p'], f_whole['p'])

        feats['adf_stat_left'] = f1['stat']
        feats['adf_stat_right'] = f2['stat']
        feats['adf_stat_whole'] = f_whole['stat']
        # _add_diff_ratio_feats(feats, 'adf_stat', f1['stat'], f2['stat'])
        # _add_contribution_ratio_feats(feats, 'adf_stat', f1['stat'], f2['stat'], f_whole['stat'])

        feats['adf_icbest_left'] = f1['ic']
        feats['adf_icbest_right'] = f2['ic']
        feats['adf_icbest_whole'] = f_whole['ic']
        # _add_diff_ratio_feats(feats, 'adf_icbest', f1['ic'], f2['ic'])
        # _add_contribution_ratio_feats(feats, 'adf_icbest', f1['ic'], f2['ic'], f_whole['ic'])
    except:
        feats.update({
            'adf_pvalue_left': 1, 'adf_pvalue_right': 1, 'adf_pvalue_whole': 1, 'adf_pvalue_diff': 0, 'adf_pvalue_ratio': 0,
            'adf_stat_left': 0, 'adf_stat_right': 0, 'adf_stat_whole': 0, 'adf_stat_diff': 0, 'adf_stat_ratio': 0,
            'adf_icbest_left': 0, 'adf_icbest_right': 0, 'adf_icbest_whole': 0, 'adf_icbest_diff': 0, 'adf_icbest_ratio': 0
        })

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 3. 累积和特征 ---
@register_feature(func_id="3")
def cumulative_features(u: pd.DataFrame) -> dict:
    s1 = u['value'][u['period'] == 0]
    s2 = u['value'][u['period'] == 1]
    s_whole = u['value']
    feats = {}

    def analyze_cumsum_curve(series, seg):
        """分析累积和曲线的各种特征"""
        if len(series) < 3:
            return {}
        
        cumsum_curve = series.cumsum()
        curve_feats = {}
        
        # 线性趋势
        x = np.arange(len(cumsum_curve))
        slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, cumsum_curve)
        # curve_feats[f'cumsum_linear_trend_slope_{seg}'] = slope
        # curve_feats[f'cumsum_linear_trend_r2_{seg}'] = r_value ** 2
        curve_feats[f'cumsum_linear_trend_pvalue_{seg}'] = p_value

        # # 波动率
        # curve_feats[f'cumsum_std_{seg}'] = np.std(cumsum_curve)
        # curve_feats[f'cumsum_cv_{seg}'] = safe_cv(cumsum_curve)
    
        # 趋势背离
        linear_trend = slope * x + intercept
        detrended = cumsum_curve - linear_trend
        curve_feats[f'cumsum_detrend_volatility_{seg}'] = np.std(detrended)
        curve_feats[f'cumsum_detrend_volatility_normalized_{seg}'] = np.std(detrended) / (np.abs(np.mean(cumsum_curve)) + 1e-6)
        curve_feats[f'cumsum_detrend_max_deviation_{seg}'] = np.max(np.abs(detrended))
        
        # 极值特征
        # curve_feats[f'cumsum_min_{seg}'] = np.min(cumsum_curve)
        curve_feats[f'cumsum_max_{seg}'] = np.max(cumsum_curve)
        
        return curve_feats
    
    feats.update(analyze_cumsum_curve(s1, 'left'))
    feats.update(analyze_cumsum_curve(s2, 'right'))
    feats.update(analyze_cumsum_curve(s_whole, 'whole'))
    
    # _add_diff_ratio_feats(feats, 'cumsum_linear_trend_slope', feats.get('cumsum_linear_trend_slope_left', 0), feats.get('cumsum_linear_trend_slope_right', 0))
    # _add_diff_ratio_feats(feats, 'cumsum_std', feats.get('cumsum_std_left', 0), feats.get('cumsum_std_right', 0))
    # _add_diff_ratio_feats(feats, 'cumsum_cv', feats.get('cumsum_cv_left', 0), feats.get('cumsum_cv_right', 0))
    # _add_contribution_ratio_feats(feats, 'cumsum_min', feats.get('cumsum_min_left', 0), feats.get('cumsum_min_right', 0), feats.get('cumsum_min_whole', 0))
    _add_contribution_ratio_feats(feats, 'cumsum_max', feats.get('cumsum_max_left', 0), feats.get('cumsum_max_right', 0), feats.get('cumsum_max_whole', 0))

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 4. 振荡特征 ---
@register_feature(func_id="4")
def oscillation_features(u: pd.DataFrame) -> dict:
    s1 = u['value'][u['period'] == 0].reset_index(drop=True)
    s2 = u['value'][u['period'] == 1].reset_index(drop=True)
    s_whole = u['value'].reset_index(drop=True)
    feats = {}

    def count_zero_crossings(series: pd.Series):
        if len(series) < 2: return 0
        centered_series = series - series.mean()
        if centered_series.eq(0).all(): return 0
        return np.sum(np.diff(np.sign(centered_series)) != 0)

    # zc1, zc2, zc_whole = count_zero_crossings(s1), count_zero_crossings(s2), count_zero_crossings(s_whole)
    # feats['zero_cross_left'] = zc1
    # feats['zero_cross_right'] = zc2
    # feats['zero_cross_whole'] = zc_whole
    # _add_diff_ratio_feats(feats, 'zero_cross', zc1, zc2)
    # _add_contribution_ratio_feats(feats, 'zero_cross', zc1, zc2, zc_whole)
    
    def autocorr_lag1(s):
        if len(s) < 2: return 0.0
        ac = s.autocorr(lag=1)
        return ac if not np.isnan(ac) else 0.0
        
    ac1, ac2, ac_whole = autocorr_lag1(s1), autocorr_lag1(s2), autocorr_lag1(s_whole)
    feats['autocorr_lag1_left'] = ac1
    feats['autocorr_lag1_right'] = ac2
    feats['autocorr_lag1_whole'] = ac_whole
    _add_diff_ratio_feats(feats, 'autocorr_lag1', ac1, ac2)
    # _add_contribution_ratio_feats(feats, 'autocorr_lag1', ac1, ac2, ac_whole)

    var1, var2, var_whole = s1.diff().var(), s2.diff().var(), s_whole.diff().var()
    feats['diff_var_left'] = var1
    feats['diff_var_right'] = var2
    feats['diff_var_whole'] = var_whole
    # _add_diff_ratio_feats(feats, 'diff_var', var1, var2)
    _add_contribution_ratio_feats(feats, 'diff_var', var1, var2, var_whole)
    
    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 5. 频域特征 ---
@register_feature(func_id="5")
def cyclic_features(u: pd.DataFrame) -> dict:
    s1 = u['value'][u['period'] == 0]
    s2 = u['value'][u['period'] == 1]
    s_whole = u['value']
    feats = {}

    def get_fft_props(series):
        if len(series) < 2: return 0.0, 0.0
        
        N = len(series)
        yf = np.fft.fft(series.values)
        power = np.abs(yf[1:N//2])**2
        xf = np.fft.fftfreq(N, 1)[1:N//2]
        
        if len(power) == 0: return 0.0, 0.0
            
        dominant_freq = xf[np.argmax(power)]
        max_power = np.max(power)
        return dominant_freq, max_power

    # freq1, power1 = get_fft_props(s1)
    freq2, power2 = get_fft_props(s2)
    # freq_whole, power_whole = get_fft_props(s_whole)
    
    # feats['dominant_freq_left'] = freq1
    feats['dominant_freq_right'] = freq2
    # feats['dominant_freq_whole'] = freq_whole
    # _add_diff_ratio_feats(feats, 'dominant_freq', freq1, freq2)
    # _add_contribution_ratio_feats(feats, 'dominant_freq', freq1, freq2, freq_whole)

    # feats['max_power_left'] = power1
    # feats['max_power_right'] = power2
    # feats['max_power_whole'] = power_whole
    # _add_diff_ratio_feats(feats, 'max_power', power1, power2)
    # _add_contribution_ratio_feats(feats, 'max_power', power1, power2, power_whole)
    
    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 6. 振幅特征 ---
@register_feature(func_id="6")
def amplitude_features(u: pd.DataFrame) -> dict:
    s1 = u['value'][u['period'] == 0]
    s2 = u['value'][u['period'] == 1]
    s_whole = u['value']
    feats = {}
    
    # ptp1, ptp2, ptp_whole = np.ptp(s1), np.ptp(s2), np.ptp(s_whole)
    # iqr1, iqr2, iqr_whole = scipy.stats.iqr(s1), scipy.stats.iqr(s2), scipy.stats.iqr(s_whole)

    # feats['ptp_left'] = ptp1
    # feats['ptp_right'] = ptp2
    # feats['ptp_whole'] = ptp_whole
    # _add_diff_ratio_feats(feats, 'ptp', ptp1, ptp2)
    # _add_contribution_ratio_feats(feats, 'ptp', ptp1, ptp2, ptp_whole)

    # feats['iqr_left'] = iqr1
    # feats['iqr_right'] = iqr2
    # feats['iqr_whole'] = iqr_whole
    # _add_diff_ratio_feats(feats, 'iqr', iqr1, iqr2)
    # _add_contribution_ratio_feats(feats, 'iqr', iqr1, iqr2, iqr_whole)
    
    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 7. 熵信息 ---
@register_feature(func_id="7")
def entropy_features(u: pd.DataFrame) -> dict:
    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()
    s_whole = u['value'].to_numpy()
    feats = {}

    def compute_entropy(x):
        hist, _ = np.histogram(x, bins='auto', density=True)
        hist = hist[hist > 0]
        return scipy.stats.entropy(hist)
    
    entropy_funcs = {
        # 'shannon_entropy': compute_entropy,
        'perm_entropy': lambda x: antropy.perm_entropy(x, normalize=True),
        # 'spectral_entropy': lambda x: antropy.spectral_entropy(x, sf=1.0, normalize=True),
        'svd_entropy': lambda x: antropy.svd_entropy(x, normalize=True),
        'approx_entropy': antropy.app_entropy,
        'sample_entropy': antropy.sample_entropy,
        # 'petrosian_fd': antropy.petrosian_fd,
        'katz_fd': antropy.katz_fd,
        'higuchi_fd': antropy.higuchi_fd,
        # 'detrended_fluctuation': antropy.detrended_fluctuation,
    }

    for name, func in entropy_funcs.items():
        try:
            v1, v2, v_whole = func(s1), func(s2), func(s_whole)
            feats[f'{name}_left'] = v1
            feats[f'{name}_right'] = v2
            feats[f'{name}_whole'] = v_whole
            _add_diff_ratio_feats(feats, name, v1, v2)
            _add_contribution_ratio_feats(feats, name, v1, v2, v_whole)
        except Exception:
            feats.update({f'{name}_left': 0, f'{name}_right': 0, f'{name}_whole': 0, f'{name}_diff': 0, f'{name}_ratio': 0})

    # try:
    #     m1, c1 = antropy.hjorth_params(s1)
    #     m2, c2 = antropy.hjorth_params(s2)
    #     m_whole, c_whole = antropy.hjorth_params(s_whole)
    #     feats.update({
    #         'hjorth_mobility_left': m1, 'hjorth_mobility_right': m2, 'hjorth_mobility_whole': m_whole,
    #         'hjorth_complexity_left': c1, 'hjorth_complexity_right': c2, 'hjorth_complexity_whole': c_whole,
    #     })
    #     _add_diff_ratio_feats(feats, 'hjorth_mobility', m1, m2)
    #     _add_contribution_ratio_feats(feats, 'hjorth_mobility', m1, m2, m_whole)
    #     _add_diff_ratio_feats(feats, 'hjorth_complexity', c1, c2)
    #     _add_contribution_ratio_feats(feats, 'hjorth_complexity', c1, c2, c_whole)
    # except Exception:
    #     feats.update({'hjorth_mobility_left':0, 'hjorth_mobility_right':0, 'hjorth_mobility_whole':0, 'hjorth_mobility_diff':0, 'hjorth_mobility_ratio':0,
    #                  'hjorth_complexity_left':0, 'hjorth_complexity_right':0, 'hjorth_complexity_whole':0, 'hjorth_complexity_diff':0, 'hjorth_complexity_ratio':0})


    def series_to_binary_str(x, method='median'):
        if method == 'median':
            threshold = np.median(x)
            return ''.join(['1' if val > threshold else '0' for val in x])
        return None
    
    # try:
    #     bin_str1 = series_to_binary_str(s1)
    #     bin_str2 = series_to_binary_str(s2)
    #     bin_str_whole = series_to_binary_str(s_whole)
    #     lz1, lz2, lz_whole = antropy.lziv_complexity(bin_str1, normalize=True), antropy.lziv_complexity(bin_str2, normalize=True), antropy.lziv_complexity(bin_str_whole, normalize=True)
    #     feats.update({
    #         'lziv_complexity_left': lz1, 'lziv_complexity_right': lz2, 'lziv_complexity_whole': lz_whole,
    #     })
    #     _add_diff_ratio_feats(feats, 'lziv_complexity', lz1, lz2)
    #     _add_contribution_ratio_feats(feats, 'lziv_complexity', lz1, lz2, lz_whole)
    # except Exception:
    #     feats.update({'lziv_complexity_left':0, 'lziv_complexity_right':0, 'lziv_complexity_whole':0, 'lziv_complexity_diff':0, 'lziv_complexity_ratio':0})


    def estimate_cond_entropy(x, lag=1):
        x = x - np.mean(x)
        x_lag = x[:-lag]
        x_now = x[lag:]
        bins = 10
        joint_hist, _, _ = np.histogram2d(x_lag, x_now, bins=bins, density=True)
        joint_hist = joint_hist[joint_hist > 0]
        H_xy = -np.sum(joint_hist * np.log(joint_hist))
        H_x = -np.sum(np.histogram(x_lag, bins=bins, density=True)[0] * \
                      np.log(np.histogram(x_lag, bins=bins, density=True)[0] + 1e-12))
        return H_xy - H_x
    # try:
    #     ce1, ce2, ce_whole = estimate_cond_entropy(s1), estimate_cond_entropy(s2), estimate_cond_entropy(s_whole)
    #     feats.update({
    #         'cond_entropy_left': ce1, 'cond_entropy_right': ce2, 'cond_entropy_whole': ce_whole,
    #     })
    #     _add_diff_ratio_feats(feats, 'cond_entropy', ce1, ce2)
    #     _add_contribution_ratio_feats(feats, 'cond_entropy', ce1, ce2, ce_whole)
    # except Exception:
    #     feats.update({'cond_entropy_left':0, 'cond_entropy_right':0, 'cond_entropy_whole':0, 'cond_entropy_diff':0, 'cond_entropy_ratio':0})
    
    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 8. tsfresh --- 
@register_feature(func_id="8")
def tsfresh_features(u: pd.DataFrame) -> dict:
    """基于tsfresh的特征工程"""
    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()
    s_whole = u['value'].to_numpy()
    feats = {}

    funcs = {
        tsfresh_fe.ratio_value_number_to_time_series_length: None,
        tsfresh_fe.sum_of_reoccurring_data_points: None,
        tsfresh_fe.percentage_of_reoccurring_values_to_all_values: None,
        tsfresh_fe.percentage_of_reoccurring_datapoints_to_all_datapoints: None,
        tsfresh_fe.last_location_of_maximum: None,
        tsfresh_fe.first_location_of_maximum: None,
        tsfresh_fe.has_duplicate: None,
        tsfresh_fe.benford_correlation: None,
        tsfresh_fe.ratio_beyond_r_sigma: [6, 3, 1.5, 1, 0.5],
        tsfresh_fe.quantile: [0.6, 0.4, 0.1],
        tsfresh_fe.count_above: [0],
        tsfresh_fe.number_peaks: [25, 50],
        tsfresh_fe.partial_autocorrelation: [{"lag": 2}, {"lag": 6}],
        tsfresh_fe.index_mass_quantile: [{"q": 0.1}, {"q": 0.6}, {"q": 0.7}, {"q": 0.8}],
        tsfresh_fe.ar_coefficient: [{"coeff": 0, "k": 10}, {"coeff": 2, "k": 10}, {"coeff": 8, "k": 10}],
        tsfresh_fe.linear_trend: [{"attr": "slope"}, {"attr": "rvalue"}, {"attr": "pvalue"}, {"attr": "intercept"}],
        tsfresh_fe.fft_coefficient: [{"coeff": 3, "attr": "imag"}, {"coeff": 2, "attr": "imag"}, {"coeff": 1, "attr": "imag"}],
        tsfresh_fe.energy_ratio_by_chunks: [{"num_segments": 10, "segment_focus": 9}],
        tsfresh_fe.friedrich_coefficients: [{"m": 3, "r": 30, "coeff": 2}, {"m": 3, "r": 30, "coeff": 3}],
        tsfresh_fe.change_quantiles: [
            {"f_agg": "var", "isabs": True,  "qh": 1.0, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 1.0, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.6},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.6, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 0.6, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.4, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 1.0, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 1.0, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.0},
            {"f_agg": "var", "isabs": False, "qh": 0.6, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 0.6, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.4, "ql": 0.2},
            {"f_agg": "mean","isabs": True,  "qh": 1.0, "ql": 0.4},
            {"f_agg": "mean","isabs": True,  "qh": 0.6, "ql": 0.4},
        ],
        tsfresh_fe.agg_linear_trend: [
            {"attr": "slope", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "slope", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "slope", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 50, "f_agg": "max"},
            {"attr": "rvalue", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 5,  "f_agg": "max"},
            {"attr": "rvalue", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 10, "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 50, "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 5,  "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 10, "f_agg": "max"},
        ],
    }

    def param_to_str(param):
        if isinstance(param, dict):
            return '_'.join([f"{k}_{v}" for k, v in param.items()])
        else:
            return str(param)

    def calculate_stats_for_feature(func, param=None):
        results = {}
        base_name = func.__name__
        if param is not None:
            base_name += f"_{param_to_str(param)}"

        try:
            # Prepare arguments for each segment
            args_s1 = [s1]
            args_s2 = [s2]
            args_s_whole = [s_whole]
            is_combiner = False

            if param is None: # Simple function, no params
                pass
            elif isinstance(param, dict):
                # Check if it's a combiner function or a function with kwargs
                sig = inspect.signature(func)
                if 'param' in sig.parameters: # Combiner function
                    is_combiner = True
                    args_s1.append([param])
                    args_s2.append([param])
                    args_s_whole.append([param])
                else: # Function with kwargs
                    args_s1.append(param)
                    args_s2.append(param)
                    args_s_whole.append(param)
            else: # Simple function with a single parameter
                args_s1.append(param)
                args_s2.append(param)
                args_s_whole.append(param)

            # Execute function for each segment
            if is_combiner:
                v1_dict = {k: v for k, v in func(*args_s1)}
                v2_dict = {k: v for k, v in func(*args_s2)}
                v_whole_dict = {k: v for k, v in func(*args_s_whole)}
                
                for key in v1_dict:
                    v1, v2, v_whole = v1_dict[key], v2_dict[key], v_whole_dict[key]
                    feat_name_base = f"{func.__name__}_{key}"
                    results[f'{feat_name_base}_left'] = v1
                    results[f'{feat_name_base}_right'] = v2
                    results[f'{feat_name_base}_whole'] = v_whole
                    _add_diff_ratio_feats(feats, feat_name_base, v1, v2)
                    _add_contribution_ratio_feats(results, feat_name_base, v1, v2, v_whole)
                return results

            else:
                if isinstance(param, dict) and not is_combiner:
                    v1, v2, v_whole = func(args_s1[0], **args_s1[1]), func(args_s2[0], **args_s2[1]), func(args_s_whole[0], **args_s_whole[1])
                else:
                    v1, v2, v_whole = func(*args_s1), func(*args_s2), func(*args_s_whole)

                results[f'{base_name}_left'] = v1
                results[f'{base_name}_right'] = v2
                results[f'{base_name}_whole'] = v_whole
                _add_diff_ratio_feats(feats, base_name, v1, v2)
                _add_contribution_ratio_feats(results, base_name, v1, v2, v_whole)
        
        except Exception:
            # For combiner functions, need to know keys to create nulls
            if 'param' in locals() and inspect.isfunction(func) and 'param' in inspect.signature(func).parameters:
                 # It's a combiner, but we can't get keys without running it. Skip for now on error.
                 pass
            else:
                results[f'{base_name}_left'] = np.nan
                results[f'{base_name}_right'] = np.nan
                results[f'{base_name}_whole'] = np.nan
                results[f'{base_name}_diff'] = np.nan
                results[f'{base_name}_ratio'] = np.nan
                
        return results


    for func, params in funcs.items():
        if params is None:
            feats.update(calculate_stats_for_feature(func))
        else:
            for param in params:
                feats.update(calculate_stats_for_feature(func, param))

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 9. 时间序列建模 ---
@register_feature(func_id="9")
def ar_model_features(u: pd.DataFrame) -> dict:
    """
    基于AR模型派生特征。
    1. 在 period 0 上训练模型，预测 period 1，计算残差统计量。
    2. 在 period 1 上训练模型，预测 period 0，计算残差统计量。
    3. 分别在 period 0 和 1 上训练模型，比较模型参数、残差和信息准则(AIC/BIC)。
    """
    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()
    s_whole = u['value'].to_numpy()
    feats = {}
    lags = 5 # 固定阶数以保证可比性

    # # --- 特征组1: 用 s1 训练，预测 s2 ---
    # if len(s1) > lags and len(s2) > 0:
    #     try:
    #         model1_fit = AutoReg(s1, lags=lags).fit()
    #         predictions = model1_fit.predict(start=len(s1), end=len(s1) + len(s2) - 1, dynamic=True)
    #         residuals = s2 - predictions
    #         feats['ar_residuals_s2_pred_mean'] = np.mean(residuals)
    #         feats['ar_residuals_s2_pred_std'] = np.std(residuals)
    #         feats['ar_residuals_s2_pred_skew'] = pd.Series(residuals).skew()
    #         feats['ar_residuals_s2_pred_kurt'] = pd.Series(residuals).kurt()
    #     except Exception:
    #         # 宽泛地捕获异常，防止因数值问题中断
    #         feats.update({'ar_residuals_s2_pred_mean': 0, 'ar_residuals_s2_pred_std': 0, 'ar_residuals_s2_pred_skew': 0, 'ar_residuals_s2_pred_kurt': 0})
    # else:
    #     feats.update({'ar_residuals_s2_pred_mean': 0, 'ar_residuals_s2_pred_std': 0, 'ar_residuals_s2_pred_skew': 0, 'ar_residuals_s2_pred_kurt': 0})

    # # --- 特征组2: 用 s2 训练，预测 s1 ---
    # if len(s2) > lags and len(s1) > 0:
    #     try:
    #         model2_fit = AutoReg(s2, lags=lags).fit()
    #         predictions_on_s1 = model2_fit.predict(start=len(s2), end=len(s2) + len(s1) - 1, dynamic=True)
    #         residuals_s1_pred = s1 - predictions_on_s1
    #         feats['ar_residuals_s1_pred_mean'] = np.mean(residuals_s1_pred)
    #         feats['ar_residuals_s1_pred_std'] = np.std(residuals_s1_pred)
    #         feats['ar_residuals_s1_pred_skew'] = pd.Series(residuals_s1_pred).skew()
    #         feats['ar_residuals_s1_pred_kurt'] = pd.Series(residuals_s1_pred).kurt()
    #     except Exception:
    #         feats.update({'ar_residuals_s1_pred_mean': 0, 'ar_residuals_s1_pred_std': 0, 'ar_residuals_s1_pred_skew': 0, 'ar_residuals_s1_pred_kurt': 0})
    # else:
    #     feats.update({'ar_residuals_s1_pred_mean': 0, 'ar_residuals_s1_pred_std': 0, 'ar_residuals_s1_pred_skew': 0, 'ar_residuals_s1_pred_kurt': 0})


    # --- 特征组3: 分别建模，比较差异 ---
    s1_resid_std, s1_params = np.nan, np.full(lags + 1, np.nan)
    s1_aic, s1_bic = np.nan, np.nan
    if len(s1) > lags:
        try:
            fit1 = AutoReg(s1, lags=lags).fit()
            s1_resid_std = np.std(fit1.resid)
            s1_params = fit1.params
            s1_aic = fit1.aic
            s1_bic = fit1.bic
        except Exception:
            pass

    s2_resid_std, s2_params = np.nan, np.full(lags + 1, np.nan)
    s2_aic, s2_bic = np.nan, np.nan
    if len(s2) > lags:
        try:
            fit2 = AutoReg(s2, lags=lags).fit()
            s2_resid_std = np.std(fit2.resid)
            s2_params = fit2.params
            s2_aic = fit2.aic
            s2_bic = fit2.bic
        except Exception:
            pass

    swhole_resid_std, swhole_params = np.nan, np.full(lags + 1, np.nan)
    swhole_aic, swhole_bic = np.nan, np.nan
    if len(s_whole) > lags:
        try:
            fit_whole = AutoReg(s_whole, lags=lags).fit()
            swhole_resid_std = np.std(fit_whole.resid)
            swhole_params = fit_whole.params
            swhole_aic = fit_whole.aic
            swhole_bic = fit_whole.bic
        except Exception:
            pass
            
    # feats['ar_resid_std_left'] = s1_resid_std
    # feats['ar_resid_std_right'] = s2_resid_std
    # feats['ar_resid_std_whole'] = swhole_resid_std
    # _add_diff_ratio_feats(feats, 'ar_resid_std', s1_resid_std, s2_resid_std)
    # _add_contribution_ratio_feats(feats, 'ar_resid_std', s1_resid_std, s2_resid_std, swhole_resid_std)
    
    # feats['ar_aic_left'] = s1_aic
    # feats['ar_aic_right'] = s2_aic
    # feats['ar_aic_whole'] = swhole_aic
    # _add_diff_ratio_feats(feats, 'ar_aic', s1_aic, s2_aic)
    # _add_contribution_ratio_feats(feats, 'ar_aic', s1_aic, s2_aic, swhole_aic)

    # feats['ar_bic_left'] = s1_bic
    # feats['ar_bic_right'] = s2_bic
    # feats['ar_bic_whole'] = swhole_bic
    # _add_diff_ratio_feats(feats, 'ar_bic', s1_bic, s2_bic)
    # _add_contribution_ratio_feats(feats, 'ar_bic', s1_bic, s2_bic, swhole_bic)
    
    # 比较模型系数
    for i in range(len(s1_params)):
        feats[f'ar_param_{i}_left'] = s1_params[i]
        feats[f'ar_param_{i}_right'] = s2_params[i]
        feats[f'ar_param_{i}_whole'] = swhole_params[i]
        _add_diff_ratio_feats(feats, f'ar_param_{i}', s1_params[i], s2_params[i])
        _add_contribution_ratio_feats(feats, f'ar_param_{i}', s1_params[i], s2_params[i], swhole_params[i])

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

# --- 10. 分段损失 ---
class RPTFeatureExtractor:
    def __init__(self):
        # 所有可用的cost类及其名称
        self.cost_classes = {
            # 'l1': rpt.costs.CostL1,               # 中位数
            # 'l2': rpt.costs.CostL2,               # 均值
            # 'clinear': rpt.costs.CostCLinear,     # 线性协方差
            # 'rbf': rpt.costs.CostRbf,             # RBF核
            # 'normal': rpt.costs.CostNormal,       # 协方差
            # 'ar': rpt.costs.CostAR,               # 自回归
            # 'mahalanobis': rpt.costs.CostMl,      # 马氏距离
            # 'rank': rpt.costs.CostRank,           # 排名
            'cosine': rpt.costs.CostCosine,       # 余弦距离
        }

    def calculate(self, cost, start, end):
        result = cost.error(start, end)
        if isinstance(result, (np.ndarray, list)) and np.array(result).size == 1:
            return float(np.array(result).squeeze())
        return result

    def extract(self, signal, boundary):
        """
        输入：
            signal: 1D numpy array，单变量时间序列
            boundary: int，分割点
        输出：
            result: dict，格式为 {cost_name: {'left': value, 'right': value}}
        """
        signal = np.asarray(signal)
        n = len(signal)
        result = {}
        for name, cls in self.cost_classes.items():
            try:
                if name == 'ar':
                    cost = cls(order=4)
                else:
                    cost = cls()
                cost.fit(signal)
                left = self.calculate(cost, 0, boundary)
                right = self.calculate(cost, boundary, n)
                whole = self.calculate(cost, 0, n)
                # diff = right - left if left is not None and right is not None else None
                # ratio = right / (left + 1e-6) if left is not None and right is not None else None
            except Exception:
                left = None
                right = None
                whole = None
                # diff = None
                # ratio = None
            # Move to _add_diff_ratio_feats, 'diff': diff, 'ratio': ratio
            result[name] = {'left': left, 'right': right, 'whole': whole}
        return result

@register_feature(func_id="10")
def rupture_cost_features(u: pd.DataFrame) -> dict:
    value = u['value'].values.astype(np.float32)
    period = u['period'].values.astype(np.float32)
    boundary = np.where(np.diff(period) != 0)[0].item()
    feats = {}

    extractor = RPTFeatureExtractor()
    features = extractor.extract(value, boundary)

    feats = {}
    for k, v in features.items():
        for seg, value in v.items():
            feats[f'rpt_cost_{k}_{seg}'] = value
        # _add_diff_ratio_feats(feats, f'rpt_cost_{k}', v['left'], v['right'])
        # _add_contribution_ratio_feats(feats, f'rpt_cost_{k}', v['left'], v['right'], v['whole'])

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

In [8]:
def _apply_feature_func_sequential(func, X_df: pd.DataFrame, use_tqdm: bool = False) -> pd.DataFrame:
    """顺序应用单个特征函数"""
    all_ids = X_df.index.get_level_values("id").unique()
    iterator = (
        tqdm(all_ids, desc=f"Running {func.__name__} (sequentially)")
        if use_tqdm else all_ids
    )
    results = [
        {**{'id': id_val}, **func(X_df.loc[id_val])}
        for id_val in iterator
    ]
    return pd.DataFrame(results).set_index('id')

def _apply_feature_func_parallel(func, X_df: pd.DataFrame) -> pd.DataFrame:
    """并行应用单个特征函数"""
    all_ids = X_df.index.get_level_values("id").unique()
    results = Parallel(n_jobs=config.N_JOBS)(
        delayed(lambda df_id, id_val: {**{'id': id_val}, **func(df_id)})(X_df.loc[id_val], id_val)
        for id_val in tqdm(all_ids, desc=f"Running {func.__name__}")
    )
    return pd.DataFrame(results).set_index('id')

def _apply_transform_func(func, X_df: pd.DataFrame) -> List[pd.DataFrame]:
    """执行变换函数"""
    return func(X_df)

def apply_transformation(X_df: pd.DataFrame, transform_funcs: List[str] = None) -> Dict[str, pd.DataFrame]:
    """
    应用时序变换
    
    Args:
        X_df: 输入数据框
        transform_funcs: 要应用的变换函数名称列表，如果为None则应用所有注册的变换函数
        
    Returns:
        Dict[str, pd.DataFrame]: 键为模态名称，值为对应的数据框
    """
    if transform_funcs is None:
        transform_funcs = list(TRANSFORM_REGISTRY.keys())
    
    # 验证变换函数是否存在
    valid_transform_funcs = []
    for func_name in transform_funcs:
        if func_name not in TRANSFORM_REGISTRY:
            pass
            # logger.warning(f"变换函数 {func_name} 未在注册表中找到，已跳过。")
        else:
            valid_transform_funcs.append(func_name)
    
    transform_funcs = valid_transform_funcs
    
    # 存储所有模态的数据框
    transformed_data = {}
    
    for func_name in transform_funcs:
        # logger.info(f"--- 开始应用变换函数: {func_name} ---")
        start_time = time.time()
        
        transform_info = TRANSFORM_REGISTRY[func_name]
        func = transform_info['func']
        output_mode_names = transform_info['output_mode_names']
        
        # 执行变换
        transformed_results = _apply_transform_func(func, X_df)
        
        # 存储结果
        for mode_name, mode_df in zip(output_mode_names, transformed_results):
            transformed_data[mode_name] = mode_df
        
        duration = time.time() - start_time
        # logger.info(f"'{func_name}' 变换完毕，耗时: {duration:.2f} 秒，生成模态: {output_mode_names}")
    
    return transformed_data

def clean_feature_names(df: pd.DataFrame, prefix: str = "f") -> pd.DataFrame:
    """清理特征名称，确保它们是合法的列名。"""
    cleaned_columns = []
    for i, col in enumerate(df.columns):
        # 替换非法字符为 _
        cleaned = re.sub(r'[^\w]', '_', col)
        # 防止开头是数字（如 "123_feature"）非法
        if re.match(r'^\d', cleaned):
            cleaned = f"{prefix}_{cleaned}"
        # 多个连续 _ 合并为一个
        cleaned = re.sub(r'__+', '_', cleaned)
        cleaned_columns.append(cleaned)
    df.columns = cleaned_columns
    return df

In [9]:
def extract_feature_pairs(
    operator_flag='mul', 
    mode_flag='RAW',
    target_feature=['RAW_1_stats_cv_whole']
):
    result = {tf: [] for tf in target_feature}

    for feat in REMAIN_FEATURES:
        # 1. 只处理以 operator_flag 中任意前缀开头的字符串
        if not feat.startswith(operator_flag + '_'):
            continue

        # 2. 只处理包含任意 target_feature 的字符串
        if not any(tf in feat for tf in target_feature):
            continue

        # 3. 解析格式
        def extract_raw_features(feat, mode_flag):
            if mode_flag == 'RAW':
                split = feat.split('_RAW')
                raw_parts = []
                for part in split[1:]:
                    tokens = part.split('_')
                    raw = 'RAW' + '_'.join(tokens)
                    raw_parts.append(raw)
                return raw_parts
            return []

        parts = extract_raw_features(feat, mode_flag)
        # print(parts)
        if len(parts) != 2:
            continue  # 结构不合法
        raw1, raw2 = parts

        # 4. 如果两个都在 top_features，丢弃
        if raw1 in TOP_FEATURES and raw2 in TOP_FEATURES:
            continue

        # 5. 找出包含的 target_feature 是哪个
        for tf in target_feature:
            if tf == raw1 and raw2 not in result[tf]:
                result[tf].append(raw2)
            elif tf == raw2 and raw1 not in result[tf]:
                result[tf].append(raw1)

    return result

In [10]:
def generate_interaction_features(
    feature_df: pd.DataFrame, 
    create_mul: bool = True,
    create_sqmul: bool = False,
    create_add: bool = False,
    create_sub: bool = False,
    create_div: bool = False,
    create_sq: bool = False,
    create_onemulall: bool = False,
    target_feature: str = ['RAW_1_stats_cv_whole']
):
    """
    根据特征重要性文件生成交互特征。
    支持字典格式的特征数据。

    Args:
        feature_df (pd.DataFrame): 特征数据框。
        top_features (list): 要创建交互项的特征列名。
        create_mul (bool): 是否创建乘法交互项。默认为 True。
        create_sqmul (bool): 是否创建乘法平方交互项。默认为 False。
        create_add (bool): 是否创建加法交互项。默认为 False。
        create_sub (bool): 是否创建减法交互项。默认为 False。
        create_div (bool): 是否创建除法交互项。默认为 False。
        create_sq (bool): 是否创建平方交互项。默认为 False。
    """
    # 1. 加载重要性文件并获取 Top N 特征
    logger.info(f"选择 Top {len(TOP_FEATURES)} 特征进行交互项生成: {TOP_FEATURES}")

    # 2. 加载基础特征文件
    initial_feature_count = len(feature_df.columns)

    missing_features = [f for f in TOP_FEATURES if f not in feature_df.columns]
    if missing_features:
        logger.error(f"以下 Top 特征在基础特征文件中缺失，无法创建交互项: {missing_features}")
        return

    # 3. 创建交互特征
    epsilon = 1e-6
    interaction_features_dict = {}
    
    for f1, f2 in combinations(TOP_FEATURES, 2):
        if create_mul:
            interaction_features_dict[f'mul_{f1}_{f2}'] = feature_df[f1] * feature_df[f2]
        if create_sqmul:
            interaction_features_dict[f'sqmul_{f1}_{f2}'] = feature_df[f1] * (feature_df[f2] ** 2)
            interaction_features_dict[f'sqmul_{f2}_{f1}'] = feature_df[f2] * (feature_df[f1] ** 2)
        if create_sub:
            interaction_features_dict[f'sub_{f1}_{f2}'] = feature_df[f1] - feature_df[f2]
        if create_add:
            interaction_features_dict[f'add_{f1}_{f2}'] = feature_df[f1] + feature_df[f2]
        if create_div:
            interaction_features_dict[f'div_{f1}_{f2}'] = feature_df[f1] / (feature_df[f2] + epsilon)
            interaction_features_dict[f'div_{f2}_{f1}'] = feature_df[f2] / (feature_df[f1] + epsilon)

    for f in TOP_FEATURES:
        if create_sq:
            interaction_features_dict[f'sq_{f}'] = feature_df[f] ** 2

    onemulall_dict = extract_feature_pairs(operator_flag='mul', mode_flag='RAW', target_feature=target_feature)
    for k, v in onemulall_dict.items():
        logger.info(f"target {k} onemulall 交互 {len(v)} 个特征。")
        for f in v:
            if create_onemulall:
                interaction_features_dict[f'mul_{k}_{f}'] = feature_df[k] * feature_df[f]
    
    # 一次性创建DataFrame，避免碎片化
    if interaction_features_dict:
        interaction_features = pd.DataFrame(interaction_features_dict, index=feature_df.index)
    else:
        interaction_features = pd.DataFrame(index=feature_df.index)
    
    if interaction_features.empty:
        logger.info("没有选择任何交互项类型，操作中止。")
        return
    
    logger.info(f"成功创建 {len(interaction_features.columns)} 个交互特征。")
    logger.info("--- 新生成的交互特征列表 ---")
    logger.info(interaction_features.columns.tolist())
    logger.info("------------------------------")
    
    # 4. 合并并保存
    feature_df = feature_df.drop(columns=interaction_features.columns, errors='ignore')
    feature_df = feature_df.merge(interaction_features, left_index=True, right_index=True, how='left')
    feature_df = clean_feature_names(feature_df)

    return feature_df

In [11]:
def train(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    model_directory_path: str,
):
    # For our baseline t-test approach, we don't need to train a model
    # This is essentially an unsupervised approach calculated at inference time
    model = None

    # You could enhance this by training an actual model, for example:
    # 1. Extract features from before/after segments of each time series
    # 2. Train a classifier using these features and y_train labels
    # 3. Save the trained model

    joblib.dump(model, os.path.join(model_directory_path, 'none.joblib'))

In [12]:
def load_models(model_directory_path):
    """Load all LightGBM model files saved with joblib and prepare them for ensemble"""
    models = []
    dirpath = Path(model_directory_path)
    model_files = list(dirpath.glob('*.pkl'))
    
    if not model_files:
        logger.warning(f"Warning: No model files found under {model_directory_path}!")
        return models
    logger.info(f"Found a total of {len(model_files)} model files.")
    
    for model_path in model_files:
        try:
            logger.info(f"Loading model: {model_path}")
            model = joblib.load(model_path)
            models.append(model)
        except Exception as e:
            logger.warning(f"Error loading model {model_path}: {e}")
    
    return models

In [13]:
def infer(
    X_test: typing.Iterable[pd.DataFrame],
    model_directory_path: str,
):
    global logger, log_file_path
    if logger is None:  # 防止重复初始化
        logger, log_file_path = get_logger('Inference', Path(os.path.join(model_directory_path, 'infer_logs')), verbose=False)

    # Load models
    models = load_models(model_directory_path)

    # Funcs to run
    funcs_to_run = [
        f for f in FEATURE_REGISTRY.keys() 
        if f not in EXPERIMENTAL_FEATURES
    ]
    trans_to_run = None
    logger.info(f"未指定特征函数，将运行所有 {len(funcs_to_run)} 个非实验性特征。")

    yield  # Mark as ready

    # X_test can only be iterated once.
    # Before getting the next dataset, you must predict the current one.
    for X_df in tqdm(X_test, desc="Inference Progress"):
        logger.info(X_df)
        logger.info("未找到基础特征文件，将创建全新的特征集。")
        feature_df, metadata = pd.DataFrame(index=X_df.index.get_level_values('id').unique()), {}
        logger.info(feature_df)
        logger.info("=== 开始时序分解 ===")
        transformed_data = apply_transformation(X_df, trans_to_run)
        logger.info(f"分解完成，共生成 {len(transformed_data)} 个模态: {list(transformed_data.keys())}")

        loaded_features = feature_df.columns.tolist()
        initial_feature_count = len(feature_df.columns)

        for mode_name, mode_df in transformed_data.items():
            logger.info(f"=== 开始为模态 '{mode_name}' 生成特征 ===")
            for func_name in funcs_to_run:
                logger.info(f"--- 开始生成特征: {func_name} ---")
                start_time = time.time()
                
                feature_info = FEATURE_REGISTRY[func_name]
                func = feature_info['func']
                is_parallelizable = feature_info['parallelizable']
                is_parallelizable = None  # 强制禁用并行化
                func_id = feature_info['func_id']
                
                if is_parallelizable:
                    new_features_df = _apply_feature_func_parallel(func, mode_df)
                else:
                    logger.info(f"函数 '{func_name}' 不可并行化，将顺序执行。")
                    new_features_df = _apply_feature_func_sequential(func, mode_df)
                new_features_df.columns = [f"{mode_name}_{func_id}_{col}" for col in new_features_df.columns]

                # 记录日志
                duration = time.time() - start_time
                logger.info(f"'{func_name}' 生成完毕，耗时: {duration:.2f} 秒。")
                logger.info(f"  新生成特征列名: {new_features_df.columns.tolist()}")
                
                for col in new_features_df.columns:
                    null_ratio = new_features_df[col].isnull().sum() / len(new_features_df)
                    zero_ratio = (new_features_df[col] == 0).sum() / len(new_features_df)
                    if null_ratio > 0.1:
                        logger.warning(f"    - '{col}': 空值比例={null_ratio:.2%}, 零值比例={zero_ratio:.2%}")

                # 删除旧版本特征（如果存在），然后合并
                feature_df = feature_df.drop(columns=new_features_df.columns, errors='ignore')
                feature_df = feature_df.merge(new_features_df, left_index=True, right_index=True, how='left')
                # feature_df, removed_features = check_new_features_corr(feature_df, loaded_features, drop_flag=True, threshold=0.95)
                feature_df = clean_feature_names(feature_df)
                loaded_features = feature_df.columns.tolist()

        missing_features = [f for f in TOP_FEATURES if f not in feature_df.columns]
        if missing_features:
            logger.warning(f"Missing TOP_FEATURES before interaction: {missing_features}")
        feature_df = generate_interaction_features(
            feature_df, 
            create_mul=True, create_sqmul=True, create_add=True, create_sub=True, create_div=True, create_sq=True, create_onemulall=True,
            target_feature = ['RAW_1_stats_cv_whole']
        )
        missing_features = [f for f in REMAIN_FEATURES if f not in feature_df.columns]
        if missing_features:
            logger.warning(f"Missing REMAIN_FEATURES before filter: {missing_features}")
        feature_df = feature_df[REMAIN_FEATURES]
        
        logger.info(feature_df)
        logger.info("--- 生成后完整特征列表 ---")
        logger.info(f"{feature_df.columns.tolist()}")
        logger.info("-----------------------------")
        logger.info(f"生成/更新完成。总特征数: {len(feature_df.columns)}")

        def ensemble_predict(models, X):
            preds = [model.predict_proba(X)[:, 1] for model in models]
            if len(preds) == 0:
                logger.warning("No predictions generated, returning zeros.")
                return np.zeros(len(X))
            return np.mean(preds, axis=0)
        prediction = ensemble_predict(models, feature_df)
        prediction = 1 - prediction

        yield prediction  # Send the prediction for the current dataset

In [14]:
crunch.test(
    # Uncomment to disable the train
    # force_first_train=False,

    # Uncomment to disable the determinism check
    # no_determinism_check=True,
)

[32m10:02:07[0m [33mno forbidden library found[0m
[32m10:02:07[0m [33m[0m
[32m10:02:11[0m started
[32m10:02:11[0m running local test
[32m10:02:11[0m [33minternet access isn't restricted, no check will be done[0m
[32m10:02:11[0m 
[32m10:02:15[0m starting unstructured loop...
[32m10:02:15[0m executing - command=train


data\X_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_train.parquet (204327238 bytes)
data\X_train.parquet: already exists, file length match
data\X_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_test.reduced.parquet (2380918 bytes)
data\X_test.reduced.parquet: already exists, file length match
data\y_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_train.parquet (61003 bytes)
data\y_train.parquet: already exists, file length match
data\y_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_test.reduced.parquet (2655 bytes)
data\y_test.reduced.parquet: already exists, file length match


[32m10:02:16[0m executing - command=infer
Inference Progress: 101it [00:55,  1.81it/s]
[32m10:03:13[0m checking determinism by executing the inference again with 30% of the data (tolerance: 1e-08)
[32m10:03:13[0m executing - command=infer
Inference Progress: 30it [00:16,  1.78it/s]
[32m10:03:30[0m determinism check: passed
[32m10:03:30[0m [33msave prediction - path=data\prediction.parquet[0m
[32m10:03:30[0m ended
[32m10:03:30[0m [33mduration - time=00:01:19[0m
[32m10:03:30[0m [33mmemory - before="321.32 MB" after="405.96 MB" consumed="84.64 MB"[0m


In [15]:
prediction = pd.read_parquet("data/prediction.parquet")
prediction

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
10001,0.928542
10002,0.914761
10003,0.817947
10004,0.890018
10005,0.696638
...,...
10097,0.931696
10098,0.958928
10099,0.822730
10100,0.969648


In [16]:
# Load the targets
target = pd.read_parquet("data/y_test.reduced.parquet")["structural_breakpoint"]

# Call the scoring function
sklearn.metrics.roc_auc_score(
    target,
    prediction,
)

np.float64(0.0812206572769953)