In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.ar_model import AutoReg
from scipy.stats import theilslopes
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import logging
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. read dataset
print("Loading data...")
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
print("Data loaded.")

Loading data...
Data loaded.


In [None]:
def sktime_cpd_template(u: pd.DataFrame) -> dict:
    """使用CPD检测器计算最显著changepoint位置与boundary的差异"""
    signal = u['value'].to_numpy()
    s1 = u['value'][u['period'] == 0].reset_index(drop=True)
    s2 = u['value'][u['period'] == 1].reset_index(drop=True)
    feats = {}

    s1_length = len(s1)
    s2_length = len(s2)
    full_length = len(signal)
    boundary_idx = s1_length                              # 边界点的idx

    n_bkps = 1

    detected_cp_idx = 
    feats[f'CPD_{method}_distance'] = (detected_cp_idx - boundary_idx)

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

In [None]:
from skchange.change_detectors import MovingWindow, PELT
from skchange.costs import L2Cost, L1Cost

def sktime_cpd_movingwindow(u: pd.DataFrame, bandwidth: int = 30, threshold_scale: float = 2.0) -> dict:
    """使用MovingWindow算法进行CPD特征工程"""
    signal = u['value'].to_numpy()
    s1 = u['value'][u['period'] == 0].reset_index(drop=True)
    s2 = u['value'][u['period'] == 1].reset_index(drop=True)
    feats = {}

    s1_length = len(s1)
    s2_length = len(s2)
    full_length = len(signal)
    boundary_idx = s1_length

    try:
        # 调整bandwidth以适应数据长度
        effective_bandwidth = min(bandwidth, full_length // 4)
        
        # MovingWindow with L2Cost
        detector_l2 = MovingWindow(
            change_score=L2Cost(),
            bandwidth=effective_bandwidth,
            threshold_scale=threshold_scale,
            level=0.01,
            min_detection_interval=1
        )
        
        # 拟合并预测
        signal_df = pd.DataFrame({'value': signal})
        changepoints_l2 = detector_l2.fit_predict(signal_df)
        
        # 计算基础特征
        if len(changepoints_l2) > 0:
            detected_cp_idx = changepoints_l2.iloc[0]  # 第一个检测到的changepoint
            feats['MovingWindow_L2_distance'] = detected_cp_idx - boundary_idx
            feats['MovingWindow_L2_relative_distance'] = (detected_cp_idx - boundary_idx) / full_length
            feats['MovingWindow_L2_cp_count'] = len(changepoints_l2)
            
            # 计算检测到的changepoint与真实边界的相对位置
            feats['MovingWindow_L2_cp_position_ratio'] = detected_cp_idx / full_length
            feats['MovingWindow_L2_boundary_ratio'] = boundary_idx / full_length
            
            # 计算最近的changepoint
            distances = np.abs(changepoints_l2.values - boundary_idx)
            closest_cp_idx = changepoints_l2.iloc[np.argmin(distances)]
            feats['MovingWindow_L2_closest_distance'] = closest_cp_idx - boundary_idx
            feats['MovingWindow_L2_min_distance'] = np.min(distances)
            
        else:
            feats['MovingWindow_L2_distance'] = 0
            feats['MovingWindow_L2_relative_distance'] = 0
            feats['MovingWindow_L2_cp_count'] = 0
            feats['MovingWindow_L2_cp_position_ratio'] = 0
            feats['MovingWindow_L2_boundary_ratio'] = boundary_idx / full_length
            feats['MovingWindow_L2_closest_distance'] = 0
            feats['MovingWindow_L2_min_distance'] = 0
        
        # 获取预测分数（如果可用）
        try:
            scores = detector_l2.predict_scores(signal_df)
            if len(scores) > 0:
                feats['MovingWindow_L2_max_score'] = np.max(scores)
                feats['MovingWindow_L2_mean_score'] = np.mean(scores)
                feats['MovingWindow_L2_score_std'] = np.std(scores)
                
                # 边界附近的分数
                boundary_window = 10
                start_idx = max(0, boundary_idx - boundary_window)
                end_idx = min(len(scores), boundary_idx + boundary_window)
                if start_idx < end_idx:
                    boundary_scores = scores[start_idx:end_idx]
                    feats['MovingWindow_L2_boundary_max_score'] = np.max(boundary_scores)
                    feats['MovingWindow_L2_boundary_mean_score'] = np.mean(boundary_scores)
                else:
                    feats['MovingWindow_L2_boundary_max_score'] = 0
                    feats['MovingWindow_L2_boundary_mean_score'] = 0
            else:
                feats['MovingWindow_L2_max_score'] = 0
                feats['MovingWindow_L2_mean_score'] = 0
                feats['MovingWindow_L2_score_std'] = 0
                feats['MovingWindow_L2_boundary_max_score'] = 0
                feats['MovingWindow_L2_boundary_mean_score'] = 0
        except:
            feats['MovingWindow_L2_max_score'] = 0
            feats['MovingWindow_L2_mean_score'] = 0
            feats['MovingWindow_L2_score_std'] = 0
            feats['MovingWindow_L2_boundary_max_score'] = 0
            feats['MovingWindow_L2_boundary_mean_score'] = 0
        
        # MovingWindow with L1Cost
        try:
            detector_l1 = MovingWindow(
                change_score=L1Cost(),
                bandwidth=effective_bandwidth,
                threshold_scale=threshold_scale,
                level=0.01,
                min_detection_interval=1
            )
            
            changepoints_l1 = detector_l1.fit_predict(signal_df)
            
            if len(changepoints_l1) > 0:
                detected_cp_idx_l1 = changepoints_l1.iloc[0]
                feats['MovingWindow_L1_distance'] = detected_cp_idx_l1 - boundary_idx
                feats['MovingWindow_L1_relative_distance'] = (detected_cp_idx_l1 - boundary_idx) / full_length
                feats['MovingWindow_L1_cp_count'] = len(changepoints_l1)
                
                # 最近的changepoint
                distances_l1 = np.abs(changepoints_l1.values - boundary_idx)
                feats['MovingWindow_L1_min_distance'] = np.min(distances_l1)
            else:
                feats['MovingWindow_L1_distance'] = 0
                feats['MovingWindow_L1_relative_distance'] = 0
                feats['MovingWindow_L1_cp_count'] = 0
                feats['MovingWindow_L1_min_distance'] = 0
        except:
            feats['MovingWindow_L1_distance'] = 0
            feats['MovingWindow_L1_relative_distance'] = 0
            feats['MovingWindow_L1_cp_count'] = 0
            feats['MovingWindow_L1_min_distance'] = 0
        
    except Exception as e:
        # 如果出错，返回默认值
        default_features = [
            'MovingWindow_L2_distance', 'MovingWindow_L2_relative_distance', 'MovingWindow_L2_cp_count',
            'MovingWindow_L2_cp_position_ratio', 'MovingWindow_L2_boundary_ratio', 'MovingWindow_L2_closest_distance',
            'MovingWindow_L2_min_distance', 'MovingWindow_L2_max_score', 'MovingWindow_L2_mean_score',
            'MovingWindow_L2_score_std', 'MovingWindow_L2_boundary_max_score', 'MovingWindow_L2_boundary_mean_score',
            'MovingWindow_L1_distance', 'MovingWindow_L1_relative_distance', 'MovingWindow_L1_cp_count',
            'MovingWindow_L1_min_distance'
        ]
        for feat in default_features:
            feats[feat] = 0

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}


def sktime_cpd_pelt(u: pd.DataFrame, penalty_scale: float = 2.0, min_segment_length: int = 2) -> dict:
    """使用PELT算法进行CPD特征工程"""
    signal = u['value'].to_numpy()
    s1 = u['value'][u['period'] == 0].reset_index(drop=True)
    s2 = u['value'][u['period'] == 1].reset_index(drop=True)
    feats = {}

    s1_length = len(s1)
    s2_length = len(s2)
    full_length = len(signal)
    boundary_idx = s1_length

    try:
        # 调整min_segment_length以适应数据长度
        effective_min_segment_length = min(min_segment_length, full_length // 10)
        
        # PELT with L2Cost
        detector_l2 = PELT(
            cost=L2Cost(),
            penalty_scale=penalty_scale,
            min_segment_length=effective_min_segment_length
        )
        
        # 拟合并预测
        signal_df = pd.DataFrame({'value': signal})
        changepoints_l2 = detector_l2.fit_predict(signal_df)
        
        # 计算基础特征
        if len(changepoints_l2) > 0:
            detected_cp_idx = changepoints_l2.iloc[0]  # 第一个检测到的changepoint
            feats['PELT_L2_distance'] = detected_cp_idx - boundary_idx
            feats['PELT_L2_relative_distance'] = (detected_cp_idx - boundary_idx) / full_length
            feats['PELT_L2_cp_count'] = len(changepoints_l2)
            
            # 计算检测到的changepoint与真实边界的相对位置
            feats['PELT_L2_cp_position_ratio'] = detected_cp_idx / full_length
            feats['PELT_L2_boundary_ratio'] = boundary_idx / full_length
            
            # 计算最近的changepoint
            distances = np.abs(changepoints_l2.values - boundary_idx)
            closest_cp_idx = changepoints_l2.iloc[np.argmin(distances)]
            feats['PELT_L2_closest_distance'] = closest_cp_idx - boundary_idx
            feats['PELT_L2_min_distance'] = np.min(distances)
            
            # 计算changepoint密度
            if len(changepoints_l2) > 1:
                cp_intervals = np.diff(changepoints_l2.values)
                feats['PELT_L2_mean_interval'] = np.mean(cp_intervals)
                feats['PELT_L2_std_interval'] = np.std(cp_intervals)
                feats['PELT_L2_min_interval'] = np.min(cp_intervals)
            else:
                feats['PELT_L2_mean_interval'] = full_length
                feats['PELT_L2_std_interval'] = 0
                feats['PELT_L2_min_interval'] = full_length
                
        else:
            feats['PELT_L2_distance'] = 0
            feats['PELT_L2_relative_distance'] = 0
            feats['PELT_L2_cp_count'] = 0
            feats['PELT_L2_cp_position_ratio'] = 0
            feats['PELT_L2_boundary_ratio'] = boundary_idx / full_length
            feats['PELT_L2_closest_distance'] = 0
            feats['PELT_L2_min_distance'] = 0
            feats['PELT_L2_mean_interval'] = full_length
            feats['PELT_L2_std_interval'] = 0
            feats['PELT_L2_min_interval'] = full_length
        
        # 获取预测分数（如果可用）
        try:
            scores = detector_l2.predict_scores(signal_df)
            if len(scores) > 0:
                feats['PELT_L2_max_score'] = np.max(scores)
                feats['PELT_L2_mean_score'] = np.mean(scores)
                feats['PELT_L2_score_std'] = np.std(scores)
                
                # 边界附近的分数
                boundary_window = 10
                start_idx = max(0, boundary_idx - boundary_window)
                end_idx = min(len(scores), boundary_idx + boundary_window)
                if start_idx < end_idx:
                    boundary_scores = scores[start_idx:end_idx]
                    feats['PELT_L2_boundary_max_score'] = np.max(boundary_scores)
                    feats['PELT_L2_boundary_mean_score'] = np.mean(boundary_scores)
                else:
                    feats['PELT_L2_boundary_max_score'] = 0
                    feats['PELT_L2_boundary_mean_score'] = 0
            else:
                feats['PELT_L2_max_score'] = 0
                feats['PELT_L2_mean_score'] = 0
                feats['PELT_L2_score_std'] = 0
                feats['PELT_L2_boundary_max_score'] = 0
                feats['PELT_L2_boundary_mean_score'] = 0
        except:
            feats['PELT_L2_max_score'] = 0
            feats['PELT_L2_mean_score'] = 0
            feats['PELT_L2_score_std'] = 0
            feats['PELT_L2_boundary_max_score'] = 0
            feats['PELT_L2_boundary_mean_score'] = 0
        
        # PELT with L1Cost
        try:
            detector_l1 = PELT(
                cost=L1Cost(),
                penalty_scale=penalty_scale,
                min_segment_length=effective_min_segment_length
            )
            
            changepoints_l1 = detector_l1.fit_predict(signal_df)
            
            if len(changepoints_l1) > 0:
                detected_cp_idx_l1 = changepoints_l1.iloc[0]
                feats['PELT_L1_distance'] = detected_cp_idx_l1 - boundary_idx
                feats['PELT_L1_relative_distance'] = (detected_cp_idx_l1 - boundary_idx) / full_length
                feats['PELT_L1_cp_count'] = len(changepoints_l1)
                
                # 最近的changepoint
                distances_l1 = np.abs(changepoints_l1.values - boundary_idx)
                feats['PELT_L1_min_distance'] = np.min(distances_l1)
            else:
                feats['PELT_L1_distance'] = 0
                feats['PELT_L1_relative_distance'] = 0
                feats['PELT_L1_cp_count'] = 0
                feats['PELT_L1_min_distance'] = 0
        except:
            feats['PELT_L1_distance'] = 0
            feats['PELT_L1_relative_distance'] = 0
            feats['PELT_L1_cp_count'] = 0
            feats['PELT_L1_min_distance'] = 0
            
    except Exception as e:
        # 如果出错，返回默认值
        default_features = [
            'PELT_L2_distance', 'PELT_L2_relative_distance', 'PELT_L2_cp_count',
            'PELT_L2_cp_position_ratio', 'PELT_L2_boundary_ratio', 'PELT_L2_closest_distance',
            'PELT_L2_min_distance', 'PELT_L2_mean_interval', 'PELT_L2_std_interval', 'PELT_L2_min_interval',
            'PELT_L2_max_score', 'PELT_L2_mean_score', 'PELT_L2_score_std', 
            'PELT_L2_boundary_max_score', 'PELT_L2_boundary_mean_score',
            'PELT_L1_distance', 'PELT_L1_relative_distance', 'PELT_L1_cp_count', 'PELT_L1_min_distance'
        ]
        for feat in default_features:
            feats[feat] = 0

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

def sktime_cpd_combined(u: pd.DataFrame) -> dict:
    """结合多个CPD算法的特征工程"""
    feats = {}
    
    # 获取MovingWindow特征
    mw_feats = sktime_cpd_movingwindow(u)
    feats.update(mw_feats)
    
    # 获取PELT特征
    pelt_feats = sktime_cpd_pelt(u)
    feats.update(pelt_feats)
    
    # 计算算法间的一致性特征
    try:
        # 比较不同算法检测到的changepoint
        mw_l2_distance = mw_feats.get('MovingWindow_L2_distance', 0)
        pelt_l2_distance = pelt_feats.get('PELT_L2_distance', 0)
        
        # 算法间的距离差异
        feats['CPD_algorithm_distance_diff'] = abs(mw_l2_distance - pelt_l2_distance)
        
        # 算法间的相对距离差异
        if mw_l2_distance != 0 or pelt_l2_distance != 0:
            feats['CPD_algorithm_relative_diff'] = abs(mw_l2_distance - pelt_l2_distance) / (abs(mw_l2_distance) + abs(pelt_l2_distance) + 1e-8)
        else:
            feats['CPD_algorithm_relative_diff'] = 0
        
        # 算法检测到的changepoint数量差异
        mw_count = mw_feats.get('MovingWindow_L2_cp_count', 0)
        pelt_count = pelt_feats.get('PELT_L2_cp_count', 0)
        feats['CPD_algorithm_count_diff'] = abs(mw_count - pelt_count)
        
        # 算法一致性指标
        if mw_l2_distance == 0 and pelt_l2_distance == 0:
            feats['CPD_algorithm_consensus'] = 1.0  # 都没检测到
        elif abs(mw_l2_distance - pelt_l2_distance) <= 5:  # 差异在5个时间点内
            feats['CPD_algorithm_consensus'] = 1.0
        else:
            feats['CPD_algorithm_consensus'] = 0.0
            
    except:
        feats['CPD_algorithm_distance_diff'] = 0
        feats['CPD_algorithm_relative_diff'] = 0
        feats['CPD_algorithm_count_diff'] = 0
        feats['CPD_algorithm_consensus'] = 0
    
    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

In [8]:
for idx in range(10):
    feats = sktime_cpd_combined(X_train.loc[idx])
    print(feats)
    print(y_train.loc[idx])

{'MovingWindow_L2_distance': 0.0, 'MovingWindow_L2_relative_distance': 0.0, 'MovingWindow_L2_cp_count': 0.0, 'MovingWindow_L2_cp_position_ratio': 0.0, 'MovingWindow_L2_boundary_ratio': 0.0, 'MovingWindow_L2_closest_distance': 0.0, 'MovingWindow_L2_min_distance': 0.0, 'MovingWindow_L2_max_score': 0.0, 'MovingWindow_L2_mean_score': 0.0, 'MovingWindow_L2_score_std': 0.0, 'MovingWindow_L2_boundary_max_score': 0.0, 'MovingWindow_L2_boundary_mean_score': 0.0, 'MovingWindow_L1_distance': 0.0, 'MovingWindow_L1_relative_distance': 0.0, 'MovingWindow_L1_cp_count': 0.0, 'MovingWindow_L1_min_distance': 0.0, 'PELT_L2_distance': 0.0, 'PELT_L2_relative_distance': 0.0, 'PELT_L2_cp_count': 0.0, 'PELT_L2_cp_position_ratio': 0.0, 'PELT_L2_boundary_ratio': 0.0, 'PELT_L2_closest_distance': 0.0, 'PELT_L2_min_distance': 0.0, 'PELT_L2_mean_interval': 0.0, 'PELT_L2_std_interval': 0.0, 'PELT_L2_min_interval': 0.0, 'PELT_L2_max_score': 0.0, 'PELT_L2_mean_score': 0.0, 'PELT_L2_score_std': 0.0, 'PELT_L2_boundary_m

In [None]:
from skchange.change_detectors import MovingWindow, PELT
from skchange.costs import L2Cost, L1Cost

In [18]:
u = X_train.loc[0]
bandwidth = 50

"""使用MovingWindow算法进行CPD特征工程"""
signal = u['value'].to_numpy()
s1 = u['value'][u['period'] == 0].reset_index(drop=True)
s2 = u['value'][u['period'] == 1].reset_index(drop=True)
feats = {}

s1_length = len(s1)
s2_length = len(s2)
full_length = len(signal)
boundary_idx = s1_length

# 调整bandwidth以适应数据长度
effective_bandwidth = min(bandwidth, full_length // 4)

# MovingWindow with L2Cost
detector_l2 = MovingWindow(
    change_score=L2Cost(),
    bandwidth=effective_bandwidth
)

# 拟合并预测
signal_df = pd.DataFrame({'value': signal})
changepoints_l2 = detector_l2.fit_predict(signal_df)

# 计算基础特征
if len(changepoints_l2) > 0:
    detected_cp_idx = changepoints_l2.iloc[0]  # 第一个检测到的changepoint
    feats['MovingWindow_L2_distance'] = detected_cp_idx - boundary_idx
    feats['MovingWindow_L2_relative_distance'] = (detected_cp_idx - boundary_idx) / full_length
    feats['MovingWindow_L2_cp_count'] = len(changepoints_l2)
    
    # 计算检测到的changepoint与真实边界的相对位置
    feats['MovingWindow_L2_cp_position_ratio'] = detected_cp_idx / full_length
    feats['MovingWindow_L2_boundary_ratio'] = boundary_idx / full_length
    
    # 计算最近的changepoint
    distances = np.abs(changepoints_l2.values - boundary_idx)
    closest_cp_idx = changepoints_l2.iloc[np.argmin(distances)]
    feats['MovingWindow_L2_closest_distance'] = closest_cp_idx - boundary_idx
    feats['MovingWindow_L2_min_distance'] = np.min(distances)

Testing comprehensive CPD features:
Total features: 4
CPD_consensus_std: 0.0
CPD_consensus_range: 0.0
CPD_consensus_mean: 0.0
