In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.ar_model import AutoReg
from scipy.stats import theilslopes
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import logging
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. read dataset
print("Loading data...")
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
print("Data loaded.")

Loading data...
Data loaded.


In [3]:
import ruptures as rpt

In [41]:
def ruptures_kernelCPD(u: pd.DataFrame) -> dict:
    """使用Ruptures的kernelCPD检测器计算最显著changepoint位置与boundary的差异"""
    signal = u['value'].to_numpy()
    s1 = u['value'][u['period'] == 0].reset_index(drop=True)
    s2 = u['value'][u['period'] == 1].reset_index(drop=True)
    feats = {}

    s1_length = len(s1)
    s2_length = len(s2)
    full_length = len(signal)
    boundary_idx = s1_length                              # 边界点的idx
    start_idx = max(0, boundary_idx - s2_length)          # 检查区域开始的idx
    end_idx = min(full_length, boundary_idx + s2_length)  # 检查区域结束的idx
    symmetrical_signal = signal[start_idx:end_idx]

    n_bkps = 1
    kernels = {
        "linear": None, 
        "rbf": {"gamma": 0.1}, 
        "cosine": None
    }
    for kernel in kernels.keys():
        try:
            model = rpt.KernelCPD(kernel=kernel, min_size=len(s2)//2, jump=5, params=kernels[kernel]).fit(symmetrical_signal)
            bkps = model.predict(n_bkps=n_bkps)
            
            for i in range(n_bkps):
                detected_cp_idx = bkps[i] + start_idx if bkps else full_length
                feats[f'CPD_kernel_{kernel}_distance_{i}'] = (detected_cp_idx - boundary_idx) / s2_length
        except Exception as e:
            # print(f"Error occurred: {e}")
            feats[f'CPD_kernel_{kernel}_distance'] = 1.0  # 最大偏移

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

def ruptures_bottomup(u: pd.DataFrame) -> dict:
    """使用Ruptures的Bottom-up segmentation检测器计算最显著changepoint位置与boundary的差异"""
    signal = u['value'].to_numpy()
    s1 = u['value'][u['period'] == 0].reset_index(drop=True)
    s2 = u['value'][u['period'] == 1].reset_index(drop=True)
    feats = {}

    s1_length = len(s1)
    s2_length = len(s2)
    full_length = len(signal)
    boundary_idx = s1_length                              # 边界点的idx
    start_idx = max(0, boundary_idx - s2_length)          # 检查区域开始的idx
    end_idx = min(full_length, boundary_idx + s2_length)  # 检查区域结束的idx
    symmetrical_signal = signal[start_idx:end_idx]

    n_bkps = 1
    # cost_funcs = {
    #     rpt.costs.CostL1: None,
    #     rpt.costs.CostL2: None,
    #     rpt.costs.CostNormal: None,
    #     rpt.costs.CostRbf: None,
    #     rpt.costs.CostCosine: None,
    #     rpt.costs.CostLinear: None,
    #     rpt.costs.CostCLinear: None,
    #     rpt.costs.CostRank: None,
    #     rpt.costs.CostMl: None,
    #     rpt.costs.CostAR: {"order": 4}
    # }
    # for cost in cost_funcs.keys():
    models = {
        "l1": None, 
        "l2": None, 
        "rbf": {"gamma": 0.1}, 
        "linear": None, 
        "normal": None, 
        "ar": None,
    }
    for model_name in models.keys():
        try:
            # model = rpt.BottomUp(custom_cost=cost, min_size=len(s2)//2, jump=5, params=cost_funcs[cost]).fit(symmetrical_signal)
            model = rpt.BottomUp(model=model_name, min_size=len(s2)//2, jump=5, params=models[model_name]).fit(symmetrical_signal)
            bkps = model.predict(n_bkps=n_bkps)
            
            detected_cp_idx = bkps[0] + start_idx if bkps else full_length
            feats[f'CPD_bottomup_{model_name}_distance'] = (detected_cp_idx - boundary_idx) / s2_length
        except Exception as e:
            print(f"Error occurred: {e}")
            feats[f'CPD_bottomup_{model_name}_distance'] = 1.0  # 最大偏移

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

def ruptures_window(u: pd.DataFrame) -> dict:
    """使用Ruptures的Window-based change point detection检测器计算最显著changepoint位置与boundary的差异"""
    signal = u['value'].to_numpy()
    s1 = u['value'][u['period'] == 0].reset_index(drop=True)
    s2 = u['value'][u['period'] == 1].reset_index(drop=True)
    feats = {}

    s1_length = len(s1)
    s2_length = len(s2)
    full_length = len(signal)
    boundary_idx = s1_length                              # 边界点的idx
    start_idx = max(0, boundary_idx - s2_length)          # 检查区域开始的idx
    end_idx = min(full_length, boundary_idx + s2_length)  # 检查区域结束的idx
    symmetrical_signal = signal[start_idx:end_idx]

    n_bkps = 1
    # cost_funcs = {
    #     rpt.costs.CostL1: None,
    #     rpt.costs.CostL2: None,
    #     rpt.costs.CostNormal: None,
    #     rpt.costs.CostRbf: None,
    #     rpt.costs.CostCosine: None,
    #     rpt.costs.CostLinear: None,
    #     rpt.costs.CostCLinear: None,
    #     rpt.costs.CostRank: None,
    #     rpt.costs.CostMl: None,
    #     rpt.costs.CostAR: {"order": 4}
    # }
    # for cost in cost_funcs.keys():
    models = {
        "l1": None, 
        "l2": None, 
        "rbf": {"gamma": 0.1}, 
        "linear": None, 
        "normal": None, 
        "ar": None,
    }
    for model_name in models.keys():
        try:
            # model = rpt.Window(custom_cost=cost, min_size=len(s2)//2, jump=5, params=cost_funcs[cost]).fit(symmetrical_signal)
            model = rpt.Window(width=len(s2)//8, model=model_name, jump=1, params=models[model_name]).fit(symmetrical_signal)
            bkps = model.predict(n_bkps=n_bkps)
            
            detected_cp_idx = bkps[0] + start_idx if bkps else full_length
            feats[f'CPD_window_{model_name}_distance'] = (detected_cp_idx - boundary_idx) / s2_length
        except Exception as e:
            print(f"Error occurred: {e}")
            feats[f'CPD_window_{model_name}_distance'] = 1.0  # 最大偏移

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

In [18]:
for idx in range(10):
    feats = ruptures_kernelCPD(X_train.loc[idx])
    print(feats)
    print(y_train.loc[idx])

{'CPD_kernel_linear_distance_0': 0.47959183673469385, 'CPD_kernel_rbf_distance_0': -0.5, 'CPD_kernel_cosine_distance_0': 0.17687074829931973}
structural_breakpoint    False
Name: 0, dtype: bool
{'CPD_kernel_linear_distance_0': 0.1524822695035461, 'CPD_kernel_rbf_distance_0': -0.5, 'CPD_kernel_cosine_distance_0': -0.49645390070921985}
structural_breakpoint    False
Name: 1, dtype: bool
{'CPD_kernel_linear_distance_0': -0.2854368932038835, 'CPD_kernel_rbf_distance_0': -0.5009708737864078, 'CPD_kernel_cosine_distance_0': -0.007766990291262136}
structural_breakpoint    True
Name: 2, dtype: bool
{'CPD_kernel_linear_distance_0': -0.5007949125596184, 'CPD_kernel_rbf_distance_0': -0.5007949125596184, 'CPD_kernel_cosine_distance_0': 0.44038155802861684}
structural_breakpoint    False
Name: 3, dtype: bool
{'CPD_kernel_linear_distance_0': 0.4473684210526316, 'CPD_kernel_rbf_distance_0': -0.4934210526315789, 'CPD_kernel_cosine_distance_0': 0.4473684210526316}
structural_breakpoint    False
Name: 4

In [42]:
for idx in range(10):
    feats = ruptures_bottomup(X_train.loc[idx])
    print(feats)
    print(y_train.loc[idx])

Error occurred: Not enough dimensions
{'CPD_bottomup_l1_distance': 0.003401360544217687, 'CPD_bottomup_l2_distance': 0.003401360544217687, 'CPD_bottomup_rbf_distance': 0.003401360544217687, 'CPD_bottomup_linear_distance': 1.0, 'CPD_bottomup_normal_distance': 0.003401360544217687, 'CPD_bottomup_ar_distance': 0.003401360544217687}
structural_breakpoint    False
Name: 0, dtype: bool
Error occurred: Not enough dimensions
{'CPD_bottomup_l1_distance': -0.0070921985815602835, 'CPD_bottomup_l2_distance': -0.0070921985815602835, 'CPD_bottomup_rbf_distance': -0.0070921985815602835, 'CPD_bottomup_linear_distance': 1.0, 'CPD_bottomup_normal_distance': -0.0070921985815602835, 'CPD_bottomup_ar_distance': -0.0070921985815602835}
structural_breakpoint    False
Name: 1, dtype: bool
Error occurred: Not enough dimensions
{'CPD_bottomup_l1_distance': 0.0, 'CPD_bottomup_l2_distance': 0.0, 'CPD_bottomup_rbf_distance': 0.0, 'CPD_bottomup_linear_distance': 1.0, 'CPD_bottomup_normal_distance': 0.0, 'CPD_bottom

In [40]:
for idx in range(10):
    feats = ruptures_window(X_train.loc[idx])
    print(feats)
    print(y_train.loc[idx])

Error occurred: Not enough dimensions
{'CPD_window_l1_distance': 0.4523809523809524, 'CPD_window_l2_distance': 0.391156462585034, 'CPD_window_rbf_distance': 1.0, 'CPD_window_linear_distance': 1.0, 'CPD_window_normal_distance': -0.35034013605442177, 'CPD_window_ar_distance': -0.05782312925170068}
structural_breakpoint    False
Name: 0, dtype: bool
Error occurred: Not enough dimensions
{'CPD_window_l1_distance': -0.44680851063829785, 'CPD_window_l2_distance': -0.44680851063829785, 'CPD_window_rbf_distance': 1.0, 'CPD_window_linear_distance': 1.0, 'CPD_window_normal_distance': 0.7127659574468085, 'CPD_window_ar_distance': 0.7340425531914894}
structural_breakpoint    False
Name: 1, dtype: bool
Error occurred: Not enough dimensions
{'CPD_window_l1_distance': 0.537864077669903, 'CPD_window_l2_distance': 0.6951456310679611, 'CPD_window_rbf_distance': 1.0, 'CPD_window_linear_distance': 1.0, 'CPD_window_normal_distance': 0.6524271844660194, 'CPD_window_ar_distance': 0.7223300970873786}
structur