In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.ar_model import AutoReg
from scipy.stats import theilslopes
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import logging
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 1. read dataset
print("Loading data...")
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
print("Data loaded.")

Loading data...
Data loaded.


In [3]:
import ruptures as rpt

In [None]:
# --- 8. change point detection with ruptures ---
# @feature
def ruptures_dynp_feature(u: pd.DataFrame) -> dict:
    """使用Ruptures的Dynp检测器计算最显著断点位置与中点的差异"""
    signal = u['value'].to_numpy()
    # print(signal)
    n = len(signal)
    # print(n)
    feats = {}

    try:
        n_bkps = 1
        model = rpt.Dynp(model="l2", min_size=np.sum(u['period'] == 1), jump=1).fit(signal)
        bkps = model.predict(n_bkps=n_bkps)  # 只找1个断点
        # print(bkps)
        
        # print(cp)
        middle = np.sum(u['period'] == 0)
        # print(middle)
        for i in range(n_bkps):
            cp = bkps[i] if bkps else n
            feats[f'dynp_cp_distance_{i}'] = abs(cp - middle) / n  # 归一化后的变点位置偏移
    except Exception as e:
        # print(f"Error occurred: {e}")
        feats['dynp_cp_distance'] = 1.0  # 最大偏移

    return feats

In [13]:
# @feature
def ruptures_pelt_feature(u: pd.DataFrame) -> dict:
    """使用Ruptures的Pelt检测器计算变点位置与boundary的差异"""
    signal = u['value'].to_numpy()
    # n = len(signal)
    feats = {}

    try:
        pen = 3
        model = rpt.Pelt(model="l2", min_size=np.sum(u['period'] == 1)-88, jump=1).fit(signal)
        bkps = model.predict(pen=pen)
        boundary = np.sum(u['period'] == 0)
        for i, cp in enumerate(bkps):
            feats[f'pelt_cp_distance_{i}'] = cp - boundary #/ n
    except Exception as e:
        feats['pelt_cp_distance_0'] = 1.0

    return feats

In [14]:
for idx in range(15):
    feats = ruptures_pelt_feature(X_train.loc[idx])
    print(feats)
    print(y_train.loc[idx])

{'pelt_cp_distance_0': np.int64(294)}
structural_breakpoint    False
Name: 0, dtype: bool
{'pelt_cp_distance_0': np.int64(282)}
structural_breakpoint    False
Name: 1, dtype: bool
{'pelt_cp_distance_0': np.int64(515)}
structural_breakpoint    True
Name: 2, dtype: bool
{'pelt_cp_distance_0': np.int64(629)}
structural_breakpoint    False
Name: 3, dtype: bool
{'pelt_cp_distance_0': np.int64(456)}
structural_breakpoint    False
Name: 4, dtype: bool
{'pelt_cp_distance_0': np.int64(682)}
structural_breakpoint    False
Name: 5, dtype: bool
{'pelt_cp_distance_0': np.int64(968)}
structural_breakpoint    True
Name: 6, dtype: bool
{'pelt_cp_distance_0': np.int64(751)}
structural_breakpoint    False
Name: 7, dtype: bool
{'pelt_cp_distance_0': np.int64(302)}
structural_breakpoint    False
Name: 8, dtype: bool
{'pelt_cp_distance_0': np.int64(754)}
structural_breakpoint    False
Name: 9, dtype: bool
{'pelt_cp_distance_0': np.int64(410)}
structural_breakpoint    False
Name: 10, dtype: bool
{'pelt_cp_d