In [None]:
import logging
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
import scipy.signal
import scipy.stats
from joblib import Parallel, delayed
from scipy.stats import theilslopes
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.ar_model import AutoReg
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 1. Load the training features and labels from their parquet files.
print("Loading data...")
X_train, y_train = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('../data/X_train.parquet', '../data/y_train.parquet')
)
print("Data loaded.")

Loading data...
Data loaded.


In [6]:
import pywt

In [7]:
# @feature
def stft_features(u: pd.DataFrame) -> dict:
    """Compare the total STFT energy of the two periods of one series.

    Args:
        u: Frame with a ``value`` column and a ``period`` column. Rows where
            ``period == 0`` form the first segment, rows where ``period == 1``
            form the second.

    Returns:
        Dict of floats with two keys:

        * ``stft_energy_diff``: second-period energy minus first-period energy.
        * ``stft_energy_ratio``: second-period energy divided by first-period
          energy (1e-6 is added to the denominator to avoid division by zero).

        NaN results are reported as 0.0. If the transform raises, a warning is
        logged and the neutral pair (diff 0.0, ratio 1.0) is returned.
    """
    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()

    def get_stft_energy(signal):
        """Sum of squared STFT magnitudes; 0.0 for signals shorter than 4 samples."""
        if len(signal) < 4:
            return 0.0
        # The window is capped at 64 samples and shrinks for shorter signals.
        _, _, Zxx = scipy.signal.stft(signal, nperseg=min(len(signal), 64))
        return np.sum(np.abs(Zxx) ** 2)

    try:
        e1 = get_stft_energy(s1)
        e2 = get_stft_energy(s2)
        feats = {
            'stft_energy_diff': e2 - e1,
            'stft_energy_ratio': e2 / (e1 + 1e-6),
        }
    except Exception as exc:
        # Best-effort extraction: keep the pipeline running, but make the
        # failure visible and let KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).warning(
            "stft_features failed, using neutral fallback: %s", exc
        )
        feats = {'stft_energy_diff': 0.0, 'stft_energy_ratio': 1.0}

    # Always hand back plain Python floats, mapping NaN to 0.0.
    return {k: float(v) if not np.isnan(v) else 0.0 for k, v in feats.items()}

# @feature
def wavelet_features(u: pd.DataFrame) -> dict:
    """Compare the wavelet-decomposition energy of the two periods of one series.

    Args:
        u: Frame with a ``value`` column and a ``period`` column. Rows where
            ``period == 0`` form the first segment, rows where ``period == 1``
            form the second.

    Returns:
        Dict of floats with three keys:

        * ``wavelet_approx_energy_diff``: approximation-coefficient energy of
          the second period minus that of the first.
        * ``wavelet_detail_energy_diff``: summed detail-coefficient energy of
          the second period minus that of the first.
        * ``wavelet_energy_ratio``: total (approximation + detail) energy of the
          second period divided by that of the first (1e-6 is added to the
          denominator to avoid division by zero).

        NaN results are reported as 0.0. If the decomposition raises, a warning
        is logged and the neutral values (0.0, 0.0, 1.0) are returned.
    """
    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()

    def wavelet_energy(signal, wavelet='db2', level=2):
        """Return (approximation energy, detail energy); zeros for < 4 samples."""
        if len(signal) < 4:
            return 0.0, 0.0
        coeffs = pywt.wavedec(signal, wavelet, level=level)
        # coeffs[0] holds the approximation coefficients, the rest are details.
        approx_energy = np.sum(np.square(coeffs[0]))
        detail_energy = sum(np.sum(np.square(c)) for c in coeffs[1:])
        return approx_energy, detail_energy

    try:
        a1, d1 = wavelet_energy(s1)
        a2, d2 = wavelet_energy(s2)
        feats = {
            'wavelet_approx_energy_diff': a2 - a1,
            'wavelet_detail_energy_diff': d2 - d1,
            'wavelet_energy_ratio': (a2 + d2) / (a1 + d1 + 1e-6),
        }
    except Exception as exc:
        # Best-effort extraction: keep the pipeline running, but make the
        # failure visible and let KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).warning(
            "wavelet_features failed, using neutral fallback: %s", exc
        )
        feats = {
            'wavelet_approx_energy_diff': 0.0,
            'wavelet_detail_energy_diff': 0.0,
            'wavelet_energy_ratio': 1.0,
        }

    # Always hand back plain Python floats, mapping NaN to 0.0.
    return {k: float(v) if not np.isnan(v) else 0.0 for k, v in feats.items()}


In [8]:
# Spot-check: show the wavelet features next to the label for series ids 0..14.
for idx in range(15):
    feats = wavelet_features(X_train.loc[idx])
    print(feats, y_train.loc[idx], sep='\n')

{'wavelet_approx_energy_diff': -0.006383498584625189, 'wavelet_detail_energy_diff': -0.04511156845154803, 'wavelet_energy_ratio': 0.2217095318599085}
structural_breakpoint    False
Name: 0, dtype: bool
{'wavelet_approx_energy_diff': -0.003571552984657197, 'wavelet_detail_energy_diff': -0.009618177013221756, 'wavelet_energy_ratio': 0.08187908356240722}
structural_breakpoint    False
Name: 1, dtype: bool
{'wavelet_approx_energy_diff': -0.09000022461104912, 'wavelet_detail_energy_diff': -0.3313058260279748, 'wavelet_energy_ratio': 0.3929492333778162}
structural_breakpoint    True
Name: 2, dtype: bool
{'wavelet_approx_energy_diff': -0.024666217581779976, 'wavelet_detail_energy_diff': -0.08492856011343104, 'wavelet_energy_ratio': 0.33112258844764514}
structural_breakpoint    False
Name: 3, dtype: bool
{'wavelet_approx_energy_diff': -0.007531739927623777, 'wavelet_detail_energy_diff': -0.005416813616362368, 'wavelet_energy_ratio': 0.2906499916979219}
structural_breakpoint    False
Name: 4, d