In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels as sm
import antropy

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from joblib import Parallel, delayed
from tqdm.auto import tqdm
import logging
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import statsmodels.tsa.api as tsa

In [3]:
# 1. read dataset
print("Loading data...")
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
print("Data loaded.")

Loading data...
Data loaded.


In [None]:
# --- 1. 分布统计特征-更新 ---
# @feature
def more_distributional_stats(u: pd.DataFrame) -> dict:
    s1 = u['value'][u['period'] == 0]
    s2 = u['value'][u['period'] == 1]
    feats = {}
    
    # Mann-Whitney U检验（非参数版的t检验）
    mw_stat, mw_pvalue = scipy.stats.mannwhitneyu(s1, s2, alternative='two-sided')
    feats['mw_pvalue'] = -mw_pvalue if not np.isnan(mw_pvalue) else 0

    # Wilcoxon符号秩检验（适用于配对样本）
    w_stat, w_pvalue = scipy.stats.wilcoxon(s1[-len(s2):], s2)
    feats['w_pvalue'] = -w_pvalue if not np.isnan(w_pvalue) else 0

    # 方差齐性检验（Levene's test）
    lev_stat, lev_pvalue = scipy.stats.levene(s1, s2)
    feats['lev_pvalue'] = -lev_pvalue if not np.isnan(lev_pvalue) else 0

    # 自相关特征
    def autocorr_features(s, max_lag=5):
        return {f'acf_lag_{i}': tsa.stattools.acf(s, nlags=i)[-1] for i in range(1, max_lag+1)}
    feats.update({f'acf1_{k}': v for k, v in autocorr_features(s1).items()})
    feats.update({f'acf2_{k}': v for k, v in autocorr_features(s2).items()})
    print(feats)
    feats['acf_diff'] = feats['acf2_acf_lag_1'] - feats['acf1_acf_lag_1']

    # 单位根检验（平稳性检验）
    adf1 = tsa.adfuller(s1)[1]
    adf2 = tsa.adfuller(s2)[1]
    feats['adf_diff'] = adf2 - adf1

    # 分位数比较
    for q in [0.1, 0.25, 0.5, 0.75, 0.9]:
        feats[f'q{q}_diff'] = s2.quantile(q) - s1.quantile(q)

    # 异常值检测
    def outlier_ratio(s, threshold=3):
        z = np.abs(scipy.stats.zscore(s))
        return np.mean(z > threshold)
    feats['outlier_ratio_diff'] = outlier_ratio(s2) - outlier_ratio(s1)

    return {k: float(v) if not np.isnan(v) else 0 for k, v in feats.items()}

In [13]:
for idx in range(15):
    feats = more_distributional_stats(X_train.loc[idx])
    print(feats)
    print(y_train.loc[idx])

{'mw_pvalue': np.float64(-0.9919468956184949), 'w_pvalue': np.float64(-0.8459257251469343), 'lev_pvalue': np.float64(-0.7580245444471796), 'acf1_acf_lag_1': np.float64(-0.15594736318987157), 'acf1_acf_lag_2': np.float64(-0.1255051731969833), 'acf1_acf_lag_3': np.float64(-0.04874564371686685), 'acf1_acf_lag_4': np.float64(-0.03239862325422635), 'acf1_acf_lag_5': np.float64(-0.06347647784805939), 'acf2_acf_lag_1': np.float64(-0.09180233405573138), 'acf2_acf_lag_2': np.float64(-0.06164797360083063), 'acf2_acf_lag_3': np.float64(-0.09556167360062365), 'acf2_acf_lag_4': np.float64(-0.11136261276715236), 'acf2_acf_lag_5': np.float64(-0.06916865253729489)}


AttributeError: module 'statsmodels.tsa.statespace.tools' has no attribute 'cusum_squares'