In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.ar_model import AutoReg
from scipy.stats import theilslopes
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import logging
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this line is suppressed for the whole kernel session.
# NOTE(review): this presumably also hides deprecation / numerical warnings
# from the libraries imported above -- consider narrowing the filter.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 1. Read the dataset: training features first, then the training labels.
print("Loading data...")
X_train, y_train = [
    pd.read_parquet(parquet_path)
    for parquet_path in ('../data/X_train.parquet', '../data/y_train.parquet')
]
print("Data loaded.")

Loading data...
Data loaded.


In [None]:
# --- 7. Time-series modelling: compare AR-model residuals and AIC between period=0 and period=1 ---
# @feature
def ar_model_residual_features(u: pd.DataFrame) -> dict:
    """Compare autoregressive fits of the two periods of one series.

    The rows of ``u`` are split on ``u['period']`` (0 vs. 1) and an
    ``AutoReg`` model with 10 lags is fitted to the ``'value'`` column of
    each part.

    Parameters
    ----------
    u : pd.DataFrame
        Must provide a ``'value'`` column and a ``'period'`` column whose
        entries compare equal to 0 or 1.

    Returns
    -------
    dict
        Three entries, each a plain Python ``float``:

        * ``'ar_resid_std_diff'`` -- residual standard deviation of the
          period-1 fit minus that of the period-0 fit.
        * ``'ar_aic_diff'`` -- AIC of the period-1 fit minus AIC of the
          period-0 fit.
          NOTE(review): AIC is not normalised by sample size, so this
          difference presumably also reflects differing segment lengths --
          confirm that is intended.
        * ``'ar_predict_mse'`` -- mean squared error of one-step-ahead
          predictions of the period-1 values made with the period-0
          coefficients.

        Any feature that cannot be computed (segment too short, fit or
        prediction failure, NaN result) is reported as ``0.0``.
    """
    s1 = u['value'][u['period'] == 0].reset_index(drop=True)
    s2 = u['value'][u['period'] == 1].reset_index(drop=True)
    feats = {}

    def fit_ar(s, lags=10):
        """Fit AutoReg(lags) to ``s``; return None if too short or the fit raises."""
        if len(s) <= lags + 1:
            return None
        try:
            return AutoReg(s, lags=lags, old_names=False).fit()
        except Exception:
            # Deliberate best effort: a failed fit falls back to the 0.0 defaults.
            return None

    model1 = fit_ar(s1)
    model2 = fit_ar(s2)

    # Residual standard deviation and AIC differences need both fits.
    if model1 is not None and model2 is not None:
        feats['ar_resid_std_diff'] = model2.resid.std() - model1.resid.std()
        feats['ar_aic_diff'] = model2.aic - model1.aic
    else:
        feats['ar_resid_std_diff'] = 0.0
        feats['ar_aic_diff'] = 0.0

    # Use the period-0 fit to predict every period-1 observation one step ahead.
    if model1 is not None and len(s2) > 0:
        try:
            params = model1.params
            ar_lags = list(model1.model.ar_lags)
            max_lag = max(ar_lags)
            n_steps = len(s2)

            actual = np.asarray(s2.values, dtype=float)
            # The tail of period 0 seeds the lag window; after each step the
            # *observed* period-1 value (not the prediction) enters the
            # window, so the whole history is known up front.
            seed = np.asarray(s1.iloc[-max_lag:].values, dtype=float)
            timeline = np.concatenate([seed, actual])

            # Coefficients are looked up once instead of once per step.
            intercept = params['const'] if 'const' in params else 0.0
            preds = np.full(n_steps, intercept, dtype=float)
            for lag in ar_lags:
                # Step t reads timeline[max_lag + t - lag]; a shifted slice
                # yields that value for every t at once. Terms are added in
                # the order const, then each lag in ar_lags order.
                start = max_lag - lag
                preds += params[f'value.L{lag}'] * timeline[start:start + n_steps]

            feats['ar_predict_mse'] = np.mean((preds - actual) ** 2)
        except Exception as e:
            print(f"[WARN] Prediction error: {e}")
            feats['ar_predict_mse'] = 0.0
    else:
        feats['ar_predict_mse'] = 0.0

    # Always hand back Python floats; NaN results collapse to 0.0 (float, not int).
    return {k: (float(v) if not np.isnan(v) else 0.0) for k, v in feats.items()}

In [None]:
# Scratch cell: step through the AR feature computation for the rows selected
# by X_train.loc[0] and print the intermediate values.
u = X_train.loc[0]
s1 = u['value'][u['period'] == 0].reset_index(drop=True)
s2 = u['value'][u['period'] == 1].reset_index(drop=True)
feats = {}

def fit_ar(s, lags=10):
    """Fit AutoReg(lags) to ``s``; return None if too short or the fit raises."""
    if len(s) <= lags + 1:
        return None
    try:
        return AutoReg(s, lags=lags, old_names=False).fit()
    except Exception:
        return None

model1 = fit_ar(s1)
model2 = fit_ar(s2)
# fit_ar may return None, so only inspect the fit when it exists.
if model1 is not None:
    print(model1.params.index)
    print(model1.model.ar_lags)

# Residual standard deviation and AIC differences need both fits.
if model1 is not None and model2 is not None:
    feats['ar_resid_std_diff'] = model2.resid.std() - model1.resid.std()
    feats['ar_aic_diff'] = model2.aic - model1.aic
else:
    feats['ar_resid_std_diff'] = 0.0
    feats['ar_aic_diff'] = 0.0

# Use the period-0 fit to predict every period-1 observation one step ahead.
if model1 is not None and len(s2) > 0:
    max_lag = max(model1.model.ar_lags)
    # The tail of period 0 provides the initial lag window.
    history = s1.iloc[-max_lag:].tolist()
    print(history)
    preds = []

    for t in range(len(s2)):
        lagged_vals = history[-max_lag:]
        pred = model1.params['const'] if 'const' in model1.params else 0.0
        for lag in model1.model.ar_lags:
            # lagged_vals[-lag] is the value `lag` steps before the one predicted.
            pred += model1.params[f'value.L{lag}'] * lagged_vals[-lag]
        preds.append(pred)
        history.append(s2.iloc[t])  # roll forward with the observed value

    preds = np.array(preds)
    mse = np.mean((preds - s2.values[:len(preds)]) ** 2)
    feats['ar_predict_mse'] = mse
else:
    # Same fallback as ar_model_residual_features when no forecast is possible.
    feats['ar_predict_mse'] = 0.0

Index(['const', 'value.L1', 'value.L2', 'value.L3', 'value.L4', 'value.L5',
       'value.L6', 'value.L7', 'value.L8', 'value.L9', 'value.L10'],
      dtype='object')
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[-0.0021371704055048336, 0.0034444812357458145, -0.0007959912346892023, -0.00649693911808705, -0.005802325676278576, 0.003705333221070089, 0.006672214155667655, 0.0027958016406231757, 0.009656475695492998, 0.006687728106747254]


In [19]:
# Spot check: show the AR features of ids 0..14, each followed by its label row.
for idx in range(15):
    feats = ar_model_residual_features(X_train.loc[idx])
    print(feats, y_train.loc[idx], sep="\n")

{'ar_resid_std_diff': -0.00031172340251863374, 'ar_aic_diff': 7587.785153685026, 'ar_predict_mse': 4.68083510340371e-05}
structural_breakpoint    False
Name: 0, dtype: bool
{'ar_resid_std_diff': -0.0005283595784692581, 'ar_aic_diff': 17830.922671242755, 'ar_predict_mse': 4.176902626786393e-06}
structural_breakpoint    False
Name: 1, dtype: bool
{'ar_resid_std_diff': 0.005127777010880911, 'ar_aic_diff': 9907.481308450015, 'ar_predict_mse': 0.000525254776250473}
structural_breakpoint    True
Name: 2, dtype: bool
{'ar_resid_std_diff': 0.0009047255387457775, 'ar_aic_diff': 11181.509177588478, 'ar_predict_mse': 8.609038147388063e-05}
structural_breakpoint    False
Name: 3, dtype: bool
{'ar_resid_std_diff': 1.5165275062545367e-05, 'ar_aic_diff': 10722.561050052082, 'ar_predict_mse': 1.1594036510249491e-05}
structural_breakpoint    False
Name: 4, dtype: bool
{'ar_resid_std_diff': -0.0002536319162173596, 'ar_aic_diff': 3423.2176698377243, 'ar_predict_mse': 0.0003467299248636689}
structural_bre