In [None]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels as sm
import statsmodels.tsa.api as tsa
import antropy

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import os
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import logging
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

In [None]:
# Step 1: read the two training tables from parquet, in the same order as
# the names they are bound to (X_train first, then y_train).
print("Loading data...")
X_train, y_train = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('../data/X_train.parquet', '../data/y_train.parquet')
)
print("Data loaded.")

In [None]:
# Bare expression: the notebook renders `y_train` as this cell's output.
y_train

In [None]:
# Parquet file holding the previously extracted feature table.
extracted_path = '../feature_dfs/features-tsfresh_autoextract.parquet'
if not os.path.exists(extracted_path):
    # Fail here, naming the missing path, instead of silently skipping the
    # load and letting later cells hit a NameError on `extracted_features`.
    raise FileNotFoundError(f"Extracted features file not found: {extracted_path}")

extracted_features = pd.read_parquet(extracted_path)
print(f"Loaded existing extracted features: {extracted_features.shape}")

In [None]:
# Bare expression: the notebook renders `extracted_features` as this cell's output.
extracted_features

In [None]:
def compute_period_diff_features(extracted_features: pd.DataFrame) -> pd.DataFrame:
    """Subtract each series' period-0 features from its period-1 features.

    Row labels of ``extracted_features`` must be strings of the form
    ``"<raw_id>_<period>"``, where ``raw_id`` is an integer and ``period`` is
    a single digit. Only periods 0 and 1 are used; rows belonging to any other
    period are ignored.

    Parameters
    ----------
    extracted_features : pd.DataFrame
        Feature table with one row per ``"<raw_id>_<period>"`` label. The
        index may be named or unnamed. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Period 1 minus period 0, one row per ``raw_id``. The index is the
        integer ``raw_id`` (named ``'raw_id'``, sorted ascending) and the
        columns are the input's feature columns. A ``raw_id`` present in only
        one of the two periods yields a row of NaN.

    Raises
    ------
    ValueError
        If a row label does not match ``"<raw_id>_<period>"`` or its
        ``raw_id`` part is not an integer.
    """
    # Parse the row labels straight from the index. Going through
    # reset_index() only yields a column literally called 'index' when the
    # index is unnamed, so a named index would fail with a KeyError.
    labels = pd.Series(extracted_features.index)
    parsed = labels.str.extract(r'^(.*)_(\d)$')

    # Both capture groups are NaN together when a label does not match.
    unmatched = parsed[0].isna()
    if unmatched.any():
        first_bad = labels[unmatched].iloc[0]
        raise ValueError(
            f"Row label {first_bad!r} does not match '<raw_id>_<period>'"
        )

    raw_id = parsed[0].astype(int).to_numpy()
    period = parsed[1].astype(int).to_numpy()

    def _rows_for(period_value: int) -> pd.DataFrame:
        # Feature rows of a single period, re-keyed by integer raw_id.
        mask = period == period_value
        rows = extracted_features.loc[mask].copy()
        rows.index = pd.Index(raw_id[mask], name='raw_id')
        return rows

    # DataFrame subtraction aligns on the index, so rows are paired by raw_id
    # regardless of their order in the input.
    return _rows_for(1).subtract(_rows_for(0)).sort_index()

# Build the period-1-minus-period-0 feature table from `extracted_features`
# and print its shape.
diff_features = compute_period_diff_features(extracted_features)
print(diff_features.shape)

In [None]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

# The return value is discarded, so this relies on impute() modifying
# `extracted_features` in place (as tsfresh documents).
impute(extracted_features)

# `y_train` comes from pd.read_parquet and is therefore a DataFrame, but
# select_features requires the target to be a pandas Series or numpy array.
# squeeze("columns") collapses it to a Series only when it has exactly one
# column; anything else is passed through unchanged.
y_target = y_train.squeeze("columns") if isinstance(y_train, pd.DataFrame) else y_train

# NOTE(review): compute_period_diff_features parses the row labels of
# `extracted_features` as '<raw_id>_<period>' strings. select_features expects
# X and y to have the same length and index, so confirm these rows actually
# line up with y_train (TODO confirm).
features_filtered = select_features(extracted_features, y_target)