In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels as sm
import statsmodels.tsa.api as tsa
import antropy

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import os
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import logging
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. read dataset
# Reads X_train and y_train from parquet files addressed relative to the
# notebook's working directory; the two prints only bracket the load so the
# cell output shows when reading started and finished.
print("Loading data...")
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
print("Data loaded.")

Loading data...
Data loaded.


In [42]:
from tsfresh.feature_extraction import feature_calculators as tsfresh_fe

In [44]:
# --- 10. tsfresh ---
# @feature
def tsfresh_features(u: pd.DataFrame) -> dict:
    """tsfresh-based feature engineering: per-calculator difference between the two periods.

    The frame is split on its ``period`` column into the ``value`` series for
    period 0 (``s1``) and period 1 (``s2``). Every configured tsfresh feature
    calculator is evaluated on both series and ``f(s2) - f(s1)`` is stored
    under a name built from the calculator name and its parameters.

    Args:
        u: DataFrame with a ``value`` column and a ``period`` column; rows
            with ``period == 0`` and ``period == 1`` form the two segments.

    Returns:
        dict mapping feature name to a Python ``float``. A feature whose
        computation raised, or whose difference is NaN, is reported as ``0.0``.
    """
    s1 = u['value'][u['period'] == 0].to_numpy()
    s2 = u['value'][u['period'] == 1].to_numpy()
    feats = {}

    # Calculator -> parameter list. ``None`` means the calculator takes only
    # the series; a scalar entry is passed positionally; a dict entry is either
    # a combiner parameter or a set of keyword arguments (resolved at call time).
    funcs = {
        tsfresh_fe.ratio_value_number_to_time_series_length: None,
        tsfresh_fe.ratio_beyond_r_sigma: [6, 1.5],
        tsfresh_fe.quantile: [0.6, 0.4, 0.1],
        tsfresh_fe.percentage_of_reoccurring_values_to_all_values: None,
        tsfresh_fe.percentage_of_reoccurring_datapoints_to_all_datapoints: None,
        tsfresh_fe.last_location_of_maximum: None,
        tsfresh_fe.first_location_of_maximum: None,
        tsfresh_fe.partial_autocorrelation: [{"lag": 2}],
        tsfresh_fe.linear_trend: [{"attr": "slope"}, {"attr": "rvalue"}, {"attr": "intercept"}],
        tsfresh_fe.fft_coefficient: [{"coeff": 3, "attr": "imag"}, {"coeff": 2, "attr": "imag"}, {"coeff": 1, "attr": "imag"}],
        tsfresh_fe.change_quantiles: [
            {"f_agg": "var", "isabs": True,  "qh": 1.0, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 1.0, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.6},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 0.8, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.6, "ql": 0.4},
            {"f_agg": "var", "isabs": True,  "qh": 0.6, "ql": 0.2},
            {"f_agg": "var", "isabs": True,  "qh": 0.4, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 1.0, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 1.0, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.8, "ql": 0.0},
            {"f_agg": "var", "isabs": False, "qh": 0.6, "ql": 0.4},
            {"f_agg": "var", "isabs": False, "qh": 0.6, "ql": 0.2},
            {"f_agg": "var", "isabs": False, "qh": 0.4, "ql": 0.2},
            {"f_agg": "mean","isabs": True,  "qh": 1.0, "ql": 0.4},
            {"f_agg": "mean","isabs": True,  "qh": 0.6, "ql": 0.4},
        ],
        tsfresh_fe.ar_coefficient: [{"coeff": 2, "k": 10}],
        tsfresh_fe.agg_linear_trend: [
            {"attr": "slope", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "slope", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "slope", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 50, "f_agg": "max"},
            {"attr": "rvalue", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 5,  "f_agg": "max"},
            {"attr": "rvalue", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "rvalue", "chunk_len": 10, "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 50, "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 50, "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 5,  "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 5,  "f_agg": "max"},
            {"attr": "intercept", "chunk_len": 10, "f_agg": "mean"},
            {"attr": "intercept", "chunk_len": 10, "f_agg": "max"},
        ]
    }

    def param_to_str(param):
        """Render a parameter as a name suffix: ``k_v`` pairs for a dict, ``str`` otherwise."""
        if isinstance(param, dict):
            return '_'.join([f"{k}_{v}" for k, v in param.items()])
        else:
            return str(param)

    def cal_func_diff_as_feature(func, param=None):
        """Return ``{feature_name: func(s2) - func(s1)}`` for one calculator/parameter pair.

        Any failure while evaluating the calculator yields NaN for that
        feature instead of propagating, so one bad calculator cannot abort
        the whole extraction.
        """
        if param is None:
            # Simple function: return a scalar
            try:
                return {func.__name__: func(s2) - func(s1)}
            except Exception:
                return {func.__name__: np.nan}

        # Name used by every non-combiner path, including the failure paths.
        feat_name = f"{func.__name__}_{param_to_str(param)}"

        if isinstance(param, dict):
            try:
                # Combiner function: return a list of (name, value)
                s1_result = dict(func(s1, [param]))
                s2_result = dict(func(s2, [param]))
                # NOTE(review): on success the name carries the key returned by
                # the calculator (the recorded output shows keys containing
                # double quotes, e.g. linear_trend_attr_"slope"), while the
                # failure path below uses `feat_name`; confirm downstream
                # consumers tolerate both spellings.
                return {
                    f"{func.__name__}_{k}": s2_result[k] - s1_result[k]
                    for k in s1_result.keys()
                }
            except TypeError:
                # Simple function with multiple kwargs. The call is guarded:
                # an exception raised inside an `except` clause is not handled
                # by the sibling `except Exception` below, so without this
                # inner try a failing calculator would escape the function.
                try:
                    return {feat_name: func(s2, **param) - func(s1, **param)}
                except Exception:
                    return {feat_name: np.nan}
            except Exception:
                return {feat_name: np.nan}

        # Simple function with parameter
        try:
            return {feat_name: func(s2, param) - func(s1, param)}
        except Exception:
            return {feat_name: np.nan}

    for func, params in funcs.items():
        if params is None:
            feats.update(cal_func_diff_as_feature(func))
        else:
            for param in params:
                feats.update(cal_func_diff_as_feature(func, param))

    # NaN becomes 0.0 (not int 0) so every returned value has the same type.
    return {k: 0.0 if np.isnan(v) else float(v) for k, v in feats.items()}

In [None]:
# Scratch setup: rebuild, at module level and for sample 0, the same inputs
# that `tsfresh_features` builds internally, so the helper cells that follow
# can be run step by step.
u = X_train.loc[0]
s1 = u.loc[u['period'] == 0, 'value'].to_numpy()
s2 = u.loc[u['period'] == 1, 'value'].to_numpy()
feats = {}
from tsfresh.feature_extraction import feature_calculators as tsfresh_fe
funcs = {
    # None -> calculator takes only the series.
    tsfresh_fe.ratio_value_number_to_time_series_length: None,
    tsfresh_fe.ratio_beyond_r_sigma: [6, 1.5],
    tsfresh_fe.quantile: [0.6, 0.4, 0.1],
    tsfresh_fe.percentage_of_reoccurring_values_to_all_values: None,
    tsfresh_fe.percentage_of_reoccurring_datapoints_to_all_datapoints: None,
    tsfresh_fe.last_location_of_maximum: None,
    tsfresh_fe.first_location_of_maximum: None,
    tsfresh_fe.partial_autocorrelation: [{"lag": 2}],
    tsfresh_fe.linear_trend: [
        {"attr": attr} for attr in ("slope", "rvalue", "intercept")
    ],
    tsfresh_fe.fft_coefficient: [
        {"coeff": coeff, "attr": "imag"} for coeff in (3, 2, 1)
    ],
    # One row per (f_agg, isabs, qh, ql) combination, in evaluation order.
    tsfresh_fe.change_quantiles: [
        {"f_agg": f_agg, "isabs": isabs, "qh": qh, "ql": ql}
        for f_agg, isabs, qh, ql in (
            ("var", True, 1.0, 0.4),
            ("var", True, 1.0, 0.2),
            ("var", True, 0.8, 0.6),
            ("var", True, 0.8, 0.4),
            ("var", True, 0.8, 0.2),
            ("var", True, 0.6, 0.4),
            ("var", True, 0.6, 0.2),
            ("var", True, 0.4, 0.2),
            ("var", False, 1.0, 0.4),
            ("var", False, 1.0, 0.2),
            ("var", False, 0.8, 0.4),
            ("var", False, 0.8, 0.2),
            ("var", False, 0.8, 0.0),
            ("var", False, 0.6, 0.4),
            ("var", False, 0.6, 0.2),
            ("var", False, 0.4, 0.2),
            ("mean", True, 1.0, 0.4),
            ("mean", True, 0.6, 0.4),
        )
    ],
    tsfresh_fe.ar_coefficient: [{"coeff": 2, "k": 10}],
    # slope is aggregated with mean only; rvalue and intercept with mean and max.
    tsfresh_fe.agg_linear_trend: [
        {"attr": attr, "chunk_len": chunk_len, "f_agg": f_agg}
        for attr, aggs in (
            ("slope", ("mean",)),
            ("rvalue", ("mean", "max")),
            ("intercept", ("mean", "max")),
        )
        for chunk_len in (50, 5, 10)
        for f_agg in aggs
    ],
}

def param_to_str(param):
    """Render a calculator parameter as a feature-name suffix.

    A dict becomes its ``key_value`` pairs joined with underscores, in
    insertion order; any other value is rendered with ``str``.
    """
    if not isinstance(param, dict):
        return str(param)
    pieces = [f"{key}_{value}" for key, value in param.items()]
    return '_'.join(pieces)

def cal_func_diff_as_feature(func, param=None):
    """Return ``{feature_name: func(s2) - func(s1)}`` for one calculator/parameter pair.

    Reads the module-level series ``s1`` and ``s2`` and uses the module-level
    ``param_to_str`` to build names.

    Args:
        func: feature calculator; its ``__name__`` prefixes the feature name.
        param: ``None`` for a calculator that takes only the series, a dict
            for a combiner parameter or a set of keyword arguments, or any
            other value to be passed as a single positional argument.

    Returns:
        dict with one entry per produced feature. Any failure while
        evaluating the calculator yields NaN for that feature instead of
        propagating.
    """
    if param is None:
        # Simple function: return a scalar
        try:
            return {func.__name__: func(s2) - func(s1)}
        except Exception:
            return {func.__name__: np.nan}

    # Name used by every non-combiner path, including the failure paths.
    feat_name = f"{func.__name__}_{param_to_str(param)}"

    if isinstance(param, dict):
        try:
            # Combiner function: return a list of (name, value)
            s1_result = dict(func(s1, [param]))
            s2_result = dict(func(s2, [param]))
            return {
                f"{func.__name__}_{k}": s2_result[k] - s1_result[k]
                for k in s1_result.keys()
            }
        except TypeError:
            # Simple function with multiple kwargs. The call is guarded: an
            # exception raised inside an `except` clause is not handled by the
            # sibling `except Exception` below, so without this inner try a
            # failing calculator would escape instead of producing NaN.
            try:
                return {feat_name: func(s2, **param) - func(s1, **param)}
            except Exception:
                return {feat_name: np.nan}
        except Exception:
            return {feat_name: np.nan}

    # Simple function with parameter
    try:
        return {feat_name: func(s2, param) - func(s1, param)}
    except Exception:
        return {feat_name: np.nan}

# Run every configured calculator on the scratch series and collect the raw
# (unconverted) differences; the bare `feats` at the end displays the dict.
for func, params in funcs.items():
    if params is not None:
        for param in params:
            feats.update(cal_func_diff_as_feature(func, param))
        continue
    # Calculator without parameters.
    feats.update(cal_func_diff_as_feature(func))
feats

{'ratio_value_number_to_time_series_length': 0.0,
 'ratio_beyond_r_sigma_6': np.float64(0.0),
 'ratio_beyond_r_sigma_1.5': np.float64(0.0362518064220507),
 'quantile_0.6': np.float64(-0.00031144062471214515),
 'quantile_0.4': np.float64(-9.904180621324071e-05),
 'quantile_0.1': np.float64(0.00048082909085806445),
 'percentage_of_reoccurring_values_to_all_values': np.float64(0.0),
 'percentage_of_reoccurring_datapoints_to_all_datapoints': np.float64(0.0),
 'last_location_of_maximum': np.float64(0.033590638327870015),
 'first_location_of_maximum': np.float64(0.030929470233689305),
 'partial_autocorrelation_lag_2': np.float64(0.08263325992208122),
 'linear_trend_attr_"slope"': np.float64(2.9216851441120013e-06),
 'linear_trend_attr_"rvalue"': np.float64(0.033569674624218254),
 'linear_trend_attr_"intercept"': np.float64(-0.0004052450886620991),
 'fft_coefficient_attr_"imag"__coeff_3': np.float64(0.003835018617629471),
 'fft_coefficient_attr_"imag"__coeff_2': np.float64(-0.0027590846675041

In [None]:
# Simple calculator calls
# NOTE(review): `x` is not assigned in any cell shown here; it has to be bound
# to the series under inspection before this cell can run.
ratio_value = tsfresh_fe.ratio_value_number_to_time_series_length(x)
ratio_beyond_6 = tsfresh_fe.ratio_beyond_r_sigma(x, 6)
ratio_beyond_1_5 = tsfresh_fe.ratio_beyond_r_sigma(x, 1.5)
quantile_0_6 = tsfresh_fe.quantile(x, 0.6)
quantile_0_4 = tsfresh_fe.quantile(x, 0.4)
quantile_0_1 = tsfresh_fe.quantile(x, 0.1)
reoccurring_values = tsfresh_fe.percentage_of_reoccurring_values_to_all_values(x)
reoccurring_datapoints = tsfresh_fe.percentage_of_reoccurring_datapoints_to_all_datapoints(x)
last_max = tsfresh_fe.last_location_of_maximum(x)
first_max = tsfresh_fe.first_location_of_maximum(x)

# Combiner calculator calls (each takes a list of parameter dicts)
partial_autocorr = tsfresh_fe.partial_autocorrelation(x, [{"lag": 2}])
linear_trend_slope = tsfresh_fe.linear_trend(x, [{"attr": "slope"}])
linear_trend_rvalue = tsfresh_fe.linear_trend(x, [{"attr": "rvalue"}])
linear_trend_intercept = tsfresh_fe.linear_trend(x, [{"attr": "intercept"}])
fft_coeff_imag_3 = tsfresh_fe.fft_coefficient(x, [{"coeff": 3, "attr": "imag"}])
fft_coeff_imag_2 = tsfresh_fe.fft_coefficient(x, [{"coeff": 2, "attr": "imag"}])
fft_coeff_imag_1 = tsfresh_fe.fft_coefficient(x, [{"coeff": 1, "attr": "imag"}])

# 1. change_quantiles features: one row per (f_agg, isabs, qh, ql) combination
change_quantile_features = [
    {"f_agg": f_agg, "isabs": isabs, "qh": qh, "ql": ql}
    for f_agg, isabs, qh, ql in (
        ("var", True, 1.0, 0.4),
        ("var", True, 1.0, 0.2),
        ("var", True, 0.8, 0.6),
        ("var", True, 0.8, 0.4),
        ("var", True, 0.8, 0.2),
        ("var", True, 0.6, 0.4),
        ("var", True, 0.6, 0.2),
        ("var", True, 0.4, 0.2),
        ("var", False, 1.0, 0.4),
        ("var", False, 1.0, 0.2),
        ("var", False, 0.8, 0.4),
        ("var", False, 0.8, 0.2),
        ("var", False, 0.8, 0.0),
        ("var", False, 0.6, 0.4),
        ("var", False, 0.6, 0.2),
        ("var", False, 0.4, 0.2),
        ("mean", True, 1.0, 0.4),
        ("mean", True, 0.6, 0.4),
    )
]

# Results are keyed by the position of the configuration in the list above.
change_quantile_results = {
    f"change_quantiles_{i}": tsfresh_fe.change_quantiles(x, **cfg)
    for i, cfg in enumerate(change_quantile_features)
}

# 2. ar_coefficient features
ar_coeff_params = [{"coeff": 2, "k": 10}]
ar_coeff_result = dict(tsfresh_fe.ar_coefficient(x, ar_coeff_params))

# 3. agg_linear_trend features: slope is aggregated with mean only,
#    rvalue and intercept with both mean and max, for chunk lengths 50, 5, 10
agg_linear_trend_params = [
    {"attr": attr, "chunk_len": chunk_len, "f_agg": f_agg}
    for attr, aggs in (
        ("slope", ("mean",)),
        ("rvalue", ("mean", "max")),
        ("intercept", ("mean", "max")),
    )
    for chunk_len in (50, 5, 10)
    for f_agg in aggs
]

agg_linear_trend_result = dict(tsfresh_fe.agg_linear_trend(x, agg_linear_trend_params))

In [45]:
# Spot-check: for index labels 0..14, print the feature dict returned by
# `tsfresh_features` followed by the matching `y_train` entry, so features and
# the recorded label can be compared by eye.
for idx in range(15):
    feats = tsfresh_features(X_train.loc[idx])
    print(feats)
    print(y_train.loc[idx])

{'ratio_value_number_to_time_series_length': 0.0, 'ratio_beyond_r_sigma_6': 0.0, 'ratio_beyond_r_sigma_1.5': 0.0362518064220507, 'quantile_0.6': -0.00031144062471214515, 'quantile_0.4': -9.904180621324071e-05, 'quantile_0.1': 0.00048082909085806445, 'percentage_of_reoccurring_values_to_all_values': 0.0, 'percentage_of_reoccurring_datapoints_to_all_datapoints': 0.0, 'last_location_of_maximum': 0.033590638327870015, 'first_location_of_maximum': 0.030929470233689305, 'partial_autocorrelation_lag_2': 0.08263325992208122, 'linear_trend_attr_"slope"': 2.9216851441120013e-06, 'linear_trend_attr_"rvalue"': 0.033569674624218254, 'linear_trend_attr_"intercept"': -0.0004052450886620991, 'fft_coefficient_attr_"imag"__coeff_3': 0.003835018617629471, 'fft_coefficient_attr_"imag"__coeff_2': -0.002759084667504113, 'fft_coefficient_attr_"imag"__coeff_1': 0.004651045903185186, 'change_quantiles_f_agg_var_isabs_True_qh_1.0_ql_0.4': -5.036431009326008e-07, 'change_quantiles_f_agg_var_isabs_True_qh_1.0_ql_