# Собираем датасет

In [1]:
import pandas as pd
import numpy as np
import ta

import warnings
warnings.filterwarnings('ignore')

def add_ta_features_for_asset(df: pd.DataFrame, prefix: str, volume_col_override: str = None) -> pd.DataFrame:
    """Append `ta` technical-analysis indicators for one asset.

    The asset's OHLCV columns are looked up as ``{prefix}__open``,
    ``{prefix}__high``, ``{prefix}__low``, ``{prefix}__close`` and
    ``{prefix}__volume``. Every column produced by
    ``ta.add_all_ta_features`` is added to the result as ``{prefix}__{name}``.

    Parameters:
        df: source frame; it is copied, never modified in place.
        prefix: column prefix of the asset
                (e.g. "gold", "sp500", "spot_price_history").
        volume_col_override: full name of the volume column when it is not
                             ``{prefix}__volume``
                             (e.g. "spot_price_history__volume_usd" for BTC).

    Returns:
        A new DataFrame. If any required column is missing, a message is
        printed and an unchanged copy of ``df`` is returned.
    """
    df = df.copy()

    required = ['open', 'close', 'high', 'low', 'volume']
    col_map = {col: f"{prefix}__{col}" for col in required}

    # Allow the caller to point at a differently named volume column.
    if volume_col_override:
        col_map['volume'] = volume_col_override

    missing = [col_map[c] for c in required if col_map[c] not in df.columns]
    if missing:
        print(f"  Пропущены колонки для {prefix}: {missing}")
        return df

    # Work on a positional (RangeIndex) frame with the canonical column names
    # that `ta` is told to use below.
    temp_df = pd.DataFrame({
        'open': df[col_map['open']].values,
        'high': df[col_map['high']].values,
        'low': df[col_map['low']].values,
        'close': df[col_map['close']].values,
        'volume': df[col_map['volume']].values
    })

    temp_with_ta = ta.add_all_ta_features(
        temp_df,
        open="open", high="high", low="low", close="close", volume="volume",
        fillna=False
    )

    original_cols = {'open', 'high', 'low', 'close', 'volume'}
    ta_cols = [c for c in temp_with_ta.columns if c not in original_cols]

    # Build every new column first and attach them in a single concat:
    # inserting dozens of columns one by one fragments the frame.
    fresh = pd.DataFrame(
        {f"{prefix}__{col}": temp_with_ta[col].values for col in ta_cols},
        index=df.index,
    )

    # Columns that already exist are overwritten in place (keeps their position),
    # matching the previous assignment semantics.
    already_present = [name for name in fresh.columns if name in df.columns]
    for name in already_present:
        df[name] = fresh[name].values

    df = pd.concat([df, fresh.drop(columns=already_present)], axis=1)

    print(f"  Добавлено {len(ta_cols)} TA-фичей для {prefix}")

    return df


def add_lags(df: pd.DataFrame, cols: list, lags: tuple) -> pd.DataFrame:
    """Add lagged copies of the given columns.

    For every ``col`` in ``cols`` and every ``lag`` in ``lags`` a column named
    ``{col}__lag{lag}`` holding ``df[col].shift(lag)`` is produced.

    Parameters:
        df: source frame; it is copied, never modified in place.
        cols: names of the columns to lag.
        lags: shift sizes (in rows) to apply to each column.

    Returns:
        A new DataFrame with the lag columns appended after the existing ones.
        A lag column whose name already exists is overwritten in place.
    """
    out = df.copy()
    pending = {}  # brand-new lag columns, attached in one concat at the end

    for col in cols:
        for lag in lags:
            name = f"{col}__lag{lag}"
            # A source may itself be a lag column created earlier in this call.
            source = pending[col] if col in pending else out[col]
            shifted = source.shift(lag)
            if name in out.columns:
                # Existing column: overwrite where it is, as before.
                out[name] = shifted
            else:
                pending[name] = shifted

    if not pending:
        return out

    # One concat instead of one insert per column avoids fragmenting the frame.
    return pd.concat([out, pd.concat(pending, axis=1)], axis=1)

In [2]:
from dotenv import load_dotenv
import os
import pandas as pd
from LoggingSystem.LoggingSystem import LoggingSystem
from FeaturesGetterModule.FeaturesGetter import FeaturesGetter
from get_features_from_API import get_features
from FeaturesGetterModule.helpers._merge_features_by_date import merge_by_date
from FeaturesEngineer.FeaturesEngineer import FeaturesEngineer

# =============================================================================
# Configuration
# =============================================================================
# The API key is read from the environment after loading dev.env; the cell
# fails fast when it is absent or empty.
load_dotenv("dev.env")
api_key = os.environ.get("COINGLASS_API_KEY")

if api_key is None or api_key == "":
    raise ValueError("COINGLASS_API_KEY not found in dev.env")

# Horizon used for feature engineering and the target, plus the lag sizes
# applied to external-market columns further down.
N_DAYS = 3
TARGET_COLUMN_NAME = f"y_up_{N_DAYS}d"
EXTERNAL_LAGS = (1, 3, 5, 7, 10, 15)

# Shared helper objects used by the pipeline cells.
getter = FeaturesGetter(api_key=api_key)
features_engineer = FeaturesEngineer()

In [3]:
# =============================================================================
# 1. Collect data from the API
# =============================================================================
print("=" * 60)
print("1. Gathering features from API...")
# `dfs` is whatever get_features returns — presumably a collection of
# per-endpoint frames keyed by date; verify against get_features_from_API.
dfs = get_features(getter, api_key)
df_all = merge_by_date(dfs, how="outer", dedupe="last")
# Chronological order matters: ffill, shift-based lags and the TA indicators
# below all operate row-by-row.
df_all = df_all.sort_values('date').reset_index(drop=True)
print(f"   Raw data shape: {df_all.shape}")

# =============================================================================
# 2. Normalisation and initial gap filling (ffill)
# =============================================================================
print("=" * 60)
print("2. Normalizing spot columns & Applying ffill...")
df_all = features_engineer.ensure_spot_prefix(df_all)

# Forward-fill gaps (so weekends/holidays leave no holes before features are generated).
# Leading NaNs (before a series' first observation) are not touched by ffill.
feature_cols = [c for c in df_all.columns if c != "date"]
df_all[feature_cols] = df_all[feature_cols].ffill()
print(f"   Remaining NaN after ffill: {df_all[feature_cols].isna().sum().sum()}")

# =============================================================================
# 3. Feature generation (BEFORE the date cut-off!)
# =============================================================================
print("=" * 60)
print("3. Engineering features & Adding lags...")

# --- 3.1 Engineered features ---
print(f"   Shape before feature engineering: {df_all.shape}")
df_all = features_engineer.add_engineered_features(df_all, horizon=N_DAYS)

# --- 3.2 TA features ---
# One call per asset; the BTC spot frame stores volume under `volume_usd`,
# hence the explicit override.
df_all = add_ta_features_for_asset(df_all, prefix="gold")
df_all = add_ta_features_for_asset(df_all, prefix="sp500")
df_all = add_ta_features_for_asset(df_all, prefix="spot_price_history",
                                    volume_col_override="spot_price_history__volume_usd")

# --- 3.3 Лаги для внешних рынков (только OHLCV + diff/pct, НЕ TA-индикаторы) ---
# TA-индикаторы уже кодируют историю (RSI=14 дней, BB=20 дней и т.д.),
# лагирование их создаёт бесполезное дублирование и взрыв размерности.
_OHLCV_BASES = {'open', 'close', 'high', 'low', 'volume'}

def _is_ohlcv_based(col, prefix):
    """True для gold__open, gold__close__diff1, gold__volume__pct1.
       False для gold__trend_macd, gold__momentum_rsi и т.д."""
    suffix = col[len(prefix) + 2:]       # "open__diff1" из "gold__open__diff1"
    base = suffix.split('__')[0]          # "open" из "open__diff1"
    return base in _OHLCV_BASES

# Pick the not-yet-lagged OHLCV-derived columns of each external market.
gold_cols = [c for c in df_all.columns
             if c.startswith("gold__") and "__lag" not in c and _is_ohlcv_based(c, "gold")]
sp500_cols = [c for c in df_all.columns
              if c.startswith("sp500__") and "__lag" not in c and _is_ohlcv_based(c, "sp500")]
external_market_cols = gold_cols + sp500_cols

if external_market_cols:
    df_all = add_lags(df_all, cols=external_market_cols, lags=EXTERNAL_LAGS)
    print(f"   Added {len(external_market_cols) * len(EXTERNAL_LAGS)} lag features (OHLCV only, no TA lags)")
    # Report per-asset counts: `external_market_cols` is the combined list,
    # so printing its length as "per asset" overstated the figure.
    print(f"   Lagged columns: {len(gold_cols)} gold + {len(sp500_cols)} sp500")

# --- 3.4 Target column ---
df_all = features_engineer.add_y_up_custom(df_all, horizon=N_DAYS, close_col="spot_price_history__close")

# =============================================================================
# 4. Date filtering (keep the last 1500 days)
# =============================================================================
print("=" * 60)
print("4. Filtering last 1500 days...")

df_all['date'] = pd.to_datetime(df_all['date'])
max_date = df_all['date'].max()
cutoff_date = max_date - pd.Timedelta(days=1500)

rows_total = len(df_all)
# NOTE(review): the comparison is inclusive, so the cutoff day itself is kept —
# on gap-free daily data that is 1501 rows, not 1500.
df_all = df_all[df_all['date'] >= cutoff_date]
print(f"   Rows kept: {len(df_all)} (from {rows_total})")

# =============================================================================
# 5. Column and row cleanup
# =============================================================================
print("=" * 60)
print("5. Final cleanup...")

# Rows without a target cannot be used for training or evaluation.
df_all = df_all.dropna(subset=[TARGET_COLUMN_NAME])

# Drop columns that are mostly empty; target columns (y_up_*) are never dropped.
nan_threshold = 0.3
nan_ratio = df_all.isna().mean()
cols_to_drop = [
    c for c in nan_ratio[nan_ratio > nan_threshold].index
    if not c.startswith("y_up_")
]
if cols_to_drop:
    # NOTE(review): the "30%" in this message is hard-coded; keep it in sync
    # with nan_threshold if that value changes.
    print(f"   Dropping {len(cols_to_drop)} columns with >30% NaN")
    df_all = df_all.drop(columns=cols_to_drop)

# Any row that still has a NaN in any remaining column is removed.
rows_before_final = len(df_all)
df_all = df_all.dropna().reset_index(drop=True)
print(f"   Final Dropna: removed {rows_before_final - len(df_all)} rows.")

# =============================================================================
# Final result
# =============================================================================
print("=" * 60)
print(f"FINAL DATASET SHAPE: {df_all.shape}")
print(f"Date range: {df_all['date'].min()} to {df_all['date'].max()}")
print(f"Target distribution: {df_all[TARGET_COLUMN_NAME].value_counts().to_dict()}")

# `df2` is an alias of the same object (no copy); later cells read the dataset through it.
df2 = df_all
df_all.head()

1. Gathering features from API...
   Raw data shape: (5659, 112)
2. Normalizing spot columns & Applying ffill...
   Remaining NaN after ffill: 400195
3. Engineering features & Adding lags...
   Shape before feature engineering: (5659, 112)
  Добавлено 86 TA-фичей для gold
  Добавлено 86 TA-фичей для sp500
  Добавлено 86 TA-фичей для spot_price_history
   Added 180 lag features (OHLCV only, no TA lags)
   Lagged columns: 30 per asset
4. Filtering last 1500 days...
   Rows kept: 1501 (from 5659)
5. Final cleanup...
   Dropping 127 columns with >30% NaN
   Final Dropna: removed 431 rows.
FINAL DATASET SHAPE: (1070, 650)
Date range: 2022-11-26 00:00:00 to 2026-02-12 00:00:00
Target distribution: {np.int64(1): 569, np.int64(0): 501}


Unnamed: 0,futures_open_interest_history__open,futures_open_interest_history__high,futures_open_interest_history__low,futures_open_interest_history__close,date,futures_open_interest_aggregated_history__open,futures_open_interest_aggregated_history__high,futures_open_interest_aggregated_history__low,futures_open_interest_aggregated_history__close,futures_funding_rate_history__open,...,sp500__volume__diff1__lag7,sp500__volume__diff1__lag10,sp500__volume__diff1__lag15,sp500__volume__pct1__lag1,sp500__volume__pct1__lag3,sp500__volume__pct1__lag5,sp500__volume__pct1__lag7,sp500__volume__pct1__lag10,sp500__volume__pct1__lag15,y_up_3d
0,2071097000.0,2098987000.0,2060349000.0,2074905000.0,2022-11-26,9841010000.0,9985549000.0,9841010000.0,9904054000.0,0.00381,...,0.0,-849990000.0,-187950000.0,-0.479693,-0.156448,-0.046236,0.0,-0.169479,-0.03251,0
1,2074905000.0,2118137000.0,2062621000.0,2081465000.0,2022-11-27,9904054000.0,10027930000.0,9868134000.0,9946130000.0,0.001088,...,0.0,-113540000.0,0.0,0.0,0.0,0.009687,0.0,-0.027258,0.0,1
2,2081465000.0,2090258000.0,1970674000.0,2025418000.0,2022-11-28,9946130000.0,10005090000.0,9699231000.0,9894938000.0,0.002267,...,-186670000.0,-14420000.0,0.0,0.0,-0.479693,-0.156448,-0.046236,-0.003559,0.0,1
3,2025418000.0,2131462000.0,2001265000.0,2095082000.0,2022-11-29,9894938000.0,10299430000.0,8754260000.0,9991988000.0,0.00476,...,37300000.0,0.0,-1031380000.0,1.118673,0.0,0.0,0.009687,0.0,-0.184395,1
4,2095082000.0,2189434000.0,2082092000.0,2179708000.0,2022-11-30,9991988000.0,10352480000.0,8399971000.0,10259550000.0,0.003404,...,-608270000.0,0.0,453380000.0,-0.019193,0.0,-0.479693,-0.156448,0.0,0.099383,0


In [4]:
# Sanity check of the external-market features that survived cleanup.
gold_cols = list(filter(lambda name: name.startswith("gold__"), df2.columns))
sp500_cols = list(filter(lambda name: name.startswith("sp500__"), df2.columns))

print(f"Gold фичей: {len(gold_cols)}")
print(f"S&P500 фичей: {len(sp500_cols)}")

# Show a sample of the names only when there is something to show.
if len(gold_cols) > 0:
    print(f"\nПримеры gold фичей: {gold_cols[:100]}")
if len(sp500_cols) > 0:
    print(f"\nПримеры sp500 фичей: {sp500_cols[:25]}")

Gold фичей: 184
S&P500 фичей: 184

Примеры gold фичей: ['gold__open', 'gold__close', 'gold__high', 'gold__low', 'gold__volume', 'gold__open__diff1', 'gold__open__pct1', 'gold__close__diff1', 'gold__close__pct1', 'gold__high__diff1', 'gold__high__pct1', 'gold__low__diff1', 'gold__low__pct1', 'gold__volume__diff1', 'gold__volume__pct1', 'gold__volume_adi', 'gold__volume_obv', 'gold__volume_cmf', 'gold__volume_fi', 'gold__volume_em', 'gold__volume_sma_em', 'gold__volume_vpt', 'gold__volume_vwap', 'gold__volume_mfi', 'gold__volume_nvi', 'gold__volatility_bbm', 'gold__volatility_bbh', 'gold__volatility_bbl', 'gold__volatility_bbw', 'gold__volatility_bbp', 'gold__volatility_bbhi', 'gold__volatility_bbli', 'gold__volatility_kcc', 'gold__volatility_kch', 'gold__volatility_kcl', 'gold__volatility_kcw', 'gold__volatility_kcp', 'gold__volatility_kchi', 'gold__volatility_kcli', 'gold__volatility_dcl', 'gold__volatility_dch', 'gold__volatility_dcm', 'gold__volatility_dcw', 'gold__volatility_dcp', '

In [5]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: C:\Users\flays\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [6]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests
import numpy as np
import pandas as pd

# =============================================================================
# Walk-forward CV configuration
# =============================================================================
N_SPLITS = 5
MAX_FEATURES = 20          # max. features per fold
CORR_THRESHOLD = 0.75      # threshold for removing correlated features

# =============================================================================
# Вспомогательные функции
# =============================================================================

def smart_corr_removal(X: pd.DataFrame, y: pd.Series, threshold: float = 0.75) -> list:
    """Drop mutually correlated features, keeping the one more related to the target.

    Feature pairs whose absolute ``DataFrame.corr()`` value exceeds
    ``threshold`` are resolved by discarding the member with the lower
    |Spearman| correlation with ``y``. A feature whose Spearman correlation
    is undefined (NaN, e.g. constant or NaN-containing input) is treated as
    having zero strength, so it never displaces a feature with a valid one.

    Parameters:
        X: feature matrix.
        y: target, positionally aligned with the rows of ``X``.
        threshold: absolute feature-feature correlation above which a pair
                   is considered redundant.

    Returns:
        List of column names to KEEP, in the original column order.
    """
    corr_matrix = X.corr().abs()
    # Strict upper triangle: each pair is looked at once, and for a given
    # column only the columns that precede it are listed.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # abs(Spearman) with the target for every feature; NaN -> 0.0 because any
    # comparison against NaN is False and would make the keep/drop choice
    # depend on column order rather than on signal.
    target_corr = {}
    for col in X.columns:
        rho, _ = spearmanr(X[col], y)
        strength = abs(rho)
        target_corr[col] = 0.0 if np.isnan(strength) else strength

    to_drop = set()
    for col in upper.columns:
        if col in to_drop:
            continue
        correlated_with = upper.index[upper[col] > threshold].tolist()
        for corr_col in correlated_with:
            if corr_col in to_drop:
                continue
            # Discard the feature with the WEAKER target correlation
            # (ties keep `col`).
            if target_corr.get(col, 0) >= target_corr.get(corr_col, 0):
                to_drop.add(corr_col)
            else:
                to_drop.add(col)
                break  # col is already marked for removal; move on

    keep = [c for c in X.columns if c not in to_drop]
    return keep


def select_features_spearman_fdr(X: pd.DataFrame, y: pd.Series, max_features: int = 20):
    """Select the top-K features by |Spearman| with Benjamini-Hochberg FDR correction.

    Features whose Spearman test is undefined (NaN p-value, e.g. a column that
    is constant in this sample) are excluded from the multiple-testing
    correction and can never be selected; they still appear in ``stats``.

    Parameters:
        X: feature matrix.
        y: target, positionally aligned with the rows of ``X``.
        max_features: upper bound on the number of returned features.

    Returns:
        selected: list of chosen feature names, strongest |Spearman| first.
        method: human-readable description of how the selection was made
                (FDR path or raw-p fallback).
        stats: DataFrame with per-feature statistics for all features
               (feature, abs_corr, p_raw, p_fdr, significant_fdr),
               sorted by abs_corr descending.
    """
    feature_names = list(X.columns)
    correlations = []
    p_values = []

    for col in feature_names:
        corr, p_val = spearmanr(X[col], y)
        correlations.append(abs(corr))
        p_values.append(p_val)

    p_raw = np.asarray(p_values, dtype=float)
    testable = ~np.isnan(p_raw)

    # FDR correction over the testable features only: NaN p-values must not
    # reach multipletests, and an empty input must not either.
    reject = np.zeros(len(p_raw), dtype=bool)
    p_corrected = np.full(len(p_raw), np.nan)
    if testable.any():
        rej, p_adj, _, _ = multipletests(p_raw[testable], alpha=0.05, method='fdr_bh')
        reject[testable] = rej
        p_corrected[testable] = p_adj

    stats = pd.DataFrame({
        'feature': feature_names,
        'abs_corr': np.asarray(correlations, dtype=float),
        'p_raw': p_raw,
        'p_fdr': p_corrected,
        'significant_fdr': reject
    }).sort_values('abs_corr', ascending=False)

    # Take the FDR-significant features, capped at max_features.
    significant = stats[stats['significant_fdr']]

    if len(significant) >= 3:
        selected = significant.head(max_features)['feature'].tolist()
        method = f"FDR (q<0.05): {len(significant)} significant, took top-{len(selected)}"
    else:
        # Fallback: top features with raw p < 0.01 (stricter, since uncorrected).
        fallback = stats[stats['p_raw'] < 0.01].head(max_features)
        selected = fallback['feature'].tolist()
        method = f"Fallback (raw p<0.01): {len(selected)} features"

    return selected, method, stats


# Confirm that the helpers exist and echo the CV configuration in effect.
print("Helper functions defined: smart_corr_removal(), select_features_spearman_fdr()")
print(f"Config: N_SPLITS={N_SPLITS}, MAX_FEATURES={MAX_FEATURES}, CORR_THRESHOLD={CORR_THRESHOLD}")

Helper functions defined: smart_corr_removal(), select_features_spearman_fdr()
Config: N_SPLITS=5, MAX_FEATURES=20, CORR_THRESHOLD=0.75


In [7]:
# =============================================================================
# Walk-forward CV with NESTED feature selection (inside every fold)
# =============================================================================
# Key difference from the previous version:
# - Correlation pruning and Spearman selection are done INSIDE each fold
# - This removes selection leakage — the main cause of overfitting
# =============================================================================

# The gap between train and test equals the target horizon (N_DAYS rows).
tscv = TimeSeriesSplit(n_splits=N_SPLITS, gap=N_DAYS)

# Features = everything except the target and the date column.
X_all = df2.drop([TARGET_COLUMN_NAME, 'date'], axis=1)
y_all = df2[TARGET_COLUMN_NAME]

# Accumulators filled by the fold loop: per-fold metrics/artifacts and the
# set of features selected in each fold.
fold_results = []
all_feature_sets = []

print(f"Walk-Forward CV: {N_SPLITS} splits, gap={N_DAYS}")
print(f"Total samples: {len(X_all)}, Features: {X_all.shape[1]}")
print("=" * 70)

# One iteration per walk-forward split: select features on the train part only,
# fit scaler + RandomForest on it, then score on the test part.
for fold_idx, (train_idx, test_idx) in enumerate(tscv.split(X_all)):
    print(f"\n{'─'*70}")
    print(f"FOLD {fold_idx + 1}/{N_SPLITS}")
    print(f"{'─'*70}")

    X_fold_train = X_all.iloc[train_idx].copy()
    X_fold_test = X_all.iloc[test_idx].copy()
    y_fold_train = y_all.iloc[train_idx]
    y_fold_test = y_all.iloc[test_idx]

    dates_train = df2['date'].iloc[train_idx]
    dates_test = df2['date'].iloc[test_idx]
    print(f"  Train: {len(train_idx)} rows ({dates_train.min().date()} → {dates_train.max().date()})")
    print(f"  Test:  {len(test_idx)} rows ({dates_test.min().date()} → {dates_test.max().date()})")

    # --- Step 1: smart correlation removal (computed on the train fold) ---
    keep_cols = smart_corr_removal(X_fold_train, y_fold_train, threshold=CORR_THRESHOLD)
    X_fold_train = X_fold_train[keep_cols]
    X_fold_test = X_fold_test[keep_cols]
    print(f"  [1] Corr removal: {X_all.shape[1]} → {len(keep_cols)} features")

    # --- Step 2: feature selection with FDR (computed on the train fold) ---
    selected_features, method, feat_stats = select_features_spearman_fdr(
        X_fold_train, y_fold_train, max_features=MAX_FEATURES
    )
    print(f"  [2] Feature selection: {method}")
    # NOTE(review): the set is recorded before the emptiness check below, so a
    # skipped fold contributes an empty set to all_feature_sets.
    all_feature_sets.append(set(selected_features))

    if len(selected_features) == 0:
        print("  SKIP: нет отобранных фичей!")
        continue

    # --- Step 3: build the final fold data ---
    X_tr = X_fold_train[selected_features]
    X_te = X_fold_test[selected_features]

    # --- Step 4: scale (scaler is fitted on train, only applied to test) ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_te_scaled = scaler.transform(X_te)

    # --- Step 5: train the RandomForest ---
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=3,
        min_samples_leaf=20,
        min_samples_split=40,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_tr_scaled, y_fold_train)

    train_acc = model.score(X_tr_scaled, y_fold_train)
    test_acc = model.score(X_te_scaled, y_fold_test)

    y_pred = model.predict(X_te_scaled)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    y_proba = model.predict_proba(X_te_scaled)[:, 1]

    auc = roc_auc_score(y_fold_test, y_proba)
    f1 = f1_score(y_fold_test, y_pred)
    prec = precision_score(y_fold_test, y_pred, zero_division=0)
    rec = recall_score(y_fold_test, y_pred, zero_division=0)
    # Train-minus-test accuracy.
    gap = train_acc - test_acc

    print(f"  [3] Train acc: {train_acc:.4f} | Test acc: {test_acc:.4f} | Gap: {gap:+.4f}")
    print(f"      AUC: {auc:.4f} | F1: {f1:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f}")
    print(f"      Features ({len(selected_features)}): {selected_features[:5]}{'...' if len(selected_features) > 5 else ''}")

    # Keep metrics together with the fitted artifacts so later cells can reuse them.
    fold_results.append({
        'fold': fold_idx + 1,
        'train_size': len(train_idx),
        'test_size': len(test_idx),
        'n_features': len(selected_features),
        'train_acc': train_acc,
        'test_acc': test_acc,
        'gap': gap,
        'auc': auc,
        'f1': f1,
        'precision': prec,
        'recall': rec,
        'features': selected_features,
        'model': model,
        'scaler': scaler,
    })

Walk-Forward CV: 5 splits, gap=3
Total samples: 1070, Features: 648

──────────────────────────────────────────────────────────────────────
FOLD 1/5
──────────────────────────────────────────────────────────────────────
  Train: 177 rows (2022-11-26 → 2023-06-25)
  Test:  178 rows (2023-06-29 → 2023-12-23)
  [1] Corr removal: 648 → 236 features
  [2] Feature selection: FDR (q<0.05): 8 significant, took top-8
  [3] Train acc: 0.7062 | Test acc: 0.4438 | Gap: +0.2624
      AUC: 0.5639 | F1: 0.2080 | Prec: 0.5909 | Rec: 0.1262
      Features (8): ['index_btc_reserve_risk__hodl_bank__pct1', 'futures_open_interest_aggregated_history__close', 'gold__volatility_ui', 'gold__volume_vpt', 'sp500__volatility_dch']...

──────────────────────────────────────────────────────────────────────
FOLD 2/5
──────────────────────────────────────────────────────────────────────
  Train: 355 rows (2022-11-26 → 2023-12-20)
  Test:  178 rows (2023-12-24 → 2024-06-18)
  [1] Corr removal: 648 → 243 features
  [2]

In [8]:
# =============================================================================
# Walk-forward CV results summary
# =============================================================================
print("=" * 70)
print("СВОДКА ПО FOLD'АМ")
print("=" * 70)

# One row per completed fold; only the scalar metric columns are printed.
results_df = pd.DataFrame(fold_results)
display_cols = ['fold', 'train_size', 'test_size', 'n_features', 'train_acc', 'test_acc', 'gap', 'auc', 'f1', 'precision', 'recall']
print(results_df.loc[:, display_cols].to_string(index=False))

print(f"\n{'─'*40}")
print(f"Средний test acc:  {results_df['test_acc'].mean():.4f} +/- {results_df['test_acc'].std():.4f}")
print(f"Средний AUC:       {results_df['auc'].mean():.4f} +/- {results_df['auc'].std():.4f}")
print(f"Средний F1:        {results_df['f1'].mean():.4f} +/- {results_df['f1'].std():.4f}")
print(f"Средний gap:       {results_df['gap'].mean():+.4f}")
print(f"{'─'*40}")

# Feature stability: which features recur across folds
print(f"\n{'='*70}")
print("СТАБИЛЬНОСТЬ ФИЧЕЙ")
print(f"{'='*70}")

if len(all_feature_sets) > 1:
    from collections import Counter

    # Count, for every feature, in how many folds it was selected.
    feat_counter = Counter(feat for fs in all_feature_sets for feat in fs)

    common_features = all_feature_sets[0].intersection(*all_feature_sets[1:])
    all_unique = set().union(*all_feature_sets)

    print(f"Фичей, общих для ВСЕХ fold'ов: {len(common_features)}")
    print(f"Всего уникальных фичей: {len(all_unique)}")

    print(f"\nФичи по частоте появления в fold'ах:")
    for feat, count in feat_counter.most_common():
        # Features selected in every split get a visual marker.
        if count == N_SPLITS:
            marker = " ***"
        else:
            marker = ""
        print(f"  [{count}/{N_SPLITS}] {feat}{marker}")

СВОДКА ПО FOLD'АМ
 fold  train_size  test_size  n_features  train_acc  test_acc      gap      auc       f1  precision   recall
    1         177        178           8   0.706215  0.443820 0.262394 0.563883 0.208000   0.590909 0.126214
    2         355        178           9   0.712676  0.634831 0.077845 0.623100 0.713656   0.609023 0.861702
    3         533        178           4   0.686679  0.533708 0.152971 0.522414 0.617512   0.592920 0.644231
    4         711        178           4   0.659634  0.432584 0.227050 0.424399 0.542986   0.483871 0.618557
    5         889        178          12   0.673791  0.432584 0.241207 0.445825 0.507317   0.400000 0.693333

────────────────────────────────────────
Средний test acc:  0.4955 +/- 0.0887
Средний AUC:       0.5159 +/- 0.0823
Средний F1:        0.5179 +/- 0.1904
Средний gap:       +0.1923
────────────────────────────────────────

СТАБИЛЬНОСТЬ ФИЧЕЙ
Фичей, общих для ВСЕХ fold'ов: 1
Всего уникальных фичей: 23

Фичи по частоте появления 

In [9]:
# =============================================================================
# Final evaluation: holdout test (last completed fold) + permutation importance
# =============================================================================
from sklearn.inspection import permutation_importance

# Take the last recorded fold (the most recent out-of-sample period that
# actually produced a model).
best = fold_results[-1]
best_model = best['model']
best_scaler = best['scaler']
best_features = best['features']

print(f"Финальная модель: Fold {best['fold']}")
print(f"Features ({len(best_features)}):")
for f in best_features:
    print(f"  - {f}")

# Holdout data of the SAME fold the model comes from. Folds without selected
# features are skipped and never recorded, so the last recorded fold is not
# necessarily the last split; index the splits by the stored 1-based fold id.
fold_splits = list(tscv.split(X_all))
last_train_idx, last_test_idx = fold_splits[best['fold'] - 1]
X_holdout = X_all.iloc[last_test_idx][best_features]
y_holdout = y_all.iloc[last_test_idx]
X_holdout_scaled = best_scaler.transform(X_holdout)

# Permutation importance on the holdout
print(f"\nPermutation Importance (n_repeats=30) на holdout ({len(y_holdout)} samples)...")
r = permutation_importance(
    best_model,
    X_holdout_scaled,
    y_holdout,
    n_repeats=30,
    random_state=42,
    n_jobs=-1
)

# snr = mean / std of the importance across repeats; the epsilon avoids
# division by zero when all repeats give the same value.
perm_df = pd.DataFrame({
    'feature': best_features,
    'importance': r.importances_mean,
    'std': r.importances_std,
    'snr': r.importances_mean / (r.importances_std + 1e-10)
}).sort_values('importance', ascending=False)

print(f"\nPermutation Importance (все {len(perm_df)} фичей):")
print(perm_df.to_string(index=False))

# Heuristic significance filter: mean importance above two standard deviations.
significant_perm = perm_df[perm_df['importance'] > 2 * perm_df['std']]
positive_perm = perm_df[perm_df['importance'] > 0]

print(f"\nФич с importance > 0: {len(positive_perm)}")
print(f"Фич со значимым importance (> 2*std): {len(significant_perm)}")

if len(significant_perm) > 0:
    print("\nСтатистически значимые фичи:")
    for _, row in significant_perm.iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f} (snr={row['snr']:.2f})")

Финальная модель: Fold 5
Features (12):
  - gold__volume_sma_em
  - sp500__volatility_bbw
  - gold__volatility_dcp
  - gold__low__diff1__lag15
  - spot_price_history__volatility_kcli
  - gold__trend_aroon_up
  - sp500__trend_kst_diff
  - gold__trend_kst_diff
  - index_btc_lth_supply__lth_supply
  - index_btc_active_addresses__aa_z180
  - gold__trend_stc
  - sp500__close__diff1__lag7

Permutation Importance (n_repeats=30) на holdout (178 samples)...

Permutation Importance (все 12 фичей):
                            feature  importance      std       snr
index_btc_active_addresses__aa_z180    0.003371 0.013183  0.255686
               gold__trend_kst_diff    0.002996 0.012456  0.240554
spot_price_history__volatility_kcli    0.002996 0.008425  0.355643
              sp500__volatility_bbw    0.000936 0.018973  0.049350
                    gold__trend_stc   -0.001498 0.011956 -0.125306
               gold__trend_aroon_up   -0.002809 0.006446 -0.435745
                gold__volume_sma_em   