# Собираем датасет

In [10]:
import pandas as pd
import numpy as np
import ta

import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation and performance warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

def add_ta_features_for_asset(df: pd.DataFrame, prefix: str, volume_col_override: str = None) -> pd.DataFrame:
    """Add technical-analysis indicator columns for one asset.

    The asset's OHLCV columns are copied into a scratch frame, passed to
    ``ta.add_all_ta_features`` (with ``fillna=False``), and every column the
    library adds is attached to a copy of ``df`` as ``{prefix}__{indicator}``.
    The input frame itself is never modified.

    Parameters:
        df: frame holding the ``{prefix}__open/close/high/low/volume`` columns.
        prefix: column prefix of the asset (e.g. "gold", "sp500", "spot_price_history").
        volume_col_override: full name of the volume column when it is not
            ``{prefix}__volume`` (e.g. "spot_price_history__volume_usd" for BTC).

    Returns:
        A copy of ``df`` with the indicator columns added. When a required
        column is missing, a message is printed and the copy is returned
        without any new columns.
    """
    df = df.copy()

    required = ['open', 'close', 'high', 'low', 'volume']
    col_map = {col: f"{prefix}__{col}" for col in required}

    # Allow the volume column name to be overridden.
    if volume_col_override:
        col_map['volume'] = volume_col_override

    missing = [col_map[c] for c in required if col_map[c] not in df.columns]
    if missing:
        print(f"  Пропущены колонки для {prefix}: {missing}")
        return df

    # Scratch frame with plain OHLCV names; .values drops the index so the
    # indicator computation is purely positional.
    temp_df = pd.DataFrame({
        'open': df[col_map['open']].values,
        'high': df[col_map['high']].values,
        'low': df[col_map['low']].values,
        'close': df[col_map['close']].values,
        'volume': df[col_map['volume']].values
    })

    temp_with_ta = ta.add_all_ta_features(
        temp_df,
        open="open", high="high", low="low", close="close", volume="volume",
        fillna=False
    )

    original_cols = {'open', 'high', 'low', 'close', 'volume'}
    ta_cols = [c for c in temp_with_ta.columns if c not in original_cols]

    # Re-attach the indicators by position under df's own index. Building one
    # frame and concatenating once avoids both label-based assignment (which
    # depends on the index being unique) and column-by-column insertion.
    ta_frame = pd.DataFrame(
        {f"{prefix}__{col}": temp_with_ta[col].values for col in ta_cols},
        index=df.index,
    )

    # Columns that already exist are overwritten in place, keeping their position.
    already_present = [name for name in ta_frame.columns if name in df.columns]
    for name in already_present:
        df[name] = ta_frame[name].values

    new_part = ta_frame.drop(columns=already_present)
    if not new_part.columns.empty:
        df = pd.concat([df, new_part], axis=1)

    print(f"  Добавлено {len(ta_cols)} TA-фичей для {prefix}")

    return df


def add_lags(df: pd.DataFrame, cols: list, lags: tuple) -> pd.DataFrame:
    """Return a copy of ``df`` with lagged versions of the given columns.

    For every column in ``cols`` and every offset in ``lags`` a column named
    ``{col}__lag{lag}`` is produced with ``Series.shift(lag)``, so the first
    ``lag`` rows of it are NaN. The input frame is never modified.

    Parameters:
        df: source frame.
        cols: names of the columns to lag.
        lags: row offsets to shift by.

    Returns:
        A new frame: the original columns followed by the lag columns, in
        ``cols`` x ``lags`` order. A lag column whose name already exists in
        ``df`` is overwritten in place instead of being appended.

    Raises:
        KeyError: if a name in ``cols`` is not a column of ``df``.
    """
    df = df.copy()

    # Collect new columns and attach them with a single concat; inserting
    # them one at a time fragments the frame.
    new_cols = {}
    for col in cols:
        # A column produced earlier in this call can itself be lagged.
        source = new_cols[col] if col in new_cols else df[col]
        for lag in lags:
            name = f"{col}__lag{lag}"
            shifted = source.shift(lag)
            if name in df.columns:
                df[name] = shifted
            else:
                new_cols[name] = shifted

    if new_cols:
        df = pd.concat([df, pd.DataFrame(new_cols, index=df.index)], axis=1)
    return df

In [19]:
from dotenv import load_dotenv
import os
import pandas as pd
from LoggingSystem.LoggingSystem import LoggingSystem
from FeaturesGetterModule.FeaturesGetter import FeaturesGetter
from get_features_from_API import get_features
from FeaturesGetterModule.helpers._merge_features_by_date import merge_by_date
from FeaturesEngineer.FeaturesEngineer import FeaturesEngineer

# =============================================================================
# Configuration
# =============================================================================
# Load environment variables from the local dev.env file.
load_dotenv("dev.env")
api_key = os.getenv("COINGLASS_API_KEY")

# Fail fast when the key is absent or empty.
if not api_key:
    raise ValueError("COINGLASS_API_KEY not found in dev.env")

N_DAYS = 3  # horizon passed to the feature-engineering and target helpers
TARGET_COLUMN_NAME = f"y_up_{N_DAYS}d"  # evaluates to "y_up_3d"
EXTERNAL_LAGS = (1, 3, 5, 7, 10, 15)  # row offsets used for the gold/sp500 lag columns

# Initialisation
getter = FeaturesGetter(api_key=api_key)
features_engineer = FeaturesEngineer()

In [20]:
# =============================================================================
# 1. Collect data from the API
# =============================================================================
print("=" * 60)
print("1. Gathering features from API...")
dfs = get_features(getter, api_key)
# Merge all sources on date, then put the rows in chronological order
# under a fresh 0..n-1 index.
df_all = (
    merge_by_date(dfs, how="outer", dedupe="last")
    .sort_values('date')
    .reset_index(drop=True)
)
print(f"   Raw data shape: {df_all.shape}")

# =============================================================================
# 2. Normalisation and initial forward fill
# =============================================================================
print("=" * 60)
print("2. Normalizing spot columns & Applying ffill...")
df_all = features_engineer.ensure_spot_prefix(df_all)

# Forward-fill gaps so weekends/holidays leave no holes before feature generation.
# Every column except "date" is treated as a feature here.
feature_cols = [name for name in df_all.columns if name != "date"]
df_all[feature_cols] = df_all[feature_cols].ffill()
remaining_nan = df_all[feature_cols].isna().sum().sum()
print(f"   Remaining NaN after ffill: {remaining_nan}")

# =============================================================================
# 3. Feature generation (BEFORE the date cut-off!)
# =============================================================================
print("=" * 60)
print("3. Engineering features & Adding lags...")

# --- 3.1 Engineered features ---
print(f"   Shape before feature engineering: {df_all.shape}")
df_all = features_engineer.add_engineered_features(df_all, horizon=N_DAYS)

# --- 3.2 TA features ---
# (prefix, volume column override) for each asset, processed in this order.
ta_assets = (
    ("gold", None),
    ("sp500", None),
    ("spot_price_history", "spot_price_history__volume_usd"),
)
for asset_prefix, asset_volume_col in ta_assets:
    df_all = add_ta_features_for_asset(
        df_all,
        prefix=asset_prefix,
        volume_col_override=asset_volume_col,
    )

# --- 3.3 Lags for the external markets ---
# Only base columns are lagged; anything that is already a lag is skipped.
gold_cols = [name for name in df_all.columns if name.startswith("gold__") and "__lag" not in name]
sp500_cols = [name for name in df_all.columns if name.startswith("sp500__") and "__lag" not in name]
external_market_cols = [*gold_cols, *sp500_cols]

if external_market_cols:
    df_all = add_lags(df_all, cols=external_market_cols, lags=EXTERNAL_LAGS)
    n_lag_features = len(external_market_cols) * len(EXTERNAL_LAGS)
    print(f"   Added {n_lag_features} lag features")

# --- 3.4 Target column ---
# The target is added while the full dataset is still available.
df_all = features_engineer.add_y_up_custom(
    df_all,
    horizon=N_DAYS,
    close_col="spot_price_history__close",
)

# =============================================================================
# 4. Date filter: keep the most recent LOOKBACK_DAYS calendar days
#    (raised from 1250 to 1500 to compensate for rows later lost to dropna)
# =============================================================================
# Single source of truth for the window length, used by the log line and the cut-off.
LOOKBACK_DAYS = 1500

print("=" * 60)
print(f"4. Filtering last {LOOKBACK_DAYS} days...")

df_all['date'] = pd.to_datetime(df_all['date'])
max_date = df_all['date'].max()
cutoff_date = max_date - pd.Timedelta(days=LOOKBACK_DAYS)

rows_total = len(df_all)
# The comparison is inclusive, so the cut-off day itself is kept.
df_all = df_all[df_all['date'] >= cutoff_date]
print(f"   Rows kept: {len(df_all)} (from {rows_total})")

# =============================================================================
# 5. Column and row cleanup
# =============================================================================
print("=" * 60)
print("5. Final cleanup...")

# Drop rows that have no target (per the original note: the trailing days
# whose future outcome is not known yet).
df_all = df_all.dropna(subset=[TARGET_COLUMN_NAME])

# Drop columns whose NaN share exceeds nan_threshold; target columns are always kept.
nan_threshold = 0.3
nan_ratio = df_all.isna().mean()
cols_to_drop = [
    c for c in nan_ratio[nan_ratio > nan_threshold].index
    if not c.startswith("y_up_")
]
if cols_to_drop:
    # The percentage is formatted from nan_threshold so the message cannot
    # drift away from the value actually applied.
    print(f"   Dropping {len(cols_to_drop)} columns with >{nan_threshold:.0%} NaN")
    df_all = df_all.drop(columns=cols_to_drop)

# Final removal of rows that still contain NaN after the sparse columns are gone.
rows_before_final = len(df_all)
df_all = df_all.dropna().reset_index(drop=True)
print(f"   Final Dropna: removed {rows_before_final - len(df_all)} rows.")

# =============================================================================
# Final result
# =============================================================================
print("=" * 60)
print(f"FINAL DATASET SHAPE: {df_all.shape}")
print(f"Date range: {df_all['date'].min()} to {df_all['date'].max()}")
print(f"Target distribution: {df_all[TARGET_COLUMN_NAME].value_counts().to_dict()}")

# Expose the result as df2 for compatibility with the training code.
# df2 is another name for the same object as df_all, not a copy.
df2 = df_all
df_all.head()

1. Gathering features from API...
   Raw data shape: (5659, 112)
2. Normalizing spot columns & Applying ffill...
   Remaining NaN after ffill: 400195
3. Engineering features & Adding lags...
   Shape before feature engineering: (5659, 112)
  Добавлено 86 TA-фичей для gold
  Добавлено 86 TA-фичей для sp500
  Добавлено 86 TA-фичей для spot_price_history
   Added 1212 lag features
4. Filtering last 1500 days...
   Rows kept: 1501 (from 5659)
5. Final cleanup...
   Dropping 211 columns with >30% NaN
   Final Dropna: removed 495 rows.
FINAL DATASET SHAPE: (1006, 1598)
Date range: 2022-11-26 00:00:00 to 2026-02-12 00:00:00
Target distribution: {np.int64(1): 535, np.int64(0): 471}


Unnamed: 0,futures_open_interest_history__open,futures_open_interest_history__high,futures_open_interest_history__low,futures_open_interest_history__close,date,futures_open_interest_aggregated_history__open,futures_open_interest_aggregated_history__high,futures_open_interest_aggregated_history__low,futures_open_interest_aggregated_history__close,futures_funding_rate_history__open,...,sp500__others_dr__lag7,sp500__others_dr__lag10,sp500__others_dr__lag15,sp500__others_dlr__lag1,sp500__others_dlr__lag3,sp500__others_dlr__lag5,sp500__others_dlr__lag7,sp500__others_dlr__lag10,sp500__others_dlr__lag15,y_up_3d
0,2071097000.0,2098987000.0,2060349000.0,2074905000.0,2022-11-26,9841010000.0,9985549000.0,9841010000.0,9904054000.0,0.00381,...,0.0,-0.825205,0.924075,-0.028308,0.589727,-0.389125,0.0,-0.828628,0.919831,0
1,2074905000.0,2118137000.0,2062621000.0,2081465000.0,2022-11-27,9904054000.0,10027930000.0,9868134000.0,9946130000.0,0.001088,...,0.0,-0.308932,0.0,0.0,0.0,1.348861,0.0,-0.30941,0.0,1
2,2081465000.0,2090258000.0,1970674000.0,2025418000.0,2022-11-28,9946130000.0,10005090000.0,9699231000.0,9894938000.0,0.002267,...,-0.388369,0.475858,0.0,0.0,-0.028308,0.589727,-0.389125,0.47473,0.0,1
3,2025418000.0,2131462000.0,2001265000.0,2095082000.0,2022-11-29,9894938000.0,10299430000.0,8754260000.0,9991988000.0,0.00476,...,1.357999,0.0,-0.893578,-1.55647,0.0,0.0,1.348861,0.0,-0.897594,1
4,2095082000.0,2189434000.0,2082092000.0,2179708000.0,2022-11-30,9991988000.0,10352480000.0,8399971000.0,10259550000.0,0.003404,...,0.591469,0.0,0.871312,-0.159313,0.0,-0.028308,0.589727,0.0,0.867538,0


In [21]:
# Sanity check of the external-market features: count the columns per market
# and print a sample of their names.
columns_by_market = {
    market_prefix: [name for name in df2.columns if name.startswith(market_prefix)]
    for market_prefix in ("gold__", "sp500__")
}
gold_cols = columns_by_market["gold__"]
sp500_cols = columns_by_market["sp500__"]

print(f"Gold фичей: {len(gold_cols)}")
print(f"S&P500 фичей: {len(sp500_cols)}")

if len(gold_cols) > 0:
    print(f"\nПримеры gold фичей: {gold_cols[:100]}")
if len(sp500_cols) > 0:
    print(f"\nПримеры sp500 фичей: {sp500_cols[:25]}")

Gold фичей: 658
S&P500 фичей: 658

Примеры gold фичей: ['gold__open', 'gold__close', 'gold__high', 'gold__low', 'gold__volume', 'gold__open__diff1', 'gold__open__pct1', 'gold__close__diff1', 'gold__close__pct1', 'gold__high__diff1', 'gold__high__pct1', 'gold__low__diff1', 'gold__low__pct1', 'gold__volume__diff1', 'gold__volume__pct1', 'gold__volume_adi', 'gold__volume_obv', 'gold__volume_cmf', 'gold__volume_fi', 'gold__volume_em', 'gold__volume_sma_em', 'gold__volume_vpt', 'gold__volume_vwap', 'gold__volume_mfi', 'gold__volume_nvi', 'gold__volatility_bbm', 'gold__volatility_bbh', 'gold__volatility_bbl', 'gold__volatility_bbw', 'gold__volatility_bbp', 'gold__volatility_bbhi', 'gold__volatility_bbli', 'gold__volatility_kcc', 'gold__volatility_kch', 'gold__volatility_kcl', 'gold__volatility_kcw', 'gold__volatility_kcp', 'gold__volatility_kchi', 'gold__volatility_kcli', 'gold__volatility_dcl', 'gold__volatility_dch', 'gold__volatility_dcm', 'gold__volatility_dcw', 'gold__volatility_dcp', '

In [22]:
from sklearn.preprocessing import StandardScaler

# 1. Chronological train/test split with a purge gap (no shuffling for time series!)
gap = N_DAYS  # purge gap equal to the target horizon: prevents target leakage
              # (the target of the last train rows depends on prices inside the test period)

train_size = int(len(df2) * 0.8)
train = df2.iloc[:train_size - gap]  # drop the last `gap` rows from the end of train
test = df2.iloc[train_size:]          # test starts right after the gap

print(f"Train: {len(train)} rows ({train['date'].min().date()} to {train['date'].max().date()})")
print(f"Gap:   {gap} rows (purge zone — не используется ни в train, ни в test)")
print(f"Test:  {len(test)} rows ({test['date'].min().date()} to {test['date'].max().date()})")

# Features are every column except the target and the date.
X_train = train.drop([TARGET_COLUMN_NAME, 'date'], axis=1)
X_test = test.drop([TARGET_COLUMN_NAME, 'date'], axis=1)

y_train = train[TARGET_COLUMN_NAME]
y_test = test[TARGET_COLUMN_NAME]

# 2. Scaling: the scaler is fitted on train only and then applied to test.
# NOTE(review): the cells visible after this one use the unscaled
# X_train / X_test, not these scaled arrays; confirm they are still needed.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nFeatures: {X_train.shape[1]}")
df2.head(5)

Train: 801 rows (2022-11-26 to 2025-04-05)
Gap:   3 rows (purge zone — не используется ни в train, ни в test)
Test:  202 rows (2025-04-09 to 2026-02-12)

Features: 1596


Unnamed: 0,futures_open_interest_history__open,futures_open_interest_history__high,futures_open_interest_history__low,futures_open_interest_history__close,date,futures_open_interest_aggregated_history__open,futures_open_interest_aggregated_history__high,futures_open_interest_aggregated_history__low,futures_open_interest_aggregated_history__close,futures_funding_rate_history__open,...,sp500__others_dr__lag7,sp500__others_dr__lag10,sp500__others_dr__lag15,sp500__others_dlr__lag1,sp500__others_dlr__lag3,sp500__others_dlr__lag5,sp500__others_dlr__lag7,sp500__others_dlr__lag10,sp500__others_dlr__lag15,y_up_3d
0,2071097000.0,2098987000.0,2060349000.0,2074905000.0,2022-11-26,9841010000.0,9985549000.0,9841010000.0,9904054000.0,0.00381,...,0.0,-0.825205,0.924075,-0.028308,0.589727,-0.389125,0.0,-0.828628,0.919831,0
1,2074905000.0,2118137000.0,2062621000.0,2081465000.0,2022-11-27,9904054000.0,10027930000.0,9868134000.0,9946130000.0,0.001088,...,0.0,-0.308932,0.0,0.0,0.0,1.348861,0.0,-0.30941,0.0,1
2,2081465000.0,2090258000.0,1970674000.0,2025418000.0,2022-11-28,9946130000.0,10005090000.0,9699231000.0,9894938000.0,0.002267,...,-0.388369,0.475858,0.0,0.0,-0.028308,0.589727,-0.389125,0.47473,0.0,1
3,2025418000.0,2131462000.0,2001265000.0,2095082000.0,2022-11-29,9894938000.0,10299430000.0,8754260000.0,9991988000.0,0.00476,...,1.357999,0.0,-0.893578,-1.55647,0.0,0.0,1.348861,0.0,-0.897594,1
4,2095082000.0,2189434000.0,2082092000.0,2179708000.0,2022-11-30,9991988000.0,10352480000.0,8399971000.0,10259550000.0,0.003404,...,0.591469,0.0,0.871312,-0.159313,0.0,-0.028308,0.589727,0.0,0.867538,0


In [23]:
# =============================================================================
# Removal of highly correlated features (de-duplication)
# =============================================================================
# 1. Absolute pairwise correlations, computed on the training features only
corr_matrix = X_train.corr().abs()

# 2. Keep only the strict upper triangle so every pair is looked at once
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# 3. A column is dropped when it correlates above the threshold with any
#    column that precedes it (threshold lowered from 0.9 to 0.75 to remove
#    near-duplicates such as lags and diff/pct variants of one series)
CORR_THRESHOLD = 0.75
exceeds_threshold = (upper > CORR_THRESHOLD).any(axis=0)
to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

print(f"Порог корреляции: {CORR_THRESHOLD}")
print(f"Удаляем {len(to_drop)} фичей из {len(X_train.columns)}")

# 4. Apply the same column removal to both splits
X_train_reduced = X_train.drop(columns=to_drop)
X_test_reduced = X_test.drop(columns=to_drop)

print(f"Осталось фичей: {len(X_train_reduced.columns)}")

Порог корреляции: 0.75
Удаляем 1251 фичей из 1596
Осталось фичей: 345


In [24]:
# =============================================================================
# Анализ корреляции с целевой переменной (на очищенных данных)
# =============================================================================
from scipy.stats import spearmanr
import pandas as pd

MAX_FEATURES = 15  # lowered from 50: strict selection, only the strongest signals

correlations = {}
significant_features = []
suspicious_features = []

print("Считаем корреляции с таргетом (Spearman + p-value)...")

for col in X_train_reduced.columns:
    corr, p_val = spearmanr(X_train_reduced[col], y_train)
    corr_abs = abs(corr)
    correlations[col] = corr_abs

    # Data-leakage check (a correlation that is too good to be true)
    if p_val < 0.05 and corr_abs > 0.95:
        suspicious_features.append((col, corr_abs))

    # Keep only statistically significant features (p < 0.05).
    # A NaN p-value compares False, so such columns are never selected.
    if p_val < 0.05:
        significant_features.append((col, corr_abs))

# Warn about possible leakage
if suspicious_features:
    print(f"\n  ВНИМАНИЕ! Найдено {len(suspicious_features)} подозрительных фичей (corr > 0.95, p < 0.05).")
    print("  Возможно, это утечка данных (заглядывание в будущее):")
    for f, c in suspicious_features:
        print(f"    - {f}: {c:.4f}")

# Sort the significant features by correlation strength
significant_features.sort(key=lambda x: x[1], reverse=True)

# Take the top MAX_FEATURES of the significant ones
top_features_list = [f[0] for f in significant_features[:MAX_FEATURES]]

# Full ranking for reference. NaN correlations are ranked last; comparing NaN
# directly would leave the sort order undefined.
sorted_features = sorted(
    correlations.items(),
    key=lambda x: -1.0 if pd.isna(x[1]) else x[1],
    reverse=True,
)

print(f"\nВсего фичей после удаления дублей: {len(X_train_reduced.columns)}")
print(f"Статистически значимых (p < 0.05): {len(significant_features)}")
print(f"Отобрано top-{len(top_features_list)} для обучения модели")

# The headline and the slice both follow MAX_FEATURES, so changing the limit
# keeps this listing consistent with what is actually selected.
print(f"\nТоп-{MAX_FEATURES} лидеров по корреляции с таргетом (из значимых):")
for f, c in significant_features[:MAX_FEATURES]:
    print(f"  {f}: {c:.4f}")

# =============================================================================
# Build the final datasets for the model
# =============================================================================
X_train_final = X_train_reduced[top_features_list]
X_test_final = X_test_reduced[top_features_list]

print(f"\nФинальный X_train shape: {X_train_final.shape}")
print(f"Финальный X_test shape: {X_test_final.shape}")

# Guard the ratio: with no significant feature the column count is zero.
n_selected = X_train_final.shape[1]
if n_selected:
    print(f"Ratio samples/features: {len(X_train_final) / n_selected:.1f}:1")
else:
    print("Ratio samples/features: n/a (no statistically significant features)")

Считаем корреляции с таргетом (Spearman + p-value)...

Всего фичей после удаления дублей: 345
Статистически значимых (p < 0.05): 45
Отобрано top-15 для обучения модели

Топ-15 лидеров по корреляции с таргетом (из значимых):
  gold__volume_sma_em: 0.1447
  index_btc_lth_supply__lth_supply: 0.1242
  sp500__volatility_kcli__lag3: 0.1146
  gold__volatility_kcw__lag15: 0.1138
  index_btc_active_addresses__aa_z180: 0.1084
  sp500__volatility_kchi__lag1: 0.1069
  index_btc_active_addresses__aa_pct7: 0.1067
  gold__volatility_kchi__lag1: 0.1046
  gold__volatility_kcli__lag5: 0.1039
  gold__volatility_kchi__lag3: 0.1023
  gold__low__diff1__lag15: 0.1017
  sp500__volatility_kcli__lag1: 0.1000
  sp500__close__diff1__lag7: 0.0992
  index_btc_active_addresses__active_address_count: 0.0954
  spot_price_history__volatility_kcli: 0.0938

Финальный X_train shape: (801, 15)
Финальный X_test shape: (202, 15)
Ratio samples/features: 53.4:1


In [25]:
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# =============================================================================
# 1. Train a model used for feature selection (strong regularisation!)
# =============================================================================

print("Обучаем RandomForest для оценки важности (сильная регуляризация)...")

model_selector = RandomForestClassifier(
    n_estimators=200,       # more trees = a more stable estimate
    max_depth=3,            # LOWERED from 5 to 3: shallower trees are harder to overfit
    min_samples_leaf=20,    # ADDED: a leaf must contain at least 20 samples
    min_samples_split=40,   # ADDED: a split requires at least 40 samples
    max_features='sqrt',    # each split considers sqrt(N) candidate features, which decorrelates the trees
    random_state=42, 
    n_jobs=-1
)

model_selector.fit(X_train_final, y_train)

# Compare train and test accuracy to judge how much the model overfits.
train_acc = model_selector.score(X_train_final, y_train)
test_acc = model_selector.score(X_test_final, y_test)
print(f"Train accuracy: {train_acc:.4f}")
print(f"Test accuracy:  {test_acc:.4f}")
gap_acc = train_acc - test_acc
if gap_acc > 0.10:
    print(f"  Разрыв train-test = {gap_acc:.4f} — переобучение!")
elif gap_acc > 0.05:
    print(f"  Разрыв train-test = {gap_acc:.4f} — умеренный, ок для feature selection")
else:
    print(f"  Разрыв train-test = {gap_acc:.4f} — хорошая генерализация")

# =============================================================================
# 2. Compute permutation importance
# =============================================================================
print("\nСчитаем Permutation Importance (n_repeats=30)...")

# NOTE(review): importances are measured on the test split. If they are later
# used to choose features, test information leaks into the selection;
# confirm against the downstream cells.
r = permutation_importance(
    model_selector,
    X_test_final,
    y_test, 
    n_repeats=30,
    random_state=42,
    n_jobs=-1
)

# =============================================================================
# 3. Collect and inspect the results
# =============================================================================

# 'snr' is mean / std; the 1e-10 term avoids division by zero.
perm_importance = pd.DataFrame({
    'feature': X_train_final.columns,
    'importance': r.importances_mean,
    'std': r.importances_std,
    'snr': r.importances_mean / (r.importances_std + 1e-10)
}).sort_values('importance', ascending=False)

print(f"\nВсе {len(perm_importance)} фичей по Permutation Importance:")
print(perm_importance.to_string())

# =============================================================================
# 4. Significance filter (importance > 2*std)
# =============================================================================
significant_mask = perm_importance['importance'] > 2 * perm_importance['std']
significant_perm = perm_importance[significant_mask].copy()

# Features whose mean importance is strictly positive.
positive_mask = perm_importance['importance'] > 0
positive_perm = perm_importance[positive_mask].copy()

print(f"\nФич с importance > 0: {len(positive_perm)}")
print(f"Фич со значимым importance (> 2*std): {len(significant_perm)}")

if len(significant_perm) > 0:
    print("\nСтатистически значимые фичи:")
    print(significant_perm.to_string())

Обучаем RandomForest для оценки важности (сильная регуляризация)...
Train accuracy: 0.6841
Test accuracy:  0.4851
  Разрыв train-test = 0.1990 — переобучение!

Считаем Permutation Importance (n_repeats=30)...

Все 15 фичей по Permutation Importance:
                                             feature  importance       std       snr
13  index_btc_active_addresses__active_address_count    0.007591  0.011626  0.652893
4                index_btc_active_addresses__aa_z180    0.004455  0.018406  0.242067
9                        gold__volatility_kchi__lag3    0.004125  0.006528  0.631950
14               spot_price_history__volatility_kcli    0.003795  0.009278  0.409087
7                        gold__volatility_kchi__lag1    0.000660  0.008453  0.078087
0                                gold__volume_sma_em   -0.000825  0.025884 -0.031876
5                       sp500__volatility_kchi__lag1   -0.001650  0.012013 -0.137361
12                         sp500__close__diff1__lag7   -0.001650  0.01

In [26]:
# Remove the column-width limit (None means "no limit")
pd.set_option('display.max_colwidth', None)

# If the table wraps, the overall output width can be widened as well
pd.set_option('display.width', 1000)

# One constant drives both the headline and the number of rows printed,
# so the label always matches what is shown.
TOP_N_TO_SHOW = 20
print(f"Топ-{TOP_N_TO_SHOW} фичей по Permutation Importance:")
print(perm_importance.head(TOP_N_TO_SHOW))

Топ-20 фичей по Permutation Importance:
                                             feature  importance       std       snr
13  index_btc_active_addresses__active_address_count    0.007591  0.011626  0.652893
4                index_btc_active_addresses__aa_z180    0.004455  0.018406  0.242067
9                        gold__volatility_kchi__lag3    0.004125  0.006528  0.631950
14               spot_price_history__volatility_kcli    0.003795  0.009278  0.409087
7                        gold__volatility_kchi__lag1    0.000660  0.008453  0.078087
0                                gold__volume_sma_em   -0.000825  0.025884 -0.031876
5                       sp500__volatility_kchi__lag1   -0.001650  0.012013 -0.137361
12                         sp500__close__diff1__lag7   -0.001650  0.012013 -0.137361
8                        gold__volatility_kcli__lag5   -0.003135  0.006822 -0.459603
2                       sp500__volatility_kcli__lag3   -0.003135  0.008992 -0.348697
11                      s