# 1. Libraries

In [26]:
# Standard libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import xgboost as xgb
import shap
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

# Scikit-learn
from sklearn.preprocessing import StandardScaler

# Personal utilities
#sys.path.append('../')  # if personal_utils is in parent folder
#from personal_utils import quick_notify
#sys.path.remove('../')

# PyTorch for autoencoder
import torch

# Import Autoencoder from src
sys.path.append('../src')  # Correct path to src from notebooks/
from autoencoder import *
sys.path.remove('../src')

  from .autonotebook import tqdm as notebook_tqdm


# 2. Data Loading

In [4]:
filepath = "../data/"

train_df = pd.read_parquet(filepath+"train.parquet")
features = train_df.drop('label', axis = 1)
y = train_df['label']
# test_df = pd.read_parquet(filepath+"test.parquet")

# 3. Collinearity Block Compression
- Idea was from Tony271YnoT's solution, by no means novel on my end
- Used to remove highly correlated and possibly uninformative features

In [17]:
corr_matrix = features.corr(method="spearman")

In [19]:
threshold = 0.6 # Copied from Tony271YnoT's Approach
groups = []
visited = set()

for col in corr_matrix.columns:
    if col in visited:
        continue

    group = set(corr_matrix.columns[corr_matrix[col] > threshold])
    groups.append(group)
    visited |= group

selected_features = []

for group in groups:
    if len(group) == 1:
        selected_features.extend(group)
        continue
    
    sub_corr = corr_matrix.loc[list(group), list(group)]
    mean_corr = sub_corr.mean(axis=1)
    medoid = mean_corr.idxmax()

    selected_features.append(medoid)

reduced_df = train_df[selected_features]

In [23]:
threshold = 1e-4

filtered_features = []

for col in reduced_df.columns:
    corr = reduced_df[col].corr(y, method='spearman')
    if abs(corr) >= threshold:
        filtered_features.append(col)

filtered_df = reduced_df[filtered_features].copy()
print(f"Kept {len(filtered_features)} / {reduced_df.shape[1]} features with |corr| >= {threshold}")

Kept 142 / 144 features with |corr| >= 0.0001


# 4. Feature Selection Function Definition + Initial Run

In [None]:
def cv_shap_feature_selection(X, y, n_splits=6, top_k=20):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    fold_top_features = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\nFold {fold+1}/{n_splits}")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        
        params = {
            "objective": "reg:squarederror",
            "max_depth": 6,
            "learning_rate": 0.05,
            "n_jobs": -1,
            "eval_metric": "mae"
        }
        
        evals = [(dval, "eval")]
        
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=500,
            evals=evals,
            early_stopping_rounds=50,
            verbose_eval=False
        )
        
        explainer = shap.Explainer(bst)
        shap_values = explainer(X_train)
        
        # Mean absolute SHAP per feature
        mean_shap = np.abs(shap_values.values).mean(axis=0)
        feature_importance = pd.Series(mean_shap, index=X.columns)
        top_features = feature_importance.nlargest(top_k).index.tolist()
        fold_top_features.append(top_features)
        
        print(f"Top-{top_k} features for fold {fold+1}:", top_features)
    
    # Features that appear in at least one fold's top-k
    union_features = list(set(f for fold_list in fold_top_features for f in fold_list))
    print(f"\nUnion of top-{top_k} features across folds ({len(union_features)} features):", union_features)
    
    return union_features, fold_top_features

union_features, fold_top_features = cv_shap_feature_selection(filtered_df, y)


Fold 1/6
Top-20 features for fold 1: ['X758', 'X752', 'X415', 'X332', 'X451', 'X294', 'X610', 'X772', 'X759', 'X86', 'X287', 'X21', 'X608', 'X778', 'X331', 'X39', 'X777', 'X168', 'X751', 'X614']

Fold 2/6
Top-20 features for fold 2: ['X758', 'X752', 'X415', 'X332', 'X451', 'X86', 'X610', 'X759', 'X294', 'X21', 'X772', 'X608', 'X331', 'X287', 'X777', 'X39', 'X778', 'X652', 'X614', 'X168']

Fold 3/6
Top-20 features for fold 3: ['X758', 'X752', 'X415', 'X332', 'X610', 'X294', 'X451', 'X86', 'X772', 'X759', 'X21', 'X608', 'X331', 'X287', 'X778', 'X777', 'X39', 'X168', 'X614', 'X751']

Fold 4/6
Top-20 features for fold 4: ['X758', 'X752', 'X415', 'X332', 'X451', 'X610', 'X772', 'X759', 'X86', 'X294', 'X608', 'X21', 'X331', 'X778', 'X287', 'X39', 'X777', 'X168', 'X614', 'X751']

Fold 5/6
Top-20 features for fold 5: ['X758', 'X752', 'X415', 'X332', 'X294', 'X451', 'X86', 'X759', 'X610', 'X772', 'X608', 'X21', 'X331', 'X778', 'X39', 'X777', 'X287', 'X168', 'X507', 'X614']

Fold 6/6
Top-20 fea

(['X332',
  'X287',
  'X610',
  'X39',
  'X752',
  'X86',
  'X778',
  'X614',
  'X168',
  'X758',
  'X772',
  'X759',
  'X652',
  'X777',
  'X415',
  'X294',
  'X21',
  'X608',
  'X751',
  'X331',
  'X451',
  'X507'],
 [['X758',
   'X752',
   'X415',
   'X332',
   'X451',
   'X294',
   'X610',
   'X772',
   'X759',
   'X86',
   'X287',
   'X21',
   'X608',
   'X778',
   'X331',
   'X39',
   'X777',
   'X168',
   'X751',
   'X614'],
  ['X758',
   'X752',
   'X415',
   'X332',
   'X451',
   'X86',
   'X610',
   'X759',
   'X294',
   'X21',
   'X772',
   'X608',
   'X331',
   'X287',
   'X777',
   'X39',
   'X778',
   'X652',
   'X614',
   'X168'],
  ['X758',
   'X752',
   'X415',
   'X332',
   'X610',
   'X294',
   'X451',
   'X86',
   'X772',
   'X759',
   'X21',
   'X608',
   'X331',
   'X287',
   'X778',
   'X777',
   'X39',
   'X168',
   'X614',
   'X751'],
  ['X758',
   'X752',
   'X415',
   'X332',
   'X451',
   'X610',
   'X772',
   'X759',
   'X86',
   'X294',
   'X608',
   'X21'

# 6. Interaction Engineering

In [1]:
union_features = []

# 7. Regime Features

# 8. Model Optimization

# 9. Summary + Next Steps