In [1]:
"""
Feature selection module
Uses Mutual Information and Spearman correlation filtering
"""

import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

from config import *


def load_processed_data():
    """
    Read the processed feature table back from disk and report its size.

    The file location is OUTPUT_DIR / OUTPUT_FILES['processed_data'].

    Returns:
        The object stored in that file (its ``shape`` is printed as
        samples x columns).
    """
    source = OUTPUT_DIR / OUTPUT_FILES['processed_data']
    print(f"Loading processed data from: {source}")

    # NOTE: joblib.load unpickles the file, so only point this at trusted data.
    loaded = joblib.load(source)

    n_samples, n_columns = loaded.shape[0], loaded.shape[1]
    print(f"Loaded {n_samples:,} samples with {n_columns} columns")
    return loaded


def _masked_abs_spearman(values):
    """
    Build an |Spearman rho| matrix in which only pairs (i, j) with i < j are usable.

    Args:
        values: 2-D array with one column per feature

    Returns:
        Square float array. Entries on or below the diagonal, and correlations
        that are undefined (NaN), are set to -inf so they can never be picked
        as the maximum.
    """
    n_features = values.shape[1]
    if n_features < 2:
        # Nothing to compare with fewer than two features.
        return np.full((n_features, n_features), -np.inf)

    statistic, _ = spearmanr(values)
    corr = np.abs(np.asarray(statistic, dtype=float))
    if corr.ndim != 2:
        # With exactly two variables spearmanr returns a single coefficient
        # instead of a matrix; spread it over a square array so it can be
        # indexed as [i, j] (only the i < j entry is ever read).
        corr = np.full((n_features, n_features), float(corr))

    corr[np.isnan(corr)] = -np.inf
    corr[np.tril_indices(n_features)] = -np.inf
    return corr


def select_features(df, events_ahead=None, max_corr=None):
    """
    Select features using MI scores and Spearman correlation filtering.

    A temporary direction label (-1 / 0 / +1) is derived by comparing
    ``mid_price`` with its value ``events_ahead`` rows later. Mutual
    information between every candidate feature and that label is computed,
    then the most correlated feature pair above ``max_corr`` is repeatedly
    broken up by dropping the member with the lower MI score, until no pair
    exceeds the threshold.

    Rows whose future ``mid_price`` is unavailable (the last ``events_ahead``
    rows) cannot be labelled and are excluded, together with any row that
    contains NaN.

    Args:
        df: DataFrame with features and target; must contain ``mid_price``
        events_ahead: Events ahead for MI calculation (uses MI_EVENTS_AHEAD if None)
        max_corr: Max correlation threshold (uses MAX_CORRELATION if None)

    Returns:
        List of selected feature names, in their original column order

    Raises:
        ValueError: if no complete, labelled rows remain
    """
    if events_ahead is None:
        events_ahead = MI_EVENTS_AHEAD
    if max_corr is None:
        max_corr = MAX_CORRELATION

    print("\n" + "="*60)
    print("FEATURE SELECTION")
    print("="*60)
    print(f"Events ahead for MI calculation: {events_ahead}")
    print(f"Max correlation threshold: {max_corr}")

    # Create temporary label for feature selection: sign of the future price
    # move. Where the future price does not exist the label is NaN, so the
    # dropna() below removes the row instead of treating it as "no move".
    # NOTE(review): the shift runs over the whole frame, so labels next to a
    # date boundary compare prices from different dates - confirm whether
    # per-date labelling is wanted.
    temp_df = df.copy()
    future_mid = temp_df["mid_price"].shift(-events_ahead)
    temp_df["label"] = np.sign(future_mid - temp_df["mid_price"])

    feature_names = [
        col for col in temp_df.columns
        if col not in ('target', 'date', 'label')
    ]
    temp_df = temp_df.dropna()
    if temp_df.empty:
        raise ValueError(
            "No complete rows left for feature selection after dropping NaNs"
        )

    # ========================================================================
    # COMPUTE MUTUAL INFORMATION SCORES
    # ========================================================================
    print("\nComputing Mutual Information scores...")
    X = temp_df[feature_names].to_numpy()
    y = temp_df['label'].to_numpy().astype(int)

    mi_scores = mutual_info_classif(X, y, random_state=RANDOM_STATE, n_neighbors=3)
    mi_dict = dict(zip(feature_names, mi_scores))

    print(f"Initial features: {len(feature_names)}")

    # ========================================================================
    # ITERATIVE CORRELATION-BASED REMOVAL
    # ========================================================================
    print("\nIterative correlation-based removal:")

    # A pairwise rank correlation does not depend on which other columns are
    # present, so the matrix is computed once and re-indexed after each removal.
    pair_corr = _masked_abs_spearman(X)
    active = list(range(len(feature_names)))  # column indices still selected

    iteration = 0
    while len(active) >= 2:
        sub = pair_corr[np.ix_(active, active)]
        # argmax returns the first maximum in row-major order, i.e. the same
        # pair a nested "for i / for j > i" scan with a strict '>' would pick.
        row, col = divmod(int(np.argmax(sub)), len(active))
        max_correlation = sub[row, col]

        # Stop if no high correlations found
        if not max_correlation > max_corr:
            break

        iteration += 1
        feat1 = feature_names[active[row]]
        feat2 = feature_names[active[col]]

        # Remove feature with lower MI score (ties drop the later feature)
        if mi_dict[feat1] < mi_dict[feat2]:
            remove_feat, keep_feat, drop_pos = feat1, feat2, row
        else:
            remove_feat, keep_feat, drop_pos = feat2, feat1, col

        print(f"  Iter {iteration}: Removing '{remove_feat}' "
              f"(corr={max_correlation:.3f} with '{keep_feat}', "
              f"MI={mi_dict[remove_feat]:.4f})")

        del active[drop_pos]

    selected_features = [feature_names[idx] for idx in active]

    print(f"\nFinal features: {len(selected_features)}")
    print(f"Removed: {len(feature_names) - len(selected_features)} features")

    # ========================================================================
    # DISPLAY SELECTED FEATURES WITH MI SCORES
    # ========================================================================
    print("\n" + "="*60)
    print("SELECTED FEATURES WITH MI SCORES (sorted by MI)")
    print("="*60)

    selected_mi = sorted(
        ((feat, mi_dict[feat]) for feat in selected_features),
        key=lambda item: item[1],
        reverse=True,
    )
    for feat, mi_score in selected_mi:
        print(f"{feat:30s} {mi_score:.6f}")

    return selected_features


def normalize_and_split(features, selected_features, 
                        train_dates=None, val_dates=None, test_dates=None):
    """
    Partition the data by date and standardize the selected feature columns.

    The StandardScaler is fitted on the training partition only and then
    applied to the training, validation and test partitions. A per-partition
    summary of the ``target`` values 0, 1 and 2 is printed.

    Args:
        features: DataFrame with all features, a ``date`` column and a ``target`` column
        selected_features: List of feature names to use
        train_dates: List of training dates (uses TRAIN_DATES if None)
        val_dates: List of validation dates (uses VAL_DATES if None)
        test_dates: List of test dates (uses TEST_DATES if None)

    Returns:
        Tuple of (X_train, y_train, X_val, y_val, X_test, y_test, scaler)
    """
    train_dates = TRAIN_DATES if train_dates is None else train_dates
    val_dates = VAL_DATES if val_dates is None else val_dates
    test_dates = TEST_DATES if test_dates is None else test_dates

    rule = "=" * 60
    print("\n" + rule)
    print("DATA SPLITTING AND NORMALIZATION")
    print(rule)

    # Row membership is decided purely by the value in the 'date' column.
    train_df = features[features['date'].isin(train_dates)].copy()
    val_df = features[features['date'].isin(val_dates)].copy()
    test_df = features[features['date'].isin(test_dates)].copy()

    print(f"Train: {len(train_df):,} events (days {', '.join(train_dates)})")
    print(f"Val:   {len(val_df):,} events (days {', '.join(val_dates)})")
    print(f"Test:  {len(test_df):,} events (days {', '.join(test_dates)})")

    # Drop everything except the chosen features and the target.
    wanted_columns = selected_features + ['target']
    train_df, val_df, test_df = (
        part[wanted_columns] for part in (train_df, val_df, test_df)
    )

    # Fit on the training rows only, then apply the same scaling everywhere.
    print("\nNormalizing features using StandardScaler...")
    scaler = StandardScaler()
    scaler.fit(train_df[selected_features])
    for part in (train_df, val_df, test_df):
        part[selected_features] = scaler.transform(part[selected_features])

    # Report how often each target value occurs in every partition.
    print("\nLabel Distribution:")
    class_rows = (
        ("  Down (0):    ", 0),
        ("  Neutral (1): ", 1),
        ("  Up (2):      ", 2),
    )
    for split_name, part in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
        tally = part['target'].value_counts().sort_index()
        n_rows = len(part)
        print(f"\n{split_name} set ({n_rows:,} samples):")
        for heading, class_id in class_rows:
            hits = tally.get(class_id, 0)
            print(f"{heading}{hits:6,} "
                  f"({hits/n_rows*100:5.2f}%)")

    return (
        train_df[selected_features], train_df['target'],
        val_df[selected_features], val_df['target'],
        test_df[selected_features], test_df['target'],
        scaler,
    )


def save_selection_outputs(selected_features, scaler):
    """
    Persist the chosen feature names and the scaler with joblib.

    Both files are written under OUTPUT_DIR, using the file names stored in
    OUTPUT_FILES under the keys 'selected_features' and 'scaler'.

    Args:
        selected_features: Object to dump as the selected-feature list
        scaler: Scaler object to dump
    """
    # (OUTPUT_FILES key, object to dump, message prefix) - written in this order.
    artifacts = (
        ('selected_features', selected_features, "\n✓ Saved selected features to: "),
        ('scaler', scaler, "✓ Saved scaler to: "),
    )
    for file_key, payload, message in artifacts:
        destination = OUTPUT_DIR / OUTPUT_FILES[file_key]
        joblib.dump(payload, destination)
        print(f"{message}{destination}")


def main():
    """
    Run the feature selection pipeline end to end.

    Steps: load the processed data, choose features from the rows whose date
    is in TRAIN_DATES, split and normalize all rows, then write the selected
    features, the scaler and the split data under OUTPUT_DIR.
    """
    rule = "=" * 60
    print(rule)
    print("FEATURE SELECTION PIPELINE")
    print(rule)

    features = load_processed_data()

    # Feature selection only ever sees the training dates.
    in_training = features['date'].isin(TRAIN_DATES)
    selected_features = select_features(features[in_training].copy())

    (X_train, y_train,
     X_val, y_val,
     X_test, y_test,
     scaler) = normalize_and_split(features, selected_features)

    save_selection_outputs(selected_features, scaler)

    # Bundle the six split pieces into one file for the training step.
    split_data = dict(
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        X_test=X_test,
        y_test=y_test,
    )
    split_path = OUTPUT_DIR / 'split_data.pkl'
    joblib.dump(split_data, split_path)
    print(f"✓ Saved split data to: {split_path}")

    print("\n" + rule)
    print("FEATURE SELECTION COMPLETE")
    print(rule)
    print(f"Selected features: {len(selected_features)}")
    print(f"Train samples: {len(X_train):,}")
    print(f"Val samples: {len(X_val):,}")
    print(f"Test samples: {len(X_test):,}")
    

# Run the full pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'config'