## Ensemble Approach

### Models Used: 
#### - Trees: LightGBM, CatBoost
#### - Neural Network: TabNet
#### - Time Series: N-HiTS
#### - Linear: Ridge

In [1]:
# ==============================
# Imports
# ==============================

import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import lightgbm as lgb
import catboost as cb
from sklearn.linear_model import Ridge
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
import torch

In [None]:
# ==============================
# Data Handling
# ==============================

# Load train data:
df_train = pd.read_parquet('train.parquet', engine='pyarrow')
print(df_train.head)

# Get list of all feature columns
feature_columns = [col for col in df_train.columns if col != 'label']  # Exclude target if present
print(f"Total features: {len(feature_columns)}")
print(feature_columns)

# Load test data:
df_competition = pd.read_parquet('test.parquet', engine='pyarrow')
print(df_competition.head)

<bound method NDFrame.head of                      bid_qty  ask_qty  buy_qty  sell_qty   volume        X1  \
2023-03-01 00:00:00   15.283    8.425  176.405    44.984  221.389  0.181844   
2023-03-01 00:01:00   38.590    2.336  525.846   321.950  847.796  0.489497   
2023-03-01 00:02:00    0.442   60.250  159.227   136.369  295.596  0.260121   
2023-03-01 00:03:00    4.865   21.016  335.742   124.963  460.705  0.099976   
2023-03-01 00:04:00   27.158    3.451   98.411    44.407  142.818  0.270893   
...                      ...      ...      ...       ...      ...       ...   
2024-02-29 23:55:00    4.163    6.805   39.037    55.351   94.388  0.010535   
2024-02-29 23:56:00    2.290    4.058  110.201    67.171  177.372  0.003939   
2024-02-29 23:57:00    5.237    3.640   70.499    30.753  101.252  0.053320   
2024-02-29 23:58:00    5.731    4.901   22.365    52.195   74.560  0.187808   
2024-02-29 23:59:00    3.925    3.865   86.585   217.102  303.687  0.601014   

                     

In [None]:
# ==============================
# Feature Selection for Crypto Trading Data
# ==============================

import gc
from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_regression
from sklearn.linear_model import LassoCV
from scipy.stats import spearmanr
import warnings
warnings.filterwarnings('ignore')

NUM_FEATURES = 200  # Target number of features to keep

def calculate_feature_importance_metrics(X, y, feature_names):
    """
    Calculate multiple feature importance metrics for crypto trading data
    """
    print("Calculating feature importance metrics...")
    results = {}
    
    # 1. MUTUAL INFORMATION (Best for non-linear relationships)
    print("  Computing mutual information...")
    mi_scores = mutual_info_regression(X, y, random_state=42, n_neighbors=5)
    results['mutual_info'] = dict(zip(feature_names, mi_scores))
    
    # 2. CORRELATION WITH TARGET (Linear relationships)
    print("  Computing correlations...")
    correlations = []
    for i, col in enumerate(feature_names):
        try:
            corr = np.corrcoef(X[:, i], y)[0, 1]
            correlations.append(abs(corr) if not np.isnan(corr) else 0)
        except:
            correlations.append(0)
    results['correlation'] = dict(zip(feature_names, correlations))
    
    # 3. SPEARMAN CORRELATION (Monotonic relationships)
    print("  Computing Spearman correlations...")
    spearman_corrs = []
    for i, col in enumerate(feature_names):
        try:
            corr, _ = spearmanr(X[:, i], y)
            spearman_corrs.append(abs(corr) if not np.isnan(corr) else 0)
        except:
            spearman_corrs.append(0)
    results['spearman'] = dict(zip(feature_names, spearman_corrs))
    
    # 4. F-STATISTIC (Linear model relevance)
    print("  Computing F-statistics...")
    f_scores, _ = f_regression(X, y)
    results['f_statistic'] = dict(zip(feature_names, f_scores))
    
    # 5. LASSO REGULARIZATION (Sparse selection)
    print("  Computing Lasso coefficients...")
    try:
        # Use a subset for Lasso to avoid memory issues
        sample_size = min(10000, len(X))
        idx = np.random.choice(len(X), sample_size, replace=False)
        lasso = LassoCV(cv=3, random_state=42, max_iter=1000)
        lasso.fit(X[idx], y[idx])
        lasso_coefs = abs(lasso.coef_)
        results['lasso'] = dict(zip(feature_names, lasso_coefs))
    except Exception as e:
        print(f"    Lasso failed: {e}")
        results['lasso'] = dict(zip(feature_names, [0] * len(feature_names)))
    
    return results

def calculate_feature_stability(X, feature_names, window_size=1000):
    """
    Calculate feature stability over time (important for crypto)
    """
    print("  Computing feature stability...")
    stabilities = []
    
    for i, col in enumerate(feature_names):
        feature_data = X[:, i]
        
        # Calculate rolling standard deviation
        if len(feature_data) > window_size:
            rolling_stds = []
            for j in range(0, len(feature_data) - window_size, window_size // 2):
                window_data = feature_data[j:j + window_size]
                rolling_stds.append(np.std(window_data))
            
            # Stability = inverse of variance in rolling std
            stability = 1 / (1 + np.var(rolling_stds)) if len(rolling_stds) > 1 else 0
        else:
            stability = 0
            
        stabilities.append(stability)
    
    return dict(zip(feature_names, stabilities))

def remove_highly_correlated_features(X, feature_names, threshold=0.95):
    """
    Remove features that are highly correlated with each other
    """
    print("  Removing highly correlated features...")
    
    # Calculate correlation matrix
    corr_matrix = np.corrcoef(X.T)
    
    # Find highly correlated pairs
    to_remove = set()
    for i in range(len(corr_matrix)):
        for j in range(i + 1, len(corr_matrix)):
            if abs(corr_matrix[i, j]) > threshold:
                # Remove the feature with lower variance
                var_i = np.var(X[:, i])
                var_j = np.var(X[:, j])
                to_remove.add(j if var_i > var_j else i)
    
    # Keep features not in removal set
    keep_indices = [i for i in range(len(feature_names)) if i not in to_remove]
    
    print(f"    Removed {len(to_remove)} highly correlated features")
    return keep_indices

def select_best_features(X, y, feature_names, target_count=200):
    """
    Comprehensive feature selection for crypto trading
    """
    print(f"Starting feature selection: {len(feature_names)} -> {target_count} features")

    # clean data:

    
    
    # Step 1: Remove highly correlated features
    keep_indices = remove_highly_correlated_features(X, feature_names, threshold=0.95)
    X_filtered = X[:, keep_indices]
    feature_names_filtered = [feature_names[i] for i in keep_indices]
    
    print(f"After correlation filtering: {len(feature_names_filtered)} features")
    
    # Step 2: Calculate importance metrics
    importance_metrics = calculate_feature_importance_metrics(X_filtered, y, feature_names_filtered)
    
    # Step 3: Calculate stability
    stability_scores = calculate_feature_stability(X_filtered, feature_names_filtered)
    
    # Step 4: Combine metrics with weights optimized for crypto trading
    feature_scores = {}
    for feature in feature_names_filtered:
        # Weighted combination - prioritize metrics that work well for crypto
        score = (
            0.30 * importance_metrics['mutual_info'][feature] +      # Non-linear patterns
            0.25 * importance_metrics['spearman'][feature] +         # Monotonic relationships  
            0.20 * importance_metrics['correlation'][feature] +      # Linear relationships
            0.15 * (importance_metrics['f_statistic'][feature] / max(importance_metrics['f_statistic'].values())) +  # Normalized F-stat
            0.05 * importance_metrics['lasso'][feature] +            # Sparse selection
            0.05 * stability_scores[feature]                         # Temporal stability
        )
        feature_scores[feature] = score
    
    # Step 5: Select top features
    top_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)[:target_count]
    selected_features = [feat[0] for feat in top_features]
    
    print(f"Final selection: {len(selected_features)} features")
    
    # Print top 10 features for inspection
    print("\nTop 10 features:")
    for i, (feat, score) in enumerate(top_features[:10]):
        print(f"  {i+1}. {feat}: {score:.4f}")
    
    return selected_features, feature_scores

# Memory-efficient feature selection
print("Loading and preparing data for feature selection...")

df_sample = df_train.copy()

# Replace infinities with NaN
df_sample.replace([np.inf, -np.inf], np.nan, inplace=True)

# Forward fill (appropriate for time series)
df_sample.fillna(method='ffill', inplace=True)

# Fill remaining NaNs with 0
df_sample.fillna(0, inplace=True)

# Prepare features and target
feature_columns = [col for col in df_sample.columns if col != 'label']
X_sample = df_sample[feature_columns].fillna(0).values  # Simple fillna for selection
y_sample = df_sample['label'].values


print(f"Feature selection data shape: {X_sample.shape}")

# Run feature selection
selected_features, feature_importance_scores = select_best_features(
    X_sample, y_sample, feature_columns, target_count=NUM_FEATURES
)

# Clean up memory
del X_sample, y_sample, df_sample
gc.collect()

print(f"\n✅ Feature selection completed!")
print(f"Selected {len(selected_features)} features out of {len(feature_columns)}")
print(f"Memory usage reduced by ~{(1 - len(selected_features)/len(feature_columns))*100:.1f}%")

Loading and preparing data for feature selection...
Feature selection data shape: (525887, 895)
Starting feature selection: 895 -> 200 features
  Removing highly correlated features...
    Removed 275 highly correlated features
After correlation filtering: 620 features
Calculating feature importance metrics...
  Computing mutual information...
  Computing correlations...
  Computing Spearman correlations...
  Computing F-statistics...
  Computing Lasso coefficients...
  Computing feature stability...
Final selection: 200 features

Top 10 features:
  1. X21: 0.2466
  2. X20: 0.2378
  3. X28: 0.2179
  4. X863: 0.2142
  5. X19: 0.2099
  6. X27: 0.2094
  7. X29: 0.2060
  8. X858: 0.1949
  9. X860: 0.1913
  10. X219: 0.1830

✅ Feature selection completed!
Selected 200 features out of 895
Memory usage reduced by ~77.7%


In [None]:
# ==============================
# Apply Feature Selection to Datasets
# ==============================

print("Applying feature selection to train and test datasets...")

# Apply to training data
print("Processing training data...")
df_train_filtered = df_train[['label'] + selected_features].copy()

# Apply to test data  
print("Processing test data...")
df_test_filtered = df_competition[selected_features].copy()

# Handle missing values properly
print("Handling missing values...")
for df in [df_train_filtered, df_test_filtered]:
    # Replace infinities with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Forward fill (appropriate for time series)
    df.fillna(method='ffill', inplace=True)
    
    # Fill remaining NaNs with 0
    df.fillna(0, inplace=True)

print(f"Final training data shape: {df_train_filtered.shape}")
print(f"Final test data shape: {df_test_filtered.shape}")
gc.collect()

print("✅ Data preprocessing completed!")

Applying feature selection to train and test datasets...
Processing training data...
Processing test data...
Handling missing values...
Final training data shape: (525887, 201)
Final test data shape: (538150, 200)
✅ Data preprocessing completed!


In [None]:
# ==============================
# Ensemble Model
# ==============================

"""
- Trees: LightGBM, CatBoost
- Neural Network: TabNet
- Time Series: N-HiTS
- Linear: Ridge

"""
models_config = {
    'lightgbm': {
        'weight': 0.40,
        'params': {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'num_leaves': 10,  # Very conservative
            'learning_rate': 0.005,  # Very slow learning
            'feature_fraction': 0.6,  # Use fewer features
            'bagging_fraction': 0.6,  # More aggressive bagging
            'bagging_freq': 5,
            'min_data_in_leaf': 50,  # Require more data per leaf
            'lambda_l1': 0.5,  # Strong L1 regularization
            'lambda_l2': 0.5,  # Strong L2 regularization
            'verbose': -1,
            'random_state': 42
        }
    },
    
    'catboost': {
        'weight': 0.30,
        'params': {
            'iterations': 200,  # Fewer iterations
            'learning_rate': 0.005,  # Very slow learning
            'depth': 3,  # Very shallow trees
            'l2_leaf_reg': 10,  # Strong regularization
            'loss_function': 'RMSE',
            'eval_metric': 'RMSE',
            'random_seed': 42,
            'verbose': False,
            'min_data_in_leaf': 50  # Require more data
        }
    },
    
    'ridge': {
        'weight': 0.20,
        'params': {
            'alpha': 50.0,  # Very strong regularization
            'random_state': 42
        }
    },
    
    # Correct TabNet configuration
    'tabnet': {
    'weight': 0.15,
    'params': {
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': 2e-2},
        'scheduler_params': {'step_size': 50, 'gamma': 0.9},
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'entmax',
        'n_d': 8,
        'n_a': 8,
        'n_steps': 3,
        'gamma': 1.3,
        'n_independent': 2,
        'n_shared': 2,
        'lambda_sparse': 1e-4,
        'momentum': 0.3,
        'clip_value': 2,
        'verbose': 10
        }
    },
    
    'extra_trees': {
        'weight': 0.10,
        'params': {
            'n_estimators': 25,  # Fewer trees
            'max_depth': 4,  # Very shallow
            'min_samples_split': 20,  # Require more data to split
            'min_samples_leaf': 10,  # Require more data per leaf
            'random_state': 42
        }
    }
}


In [None]:
class CompetitiveEnsemble:
    def __init__(self, models_config):
        self.models_config = models_config
        self.models = {}
        self.weights = {}
        self.is_fitted = False
        
    def train_model(self, model_name, config, X_train, y_train, X_val=None, y_val=None):
        """Train individual model"""
        
        if model_name == 'lightgbm':
            train_data = lgb.Dataset(X_train, label=y_train)
            if X_val is not None:
                val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
                model = lgb.train(
                    config['params'],
                    train_data,
                    valid_sets=[val_data],
                    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
                )
            else:
                model = lgb.train(config['params'], train_data)
                
        elif model_name == 'catboost':
            if X_val is not None:
                model = cb.CatBoostRegressor(**config['params'])
                model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)
            else:
                model = cb.CatBoostRegressor(**config['params'])
                model.fit(X_train, y_train, verbose=False)
                
        elif model_name == 'ridge':
            model = Ridge(**config['params'])
            model.fit(X_train, y_train)
            
        elif model_name == 'tabnet':
            model = TabNetRegressor(**config['params'])
            # Reshape y_train and y_val to 2D
            y_train_2d = y_train.reshape(-1, 1)
            y_val_2d = y_val.reshape(-1, 1) if y_val is not None else None
            
            model.fit(
                X_train, y_train_2d,  # Use 2D target
                eval_set=[(X_val, y_val_2d)] if y_val_2d is not None else None,
                max_epochs=50,
                patience=20,
                batch_size=256,
                virtual_batch_size=128,
                num_workers=0,
                drop_last=False
            )
            
        elif model_name == 'extra_trees':
            from sklearn.ensemble import ExtraTreesRegressor
            model = ExtraTreesRegressor(**config['params'])
            model.fit(X_train, y_train)
            
        return model
    
    def fit(self, X, y, validation_split=0.2):
        """Train all models and determine optimal weights"""
        print("Training competitive ensemble...")
        
        # Split data for validation
        split_idx = int(len(X) * (1 - validation_split))
        X_train, X_val = X[:split_idx], X[split_idx:]
        y_train, y_val = y[:split_idx], y[split_idx:]
        
        # Train all models
        for model_name, config in self.models_config.items():
            print(f"Training {model_name}...")
            try:
                self.models[model_name] = self.train_model(
                    model_name, config, X_train, y_train, X_val, y_val
                )
            except Exception as e:
                print(f"Error training {model_name}: {e}")
                continue
        
        # Get validation predictions
        val_predictions = {}
        for model_name, model in self.models.items():
            try:
                preds = model.predict(X_val)
                val_predictions[model_name] = preds
            except Exception as e:
                print(f"Error predicting with {model_name}: {e}")
                continue
        
        # Calculate adaptive weights
        base_weights = {name: config['weight'] for name, config in self.models_config.items() 
                       if name in self.models}
        
        self.weights = self.calculate_adaptive_weights(val_predictions, y_val, base_weights)
        
        # Print model performance
        print("\n=== MODEL PERFORMANCE ===")
        for model_name, preds in val_predictions.items():
            # Ensure both are 1D for correlation calculation
            preds_1d = preds.ravel() if preds.ndim > 1 else preds
            actual_1d = y_val.ravel() if y_val.ndim > 1 else y_val
            
            corr = np.corrcoef(actual_1d, preds_1d)[0, 1]
            rmse = np.sqrt(mean_squared_error(actual_1d, preds_1d))
            weight = self.weights[model_name]
            print(f"{model_name}: Corr={corr:.4f}, RMSE={rmse:.4f}, Weight={weight:.3f}")
        
        self.is_fitted = True
        return self
    
    def predict(self, X):
        """Make ensemble prediction"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        
        predictions = {}
        for model_name, model in self.models.items():
            try:
                preds = model.predict(X)
                # Ensure predictions are 1D
                preds_1d = preds.ravel() if preds.ndim > 1 else preds
                predictions[model_name] = preds_1d
            except Exception as e:
                print(f"Error predicting with {model_name}: {e}")
                continue
        
        # Weighted ensemble
        final_prediction = np.zeros(len(X))
        for model_name, preds in predictions.items():
            final_prediction += self.weights[model_name] * preds
        
        return final_prediction
    
    def calculate_adaptive_weights(self, predictions, actual, base_weights):
        """Calculate adaptive weights based on validation performance"""
        # Calculate performance metrics
        performance_scores = {}
        
        for model_name, preds in predictions.items():
            # Ensure predictions and actual are 1D for correlation calculation
            preds_1d = preds.ravel() if preds.ndim > 1 else preds
            actual_1d = actual.ravel() if actual.ndim > 1 else actual
            
            # Use Pearson correlation as primary metric (competition metric)
            corr = np.corrcoef(actual_1d, preds_1d)[0, 1]
            if np.isnan(corr):
                corr = 0
            performance_scores[model_name] = max(0, corr)  # Ensure non-negative
        
        # Normalize performance scores
        total_performance = sum(performance_scores.values())
        if total_performance > 0:
            performance_weights = {k: v/total_performance for k, v in performance_scores.items()}
        else:
            performance_weights = base_weights
        
        # Blend base weights with performance weights
        alpha = 0.7  # Weight for base weights vs performance
        final_weights = {}
        for model_name in base_weights.keys():
            final_weights[model_name] = (
                alpha * base_weights[model_name] + 
                (1 - alpha) * performance_weights[model_name]
            )
        
        # Renormalize
        total_weight = sum(final_weights.values())
        final_weights = {k: v/total_weight for k, v in final_weights.items()}
        
        return final_weights

In [None]:
# ==============================
# Train and Predict
# ==============================

# Prepare data (using your already filtered 2024+ data)
X_train = df_train_filtered.drop('label', axis=1).values
y_train = df_train_filtered['label'].values
X_test = df_test_filtered.values

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# Initialize and train ensemble
ensemble = CompetitiveEnsemble(models_config)
ensemble.fit(X_train, y_train, validation_split=0.2)

# Make predictions
predictions = ensemble.predict(X_test)

print(f"\n=== ENSEMBLE RESULTS ===")
print(f"Predictions shape: {predictions.shape}")
print(f"Prediction range: [{predictions.min():.4f}, {predictions.max():.4f}]")
print(f"Prediction mean: {predictions.mean():.4f}")
print(f"Prediction std: {predictions.std():.4f}")

# Save predictions
submission = pd.DataFrame({
    'ID': df_competition.index,
    'prediction': predictions
})
submission.to_csv('ensemble_submission.csv', index=False)
print("✅ Submission saved as 'ensemble_submission.csv'")

Training data shape: (525887, 200)
Test data shape: (538150, 200)
Training competitive ensemble...
Training lightgbm...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	valid_0's rmse: 1.04052
Training catboost...
Training ridge...
Training tabnet...
epoch 0  | loss: 0.99065 | val_0_mse: 1.14846 |  0:00:51s
epoch 10 | loss: 0.37332 | val_0_mse: 2.89719 |  0:09:27s
epoch 20 | loss: 0.30245 | val_0_mse: 2.22098 |  0:18:04s

Early stopping occurred at epoch 20 with best_epoch = 0 and best_val_0_mse = 1.14846
Training extra_trees...

=== MODEL PERFORMANCE ===
lightgbm: Corr=0.0164, RMSE=1.0405, Weight=0.280
catboost: Corr=0.0055, RMSE=1.0405, Weight=0.199
ridge: Corr=0.0891, RMSE=1.1470, Weight=0.272
tabnet: Corr=0.0305, RMSE=1.0717, Weight=0.145
extra_trees: Corr=0.0256, RMSE=1.0692, Weight=0.105

=== ENSEMBLE RESULTS ===
Predictions shape: (538150,)
Prediction range: [-1.7145, 2.4904]
Prediction mean: -0.0031
Prediction std: 0.1939
✅ Sub