In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
from typing import Dict, Any
import logging
from dataclasses import dataclass
import warnings
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV # noqa
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library warnings (for example
# notices about failed fits) — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Configure logging: INFO level, lines formatted as "time - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger named after the current module.
logger = logging.getLogger(__name__)

In [2]:
# Read the training table and the test table from CSV files in the working directory.
train_data = pd.read_csv('train_data.csv')
test_features = pd.read_csv('test_data.csv')

In [3]:
player_numbers = ['1', '2', '3', '4', '5']
stats = ['gold', 'xp', 'roshans_killed', 'firstblood_claimed', 'camps_stacked', 'creeps_stacked', 'lh', 'deaths']

# Per-player leads leave out roshan kills and first blood; team totals leave out first blood only.
stats_with_player_lead = [s for s in stats if s not in ('roshans_killed', 'firstblood_claimed')]
stats_with_team_lead = [s for s in stats if s != 'firstblood_claimed']

# Weight applied to the player-1 lead when it is combined with the player-2 lead.
carry_multiplier = 1.10

# The same columns are appended, in the same order, to both tables (in place).
for frame in (train_data, test_features):

    # Custom features: weighted player-1 lead plus player-2 lead (dire minus radiant)
    for resource in ('gold', 'xp'):
        first_lead = frame[f'd1_{resource}'] - frame[f'r1_{resource}']
        second_lead = frame[f'd2_{resource}'] - frame[f'r2_{resource}']
        frame[f'dire_carries_{resource}_lead'] = first_lead * carry_multiplier + second_lead

    # Team stats: per-team row sums over the five players, then the dire-minus-radiant lead
    for stat in stats_with_team_lead:
        dire_columns = [f'd{number}_{stat}' for number in player_numbers]
        radiant_columns = [f'r{number}_{stat}' for number in player_numbers]
        frame[f'dire_team_{stat}'] = frame[dire_columns].sum(axis=1)
        frame[f'radiant_team_{stat}'] = frame[radiant_columns].sum(axis=1)
        frame[f'dire_team_{stat}_lead'] = frame[f'dire_team_{stat}'] - frame[f'radiant_team_{stat}']

    # Player stat leads: dire player N minus radiant player N, for every retained stat
    for number in player_numbers:
        for stat in stats_with_player_lead:
            dire_value = frame[f'd{number}_{stat}']
            radiant_value = frame[f'r{number}_{stat}']
            frame[f'd{number}_{stat}_lead'] = dire_value - radiant_value

In [4]:
# Remove unwanted columns
# Raw per-player columns to discard: every stat plus the x/y position columns.
# firstblood_claimed is left out of the list (the original note says it is
# not in the test data).
player_stats_to_drop = [stat for stat in stats + ['x', 'y'] if stat != 'firstblood_claimed']

# lobby_type goes first, then one column per (stat, team, player) combination.
columns_to_drop = ['lobby_type'] + [
    f'{team}{number}_{stat}'
    for stat in player_stats_to_drop
    for team in ['r', 'd']
    for number in player_numbers
]

# Drop in place from both tables.
for frame in (train_data, test_features):
    frame.drop(columns=columns_to_drop, inplace=True)

In [5]:
# Display the training frame after feature engineering and column removal.
train_data

Unnamed: 0,game_time,game_mode,objectives_len,chat_len,r1_hero_id,r1_kills,r1_assists,r1_denies,r1_health,r1_max_health,...,d4_camps_stacked_lead,d4_creeps_stacked_lead,d4_lh_lead,d4_deaths_lead,d5_gold_lead,d5_xp_lead,d5_camps_stacked_lead,d5_creeps_stacked_lead,d5_lh_lead,d5_deaths_lead
0,871,22,4,2,110,2,11,3,497,1120,...,4,8,23,5,-3350,-4142,0,0,-77,5
1,2549,22,17,0,114,16,12,24,2086,2600,...,-1,-3,152,-1,2797,-253,0,0,85,5
2,1841,22,8,1,100,2,12,2,1009,1500,...,3,5,7,-1,-2107,-1747,-1,-3,-133,-2
3,2211,22,11,3,32,14,11,21,1166,1940,...,0,0,-55,8,1459,-818,-1,-2,77,0
4,458,22,1,0,68,3,0,15,235,820,...,0,0,9,0,-166,24,0,0,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29670,1664,3,8,0,17,1,1,8,1440,1440,...,1,2,192,1,3605,2744,-3,-10,19,-9
29671,2898,22,26,108,89,4,17,5,1928,2600,...,0,0,21,1,-7952,-1388,0,0,-76,-3
29672,1246,23,6,10,51,9,15,16,4055,4055,...,0,0,-7,0,3896,5710,0,0,-7,0
29673,2620,22,15,10,114,9,5,10,3205,3205,...,2,6,280,0,-7072,-7,-1,-2,-185,-1


In [6]:
# Hold out 20% of the training rows for validation.
# The split is seeded so a fresh "Restart & Run All" reproduces the same
# partition, and stratified on the target so both parts keep the same
# radiant_win class ratio.
X_train, X_val = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
    stratify=train_data['radiant_win'],
)

# Separate the target from the features; ID is an identifier, not a feature.
y_train = X_train['radiant_win']
X_train = X_train.drop(['radiant_win', 'ID'], axis=1)

y_val = X_val['radiant_win']
X_val = X_val.drop(['radiant_win', 'ID'], axis=1)

# The test table has no target column to remove, only the identifier.
X_test = test_features.drop('ID', axis=1)

In [7]:
import matplotlib.pyplot as plt

def plot_scaler_results(scaler_results_dict):
    """
    Plot ROC AUC vs number of features for each scaler.

    Args:
        scaler_results_dict: dictionary with scaler names as keys and, as values,
            a mapping of number-of-features -> ROC AUC. Scalers whose mapping is
            empty are skipped.

    One panel is drawn per scaler on a two-column grid that grows with the
    number of scalers. All panels share the same y-limits, the best point of
    each curve is highlighted, and the figure is shown. Returns None; nothing
    is drawn when there is no data at all.
    """
    # Keep only scalers that actually have measurements: min()/max() need data.
    populated = {name: scores for name, scores in scaler_results_dict.items() if scores}
    if not populated:
        return

    # Size the grid from the input instead of assuming at most four scalers.
    n_cols = 2
    n_rows = -(-len(populated) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Consistent y-axis limits for comparison, computed once for all panels.
    all_scores = [score for scores in populated.values() for score in scores.values()]
    y_limits = [min(all_scores) - 0.001, max(all_scores) + 0.001]

    for ax, (scaler_name, results) in zip(axes, populated.items()):
        # Extract data for plotting
        n_features = list(results.keys())
        roc_scores = list(results.values())

        # Create the plot
        ax.plot(n_features, roc_scores, 'b-o', linewidth=2, markersize=4)
        ax.set_title(f'{scaler_name}', fontsize=14, fontweight='bold')
        ax.set_xlabel('Number of features selected', fontsize=12)
        ax.set_ylabel('ROC AUC', fontsize=12)
        ax.grid(True, alpha=0.3)

        # Find and highlight best point
        best_score = max(roc_scores)
        best_idx = roc_scores.index(best_score)
        ax.plot(n_features[best_idx], best_score, 'ro', markersize=8,
                label=f'Best: {best_score:.4f}')
        ax.legend()

        ax.set_ylim(y_limits)

    # Hide grid cells that have no scaler to show (odd number of scalers).
    for spare_ax in axes[len(populated):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.suptitle('ROC AUC vs Number of Features by Scaler', fontsize=16, fontweight='bold', y=1.02)
    plt.show()



In [8]:

@dataclass
class FeatureSelectionResult:
    """Data class to hold feature selection results.

    Bundles the winning scaler configuration, the selected features, the tuned
    model with its scores and parameters, and the test-set predictions.
    """
    # Name of the scaler configuration that was chosen
    best_scaler_name: str
    # Scaler object belonging to that configuration
    best_scaler: Any
    # Positional indices of the selected feature columns
    feature_indices: np.ndarray
    # Best estimator found by the hyperparameter search
    final_model: RandomForestClassifier
    # Names of the selected features (same order as feature_indices)
    feature_names: np.ndarray
    # ROC AUC of final_model on the validation set
    val_roc_auc: float
    # Number of selected features
    best_n_features: int
    # Hyperparameters of the best search candidate
    best_params: Dict
    # Positive-class probabilities predicted for the test set
    test_predictions: np.ndarray

class OptimizedFeatureSelector:
    """
    High-performance feature selection pipeline optimized for ROC AUC using RFECV
    """

    def __init__(self, cv_folds: int = 5, n_jobs: int = -1, random_state: int = 42,
                 rfecv_step: float = 5, rfecv_min_features: int = 5):
        """Configure the selection pipeline.

        Args:
            cv_folds: Number of cross-validation folds for RFECV and the search.
            n_jobs: Parallelism passed to the base estimator, RFECV and the search.
            random_state: Seed shared by the estimator, the CV splitter and the search.
            rfecv_step: ``step`` argument forwarded to RFECV.
            rfecv_min_features: Minimum number of features RFECV may keep.
        """
        self.cv_folds = cv_folds
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.rfecv_step = rfecv_step
        self.rfecv_min_features = rfecv_min_features  # Minimum number of features to select
        self.estimator = RandomForestClassifier(
            n_estimators=600,
            random_state=self.random_state,
            # Honour the constructor argument instead of always using every core.
            n_jobs=self.n_jobs,
        )

        # Hyperparameters that are valid with and without bootstrapping.
        shared_grid = {
            # Forest size
            'n_estimators': [300, 500, 800],
            # Tree depth limit (None = unlimited)
            'max_depth': [8, 12, 16, None],
            # Number/fraction of features considered at each split
            'max_features': ['sqrt', 'log2', 0.3, 0.5],
            # Minimum samples required to split an internal node
            'min_samples_split': [2, 5, 10, 15],
            # Minimum samples required at a leaf
            'min_samples_leaf': [1, 2, 4, 8],
            # Class re-weighting strategies
            'class_weight': ['balanced', 'balanced_subsample', None],
            # Split quality criterion
            'criterion': ['gini', 'entropy'],
        }
        # `max_samples` is only meaningful when bootstrapping: RandomForest
        # raises at fit time if it is set while bootstrap=False. Splitting the
        # grid keeps every valid combination and drops the ones that could
        # only ever fail.
        self.rf_param_grid = [
            {**shared_grid, 'bootstrap': [True], 'max_samples': [0.7, 0.8, 0.9, None]},
            {**shared_grid, 'bootstrap': [False], 'max_samples': [None]},
        ]
        # Scalers to test
        self.scalers = {
            'RobustScaler': RobustScaler(),
            'MinMaxScaler': MinMaxScaler(),
            # 'StandardScaler': StandardScaler(),
            # 'No Scaling': None  # No scaling option
        }

    def _validate_inputs(self, X_train, y_train, X_val, y_val, X_test, feature_names):
        """Validate and convert inputs to consistent format"""
        # Convert to numpy arrays
        X_train = self._to_numpy(X_train)
        X_val = self._to_numpy(X_val)
        X_test = self._to_numpy(X_test)
        y_train = self._to_numpy(y_train).ravel()
        y_val = self._to_numpy(y_val).ravel()

        # Validate feature names
        feature_names = self._validate_feature_names(feature_names, X_train.shape[1])

        # Basic validation
        assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], "Feature dimension mismatch"
        assert len(feature_names) == X_train.shape[1], "Feature names length mismatch"

        return X_train, y_train, X_val, y_val, X_test, feature_names

    def _to_numpy(self, data):
        """Convert data to numpy array"""
        if hasattr(data, 'values'):
            return data.values
        return np.array(data) if not isinstance(data, np.ndarray) else data

    def _validate_feature_names(self, feature_names, n_features):
        """Validate and convert feature names"""
        if feature_names is None:
            return np.array([f'feature_{i}' for i in range(n_features)])

        if hasattr(feature_names, 'values'):
            return feature_names.values
        elif hasattr(feature_names, '__iter__') and not isinstance(feature_names, str):
            return np.array(list(feature_names))
        else:
            return np.array(feature_names)

    def _apply_scaling(self, scaler, X_train, X_val, X_test):
        """Apply scaling transformation"""
        if scaler is None:
            return X_train.copy(), X_val.copy(), X_test.copy()

        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)

        return X_train_scaled, X_val_scaled, X_test_scaled

    def _perform_rfecv(self, X_train, y_train):
        """Run recursive feature elimination with cross-validation.

        Wraps ``self.estimator`` in an ``RFECV`` scored by ROC AUC on shuffled,
        seeded stratified folds, fits it on the given data and returns the
        fitted selector.
        """
        fold_splitter = StratifiedKFold(
            n_splits=self.cv_folds,
            shuffle=True,
            random_state=self.random_state,
        )
        selector = RFECV(
            estimator=self.estimator,
            step=self.rfecv_step,
            min_features_to_select=self.rfecv_min_features,
            cv=fold_splitter,
            scoring='roc_auc',
            n_jobs=self.n_jobs,
            verbose=1,
        )

        logger.info("Starting RFECV feature selection...")
        selector.fit(X_train, y_train)
        logger.info(f"RFECV selected {selector.n_features_} features out of {X_train.shape[1]}")
        return selector

    def _optimize_hyperparameters(self, X_train, y_train):
        """Tune the forest with a successive-halving grid search.

        Searches ``self.rf_param_grid`` around ``self.estimator`` with
        ``HalvingGridSearchCV`` scored by ROC AUC and returns the fitted
        search object.
        """
        search_options = dict(
            estimator=self.estimator,
            param_grid=self.rf_param_grid,
            scoring='roc_auc',
            n_jobs=self.n_jobs,
            cv=self.cv_folds,
            return_train_score=False,
            random_state=self.random_state,
            verbose=1,
        )
        halving_search = HalvingGridSearchCV(**search_options)
        halving_search.fit(X_train, y_train)
        return halving_search
    
    def _create_feature_importance_df(self, rfecv, feature_names, selected_features_mask, selected_feature_indices, scaler_name):
        
        # Create feature importance DataFrame for visualization
        if hasattr(rfecv.estimator_, 'feature_importances_'):
            # Get feature importance from the final RFECV estimator
            all_importances = np.zeros(len(feature_names))
            all_importances[selected_feature_indices] = rfecv.estimator_.feature_importances_

            feature_imp_df = pd.DataFrame({
                'Feature': feature_names,
                'Selected': selected_features_mask,
                'Importance': all_importances,
                'Ranking': rfecv.ranking_
            }).sort_values('Importance', ascending=False)
            feature_imp_df.to_csv(f'rfecv_results_{scaler_name.lower()}.csv', index=False)
            logger.info(f"Feature importances saved to rfecv_results_{scaler_name.lower()}.csv")
            
    def _evaluate_scaler(self, scaler_name, scaler, X_train, X_val, X_test, y_train, y_val, feature_names):
        """Run the full scale -> RFECV -> tune -> validate sequence for one scaler.

        Returns a dict describing the configuration: the scaler, the fitted
        RFECV, the selected column indices and their count, validation and CV
        ROC AUC, the best parameters and model, and the scaled
        ``(train, val, test)`` matrices.
        """
        logger.info(f"Testing {scaler_name}...")

        # Scale all three matrices with a scaler fitted on the training data
        scaled_train, scaled_val, scaled_test = self._apply_scaling(scaler, X_train, X_val, X_test)

        # Feature selection on the scaled training data
        rfecv = self._perform_rfecv(scaled_train, y_train)
        support_mask = rfecv.support_
        kept_columns = np.flatnonzero(support_mask)

        self._create_feature_importance_df(rfecv, feature_names, support_mask, kept_columns, scaler_name)

        # Hyperparameter search restricted to the selected columns
        logger.info("Optimizing hyperparameters on selected features...")
        search = self._optimize_hyperparameters(scaled_train[:, kept_columns], y_train)

        # Score the tuned model on the held-out validation rows
        tuned_model = search.best_estimator_
        val_probabilities = tuned_model.predict_proba(scaled_val[:, kept_columns])[:, 1]
        val_auc = roc_auc_score(y_val, val_probabilities)

        logger.info(f"{scaler_name} - Validation ROC AUC: {val_auc:.4f}")
        logger.info(f"{scaler_name} - CV ROC AUC: {search.best_score_:.4f}")

        summary = {
            'scaler_name': scaler_name,
            'scaler': scaler,
            'rfecv': rfecv,
            'selected_feature_indices': kept_columns,
            'n_features': len(kept_columns),
            'val_roc_auc': val_auc,
            'cv_roc_auc': search.best_score_,
            'best_params': search.best_params_,
            'best_model': tuned_model,
            'scaled_data': (scaled_train, scaled_val, scaled_test),
        }
        return summary

    def fit(self, X_train, y_train, X_val, y_val, X_test) -> FeatureSelectionResult:
        """
        Main method to perform optimized feature selection with RFECV

        Every configured scaler is evaluated; the configuration with the
        highest cross-validated ROC AUC wins. A scaler that raises is logged
        and skipped so the remaining scalers still run.

        Args:
            X_train, y_train: Training data. When X_train exposes ``columns``
                (e.g. a DataFrame) they are used as feature names; otherwise
                names are generated.
            X_val, y_val: Validation data
            X_test: Test data

        Returns:
            FeatureSelectionResult with optimized model and features

        Raises:
            ValueError: If no scaler configuration could be evaluated; the last
                underlying error, if any, is attached as the cause.
        """
        logger.info("Starting optimized feature selection pipeline with RFECV...")
        # Plain arrays have no ``columns``; None makes validation generate names.
        feature_names = getattr(X_train, 'columns', None)
        # Validate inputs
        X_train, y_train, X_val, y_val, X_test, feature_names = self._validate_inputs(
            X_train, y_train, X_val, y_val, X_test, feature_names
        )
        best_overall_score = float('-inf')
        best_config = None
        last_error = None

        # Test each scaler
        for scaler_name, scaler in self.scalers.items():
            try:
                config = self._evaluate_scaler(
                    scaler_name, scaler, X_train, X_val, X_test,
                    y_train, y_val, feature_names
                )

                if config and config['cv_roc_auc'] > best_overall_score:
                    best_overall_score = config['cv_roc_auc']
                    best_config = config

            except Exception as e:
                # Best effort: keep going, but record the traceback and the cause.
                last_error = e
                logger.warning(f"Error evaluating {scaler_name}: {str(e)}", exc_info=True)
                continue
        if best_config is None:
            raise ValueError("No successful configuration found!") from last_error
        logger.info(f"Best overall: {best_config['scaler_name']} with CV ROC AUC {best_overall_score:.4f}")
        logger.info(f"Selected {best_config['n_features']} features")

        return self._create_final_result(best_config, feature_names)

    def _create_final_result(self, best_config, feature_names) -> FeatureSelectionResult:
        """Package the winning configuration and its test predictions.

        Looks up the selected columns and the tuned model in ``best_config``,
        predicts positive-class probabilities for the scaled test matrix, logs
        the scores and returns a ``FeatureSelectionResult``.
        """
        chosen_columns = best_config['selected_feature_indices']
        chosen_names = feature_names[chosen_columns]
        tuned_model = best_config['best_model']

        # 'scaled_data' is (train, val, test); position 2 is the test matrix
        scaled_test = best_config['scaled_data'][2]
        test_probabilities = tuned_model.predict_proba(scaled_test[:, chosen_columns])[:, 1]

        logger.info(f"Final model Validation ROC AUC: {best_config['val_roc_auc']:.4f}")
        logger.info(f"Final model CV ROC AUC: {best_config['cv_roc_auc']:.4f}")
        logger.info(f"Selected features: {chosen_names[:5]}...")

        result_fields = dict(
            best_scaler_name=best_config['scaler_name'],
            best_scaler=best_config['scaler'],
            feature_indices=chosen_columns,
            final_model=tuned_model,
            feature_names=chosen_names,
            val_roc_auc=best_config['val_roc_auc'],
            best_n_features=best_config['n_features'],
            best_params=best_config['best_params'],
            test_predictions=test_probabilities,
        )
        return FeatureSelectionResult(**result_fields)

# Usage function
def optimized_feature_selection(X_train, y_train, X_val, y_val, X_test) -> FeatureSelectionResult:
    """
    Convenience wrapper: build a default OptimizedFeatureSelector and fit it.

    Args:
        X_train, y_train: Training data and labels
        X_val, y_val: Validation data and labels
        X_test: Test data
    Returns:
        FeatureSelectionResult produced by ``OptimizedFeatureSelector.fit``
    """
    return OptimizedFeatureSelector().fit(X_train, y_train, X_val, y_val, X_test)

In [None]:

# Run the pipeline on the prepared train / validation / test splits.
results: FeatureSelectionResult = optimized_feature_selection(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
)

2025-06-05 14:02:20,331 - INFO - Starting optimized feature selection pipeline with RFECV...
2025-06-05 14:02:20,405 - INFO - Testing RobustScaler...
2025-06-05 14:02:20,598 - INFO - Starting RFECV feature selection...


In [None]:
print(results)
# Prints the FeatureSelectionResult dataclass. Its fields are:
#   best_scaler_name, best_scaler, feature_indices, final_model,
#   feature_names, val_roc_auc, best_n_features, best_params,
#   test_predictions

In [None]:
# Build the submission column by column so each column keeps its own dtype.
# Stacking IDs and probabilities into a single array forces them to a common
# dtype (integer IDs, for instance, would be written out as floats).
submission_df = pd.DataFrame({
    'ID': test_features['ID'].values,
    'radiant_win': results.test_predictions,
})
# index=False: the row index is not part of the submission file.
submission_df.to_csv('simple_baseline.csv', index=False)