In [366]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

# Read the training frame (it carries the `radiant_win` target and `ID` columns used
# further down) and the test frame from CSV files in the current working directory.
df = pd.read_csv('train_data.csv')
test_features = pd.read_csv('test_data.csv')

In [367]:
# Print every column name of the training frame as a plain list (not truncated like the frame repr).
print(list(df.columns))

['game_time', 'game_mode', 'lobby_type', 'objectives_len', 'chat_len', 'r1_hero_id', 'r1_kills', 'r1_deaths', 'r1_assists', 'r1_denies', 'r1_gold', 'r1_lh', 'r1_xp', 'r1_health', 'r1_max_health', 'r1_max_mana', 'r1_level', 'r1_x', 'r1_y', 'r1_stuns', 'r1_creeps_stacked', 'r1_camps_stacked', 'r1_rune_pickups', 'r1_firstblood_claimed', 'r1_teamfight_participation', 'r1_towers_killed', 'r1_roshans_killed', 'r1_obs_placed', 'r1_sen_placed', 'r2_hero_id', 'r2_kills', 'r2_deaths', 'r2_assists', 'r2_denies', 'r2_gold', 'r2_lh', 'r2_xp', 'r2_health', 'r2_max_health', 'r2_max_mana', 'r2_level', 'r2_x', 'r2_y', 'r2_stuns', 'r2_creeps_stacked', 'r2_camps_stacked', 'r2_rune_pickups', 'r2_firstblood_claimed', 'r2_teamfight_participation', 'r2_towers_killed', 'r2_roshans_killed', 'r2_obs_placed', 'r2_sen_placed', 'r3_hero_id', 'r3_kills', 'r3_deaths', 'r3_assists', 'r3_denies', 'r3_gold', 'r3_lh', 'r3_xp', 'r3_health', 'r3_max_health', 'r3_max_mana', 'r3_level', 'r3_x', 'r3_y', 'r3_stuns', 'r3_creep

In [368]:
dataframes = [df, test_features]
player_numbers = ['1', '2', '3', '4', '5']
stats = ['gold', 'xp']

# The derived columns are written into the training and the test frame alike,
# so both end up with the same feature layout.
for dataframe in dataframes:
    # Team Stats: per-side totals over the five player columns, then the
    # Dire-minus-Radiant gap of those totals.
    for stat in stats:
        dire_total = dataframe[[f'd{number}_{stat}' for number in player_numbers]].sum(axis=1)
        radiant_total = dataframe[[f'r{number}_{stat}' for number in player_numbers]].sum(axis=1)
        dataframe[f'dire_team_{stat}'] = dire_total
        dataframe[f'radiant_team_{stat}'] = radiant_total
        dataframe[f'dire_team_{stat}_lead'] = dire_total - radiant_total

    # Player Stats: d<N> minus r<N> for the same player number and stat.
    for number in player_numbers:
        for stat in stats:
            dire_values = dataframe[f'd{number}_{stat}']
            radiant_values = dataframe[f'r{number}_{stat}']
            dataframe[f'd{number}_{stat}_lead'] = dire_values.sub(radiant_values)




In [369]:
# Remove unwanted columns
# For every listed stat, the d1..d5 columns are queued first and the r1..r5 columns
# second; both frames then lose exactly the same set of columns.
stats_to_drop = ['gold', 'xp', 'x', 'y']
columns_to_drop = [
    f'{side}{number}_{stat}'
    for stat in stats_to_drop
    for side in ('d', 'r')
    for number in player_numbers
]

df = df.drop(columns=columns_to_drop)
test_features = test_features.drop(columns=columns_to_drop)

In [370]:
# Show the training frame as the cell's rich output (pandas truncates rows and columns in the repr).
df

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,...,d1_gold_lead,d1_xp_lead,d2_gold_lead,d2_xp_lead,d3_gold_lead,d3_xp_lead,d4_gold_lead,d4_xp_lead,d5_gold_lead,d5_xp_lead
0,871,22,0,4,2,110,2,3,11,3,...,827,2134,-2839,-2546,-926,1477,-2097,-2655,-3350,-4142
1,2549,22,0,17,0,114,16,2,12,24,...,-11699,-7029,2450,-3167,-9469,-8278,4935,-322,2797,-253
2,1841,22,0,8,1,100,2,11,12,2,...,2334,105,-4594,-6039,7324,3934,-1281,-5461,-2107,-1747
3,2211,22,7,11,3,32,14,3,11,21,...,2590,4190,-6364,-11050,-2387,-1702,-7443,-11605,1459,-818
4,458,22,7,1,0,68,3,0,0,15,...,-889,-1590,775,1210,-293,-540,-165,1008,-166,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29670,1664,3,0,8,0,17,1,6,1,8,...,-2489,-4469,8831,6826,-2505,-4606,7301,4098,3605,2744
29671,2898,22,7,26,108,89,4,5,17,5,...,6123,15,-4808,730,-8490,-6101,-672,451,-7952,-1388
29672,1246,23,0,6,10,51,9,7,15,16,...,1896,-481,1149,2052,-6269,-9874,-2114,-1512,3896,5710
29673,2620,22,7,15,10,114,9,8,5,10,...,-11736,-11832,5117,9723,7929,7343,12332,532,-7072,-7


In [371]:
# Hold out 20% of the training rows for validation.
# - random_state pins the shuffle, so a kernel restart reproduces the same split and
#   validation scores stay comparable between runs.
# - stratify keeps the radiant_win class ratio identical in both parts.
train_part, val_part = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['radiant_win'],
)

# Split each part into target and features; ID is removed together with the target
# so it is never used as a model input.
y_train = train_part['radiant_win']
X_train = train_part.drop(columns=['radiant_win', 'ID'])

y_val = val_part['radiant_win']
X_val = val_part.drop(columns=['radiant_win', 'ID'])

# Only ID is dropped from the test frame.
X_test = test_features.drop(columns='ID')

In [372]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import matplotlib.pyplot as plt
import pandas as pd

def plot_scaler_results(scaler_results_dict):
    """
    Plot ROC AUC vs number of features for each scaler, one panel per scaler.

    Args:
        scaler_results_dict: dictionary with scaler names as keys and, as values,
            a mapping of ``number of features -> ROC AUC``.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Notes:
        - Scalers whose result mapping is empty are skipped; if nothing remains,
          the function returns without creating a figure.
        - The subplot grid grows with the number of scalers (at most two columns),
          so any number of scalers can be plotted. Four scalers still produce the
          2x2, 15x10 inch layout.
        - All panels share the same y-axis limits so they can be compared directly.
    """
    # Only scalers that produced at least one score can be drawn.
    plottable = {name: scores for name, scores in scaler_results_dict.items() if scores}
    if not plottable:
        return

    # Shared y-range, computed once instead of once per panel.
    all_scores = [score for scores in plottable.values() for score in scores.values()]
    y_limits = [min(all_scores) - 0.001, max(all_scores) + 0.001]

    n_panels = len(plottable)
    n_cols = min(2, n_panels)
    n_rows = (n_panels + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False always yields a 2-D array of axes, even for a single panel.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(7.5 * n_cols, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, (scaler_name, results) in zip(axes, plottable.items()):
        # Extract data for plotting
        n_features = list(results.keys())
        roc_scores = list(results.values())

        # Create the plot
        ax.plot(n_features, roc_scores, 'b-o', linewidth=2, markersize=4)
        ax.set_title(f'{scaler_name}', fontsize=14, fontweight='bold')
        ax.set_xlabel('Number of features selected', fontsize=12)
        ax.set_ylabel('ROC AUC', fontsize=12)
        ax.grid(True, alpha=0.3)

        # Find and highlight best point (first occurrence of the maximum)
        best_score = max(roc_scores)
        best_idx = roc_scores.index(best_score)
        ax.plot(n_features[best_idx], best_score, 'ro', markersize=8,
                label=f'Best: {best_score:.4f}')
        ax.legend()

        # Set consistent y-axis limits for comparison
        ax.set_ylim(y_limits)

    # Hide grid cells that have no scaler assigned (odd number of panels).
    for unused_ax in axes[n_panels:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.suptitle('ROC AUC vs Number of Features by Scaler', fontsize=16, fontweight='bold', y=1.02)
    plt.show()



In [373]:
def fast_feature_selection_pipeline(X_train, y_train, X_val, y_val, X_test, feature_names):
    """
    Compare scalers and top-N feature subsets with a Random Forest, scored by validation ROC AUC.

    For each scaler (no scaling, StandardScaler, RobustScaler, MinMaxScaler):
      1. fit the scaler on the training data and transform train/val/test,
      2. rank features by the importances of a 300-tree Random Forest,
      3. train a 500-tree Random Forest on the top 10, 20, ... features and on the
         full feature set, recording the validation ROC AUC of each.
    The best (scaler, feature count) pair is then refit and used to predict the test set.

    Args:
        X_train, X_val, X_test: feature matrices (DataFrames or arrays) with identical columns.
        y_train, y_val: binary targets for the training and validation rows.
        feature_names: names of the feature columns, in column order.

    Returns:
        dict with keys 'best_scaler_name', 'best_scaler', 'feature_indices',
        'final_model', 'feature_names', 'val_roc_auc', 'best_n_features',
        'feature_test_results', 'all_scaler_results' and 'test_predictions'.

    Note:
        The validation set both selects the configuration and produces 'val_roc_auc',
        so that score is an optimistic estimate.
    """
    print("=== Fast Feature Selection Pipeline ===")

    # Convert DataFrames to numpy arrays for consistent indexing
    X_train_np = X_train.values if hasattr(X_train, 'values') else X_train
    X_val_np = X_val.values if hasattr(X_val, 'values') else X_val
    X_test_np = X_test.values if hasattr(X_test, 'values') else X_test

    # Ensure feature_names is a numpy array
    if hasattr(feature_names, 'values'):
        feature_names_array = feature_names.values
    elif hasattr(feature_names, '__iter__') and not isinstance(feature_names, str):
        feature_names_array = np.array(list(feature_names))
    else:
        feature_names_array = np.array(feature_names)

    # Test different scalers
    scalers = {
        'None': None,
        'StandardScaler': StandardScaler(),
        'RobustScaler': RobustScaler(),
        'MinMaxScaler': MinMaxScaler()
    }

    # Feature counts to evaluate: every multiple of 10 below the total, plus the full
    # set. Including the full set provides the "no selection" baseline and guarantees
    # at least one candidate even when there are 10 or fewer features.
    max_number_of_features = len(feature_names_array)
    feature_counts = list(range(10, max_number_of_features, 10)) + [max_number_of_features]

    best_scaler_name = None
    best_scaler = None
    best_overall_score = -np.inf  # any real ROC AUC beats this sentinel
    best_scaler_results = {}
    all_scaler_results = {}  # Store results for all scalers for plotting

    for scaler_name, scaler in scalers.items():
        print(f"\n=== Testing {scaler_name} ===")

        # Apply scaling (the scaler is fit on the training data only)
        if scaler is None:
            X_train_scaled = X_train_np.copy()
            X_val_scaled = X_val_np.copy()
            X_test_scaled = X_test_np.copy()
        else:
            X_train_scaled = scaler.fit_transform(X_train_np)
            X_val_scaled = scaler.transform(X_val_np)
            X_test_scaled = scaler.transform(X_test_np)

        # Step 2: Get feature importance
        print("Getting feature importance from Random Forest...")
        rf_importance = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
        rf_importance.fit(X_train_scaled, y_train)
        feature_importance = rf_importance.feature_importances_
        # Ascending importance order; computed once and sliced for every feature count.
        ranked_idx = np.argsort(feature_importance)

        # Step 3: Test different numbers of top features
        print("Testing different numbers of top features...")
        results = {}

        model = RandomForestClassifier(
            n_estimators=500,
            n_jobs=-1,
            random_state=42
        )

        for n_features in feature_counts:
            # Select top N features
            top_features_idx = ranked_idx[-n_features:]
            X_train_test = X_train_scaled[:, top_features_idx]
            X_val_test = X_val_scaled[:, top_features_idx]

            # Train model
            model.fit(X_train_test, y_train)

            # Evaluate using the probability predictions
            y_val_pred_proba = model.predict_proba(X_val_test)[:, 1]
            roc_auc = roc_auc_score(y_val, y_val_pred_proba)
            results[n_features] = roc_auc
            print(f'Number of features: {n_features}, ROC AUC: {roc_auc:.4f}')

        # Store results for plotting
        all_scaler_results[scaler_name] = results

        # Find best for this scaler
        best_n_features_scaler = max(results, key=results.get)
        best_score_scaler = results[best_n_features_scaler]
        print(f'Best for {scaler_name}: {best_score_scaler:.4f} with {best_n_features_scaler} features')

        # Track overall best (only the winner's scaled arrays are kept alive)
        if best_score_scaler > best_overall_score:
            best_overall_score = best_score_scaler
            best_scaler_name = scaler_name
            best_scaler = scaler
            best_scaler_results = {
                'feature_importance': feature_importance,
                'best_n_features': best_n_features_scaler,
                'feature_test_results': results,
                'X_train_scaled': X_train_scaled,
                'X_val_scaled': X_val_scaled,
                'X_test_scaled': X_test_scaled
            }

    # Plot results for all scalers
    plot_scaler_results(all_scaler_results)

    print(f"\n=== BEST OVERALL: {best_scaler_name} with score {best_overall_score:.4f} ===")

    # Step 4: Train final model with best scaler and best number of features
    top_features_idx = np.argsort(best_scaler_results['feature_importance'])[-best_scaler_results['best_n_features']:]
    X_train_final = best_scaler_results['X_train_scaled'][:, top_features_idx]
    X_val_final = best_scaler_results['X_val_scaled'][:, top_features_idx]
    X_test_final = best_scaler_results['X_test_scaled'][:, top_features_idx]
    final_feature_names = feature_names_array[top_features_idx]

    # A fresh estimator with the same settings as the search model, so the returned
    # model is not an object left over from the last loop iteration.
    final_model = RandomForestClassifier(
        n_estimators=500,
        n_jobs=-1,
        random_state=42
    )
    final_model.fit(X_train_final, y_train)

    # Final evaluation
    y_val_pred_proba = final_model.predict_proba(X_val_final)[:, 1]
    final_roc_auc = roc_auc_score(y_val, y_val_pred_proba)

    print(f'Final ROC AUC: {final_roc_auc:.4f}')

    return {
        'best_scaler_name': best_scaler_name,
        'best_scaler': best_scaler,
        'feature_indices': top_features_idx,
        'final_model': final_model,
        'feature_names': final_feature_names,
        'val_roc_auc': final_roc_auc,
        'best_n_features': best_scaler_results['best_n_features'],
        'feature_test_results': best_scaler_results['feature_test_results'],
        'all_scaler_results': all_scaler_results,
        'test_predictions': final_model.predict_proba(X_test_final)[:, 1]
    }

In [374]:
# results = fast_feature_selection_pipeline(
#     X_train,
#     y_train,
#     X_val,
#     y_val,
#     X_test,
#     X_train.columns,
# )



In [375]:
import time
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
def tune_rf_with_gridsearch(model, X_train, y_train, cv_folds=5, scoring='roc_auc'):
    """
    Tune Random Forest hyperparameters with a successive-halving grid search.

    Args:
        model: estimator to tune; it must accept the `n_estimators` and `max_depth`
            parameters searched here.
        X_train, y_train: training features and labels used for the cross-validated search.
        cv_folds: number of cross-validation folds, passed to the search as `cv`.
        scoring: scikit-learn scoring name used to rank the candidates.

    Returns:
        The fitted HalvingGridSearchCV object (see `best_params_`, `best_score_`,
        `best_estimator_`).
    """
    print("=== Grid Search Hyperparameter Tuning ===")
    param_grid = {
        'n_estimators': [300, 400, 500],
        'max_depth': [10, 15, 20],
    }
    # Grid search
    grid_search = HalvingGridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv_folds,  # previously ignored: the search always ran with its default folds
        scoring=scoring,
        n_jobs=-1,
        verbose=1,
    )
    # perf_counter is monotonic, so the elapsed time is unaffected by clock changes.
    start_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    end_time = time.perf_counter()

    print(f"Grid Search completed in {end_time - start_time:.2f} seconds")
    print(f"Best CV Score: {grid_search.best_score_:.4f}")
    print(f"Best Parameters: {grid_search.best_params_}")
    return grid_search


import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from typing import Dict, Tuple, Any, Optional, Union
import logging
from dataclasses import dataclass
from concurrent.futures import ProcessPoolExecutor
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Configure logging: timestamped INFO-level records on the root logger,
# plus a module-level logger for the code below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

@dataclass
class FeatureSelectionResult:
    """
    Data class to hold feature selection results.

    Instances are created by ``OptimizedFeatureSelector._train_final_model`` once the
    best scaler / feature-count / hyperparameter combination has been chosen and the
    final model has been refit.
    """
    # Key of the winning entry in the selector's scaler dictionary (e.g. 'StandardScaler' or 'None').
    best_scaler_name: str
    # The scaler object for that entry (already fit on the training data), or None for no scaling.
    best_scaler: Any
    # Column indices of the selected features within the full feature matrix.
    feature_indices: np.ndarray
    # Random Forest refit on the selected, scaled training columns.
    final_model: RandomForestClassifier
    # Feature names corresponding to `feature_indices`, in the same order.
    feature_names: np.ndarray
    # ROC AUC of `final_model` on the validation set.
    val_roc_auc: float
    # Number of features in the winning configuration.
    best_n_features: int
    # Hyperparameters chosen by the search for the winning configuration.
    best_params: Dict
    # Positive-class probabilities predicted by `final_model` for the test rows.
    test_predictions: np.ndarray

class OptimizedFeatureSelector:
    """
    Feature selection pipeline that searches scalers, top-N feature subsets and
    Random Forest hyperparameters, ranking every candidate by ROC AUC.

    Workflow (see ``fit``):
      1. scale the data with each configured scaler (fit on the training set only),
      2. rank features with a Random Forest's importances,
      3. for several top-N feature counts, tune a Random Forest with a
         successive-halving grid search and score it on the validation set,
      4. refit the best configuration and predict the test set.

    Args:
        cv_folds: number of cross-validation folds used by the hyperparameter search.
        n_jobs: parallel jobs for the forests and the search (-1 uses all cores).
        random_state: seed for the forests and for the halving search's subsampling.

    Note:
        The validation set both selects the configuration and produces the reported
        ``val_roc_auc``, so that score is an optimistic estimate.
    """

    def __init__(self, cv_folds: int = 5, n_jobs: int = -1, random_state: int = 42):
        self.cv_folds = cv_folds
        self.n_jobs = n_jobs
        self.random_state = random_state

        # Hyperparameter grid explored for every feature subset
        self.rf_param_grid = {
            'n_estimators': [200, 400],
            'max_depth': [10, 15, None],
            'min_samples_split': [5, 10],
            'min_samples_leaf': [2, 4],
            'class_weight': ['balanced', None]
        }

        # Scalers to test ('None' means the data is used unscaled)
        self.scalers = {
            'StandardScaler': StandardScaler(),
            'RobustScaler': RobustScaler(),
            'MinMaxScaler': MinMaxScaler(),
            'None': None
        }

    def _validate_inputs(self, X_train, y_train, X_val, y_val, X_test, feature_names):
        """
        Validate and convert inputs to consistent format.

        Returns the inputs as numpy arrays (targets flattened to 1-D) together with
        the feature names as an array.

        Raises:
            AssertionError: if the feature matrices disagree on the number of columns
                or the number of feature names differs from it.
        """
        # Convert to numpy arrays
        X_train = self._to_numpy(X_train)
        X_val = self._to_numpy(X_val)
        X_test = self._to_numpy(X_test)
        y_train = self._to_numpy(y_train).ravel()
        y_val = self._to_numpy(y_val).ravel()

        # Validate feature names
        feature_names = self._validate_feature_names(feature_names, X_train.shape[1])

        # Basic validation
        assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], "Feature dimension mismatch"
        assert len(feature_names) == X_train.shape[1], "Feature names length mismatch"

        return X_train, y_train, X_val, y_val, X_test, feature_names

    def _to_numpy(self, data):
        """Convert data to numpy array (uses `.values` when the object has it, e.g. pandas objects)."""
        if hasattr(data, 'values'):
            return data.values
        return np.array(data) if not isinstance(data, np.ndarray) else data

    def _validate_feature_names(self, feature_names, n_features):
        """
        Validate and convert feature names.

        Returns generated names ``feature_0 .. feature_{n-1}`` when `feature_names`
        is None, otherwise the given names as a numpy array.
        """
        if feature_names is None:
            return np.array([f'feature_{i}' for i in range(n_features)])

        if hasattr(feature_names, 'values'):
            return feature_names.values
        elif hasattr(feature_names, '__iter__') and not isinstance(feature_names, str):
            return np.array(list(feature_names))
        else:
            return np.array(feature_names)

    def _apply_scaling(self, scaler, X_train, X_val, X_test):
        """
        Apply scaling transformation.

        The scaler is fit on the training data only and then applied to the validation
        and test data. With `scaler=None`, copies of the inputs are returned.
        """
        if scaler is None:
            return X_train.copy(), X_val.copy(), X_test.copy()

        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)

        return X_train_scaled, X_val_scaled, X_test_scaled

    def _get_feature_importance(self, X_train, y_train):
        """Get feature importance using Random Forest; returns its `feature_importances_` array."""
        rf = RandomForestClassifier(
            n_estimators=100,  # Reduced for speed in importance calculation
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            class_weight='balanced'
        )
        rf.fit(X_train, y_train)
        return rf.feature_importances_

    def _optimize_hyperparameters(self, X_train, y_train):
        """
        Optimize hyperparameters with a successive-halving grid search.

        Returns the fitted HalvingGridSearchCV object.
        """
        rf = RandomForestClassifier(random_state=self.random_state, n_jobs=1)  # n_jobs=1: the search itself is parallelised
        grid_search = HalvingGridSearchCV(
            estimator=rf,
            param_grid=self.rf_param_grid,
            cv=self.cv_folds,  # honour the configured fold count (previously ignored)
            scoring='roc_auc',
            random_state=self.random_state,  # seeds the halving subsampling for reproducible searches
            n_jobs=self.n_jobs,
            return_train_score=False
        )
        grid_search.fit(X_train, y_train)
        return grid_search

    def _test_feature_counts(self, X_train, X_val, y_train, y_val, feature_importance, max_features):
        """
        Test different numbers of top-ranked features.

        Feature counts run from half of `max_features` up to `max_features` in steps
        of 5, and the full feature set is always included. For each count the
        hyperparameters are tuned on the training data and the tuned model is scored
        on the validation data.

        Returns:
            dict mapping feature count -> dict with 'roc_auc', 'best_params',
            'cv_score' and 'feature_indices'. Counts whose evaluation raised an
            exception are logged and left out.
        """
        results = {}

        # Ascending importance order; computed once and sliced for every count.
        ranked_idx = np.argsort(feature_importance)

        # Start at half the features, but never below 1: a count of 0 would make the
        # `[-0:]` slice below select every feature instead of none.
        min_features = max(1, round(0.5 * max_features))
        feature_steps = 5
        feature_range = list(range(min_features, max_features + 1, feature_steps))

        # Ensure we test the full feature set
        if max_features >= 1 and max_features not in feature_range:
            feature_range.append(max_features)

        for n_features in feature_range:
            logger.info(f"Testing {n_features} features...")

            try:
                # Select top N features
                top_features_idx = ranked_idx[-n_features:]
                X_train_subset = X_train[:, top_features_idx]
                X_val_subset = X_val[:, top_features_idx]

                # Optimize hyperparameters
                grid_search = self._optimize_hyperparameters(X_train_subset, y_train)

                # Evaluate on validation set
                best_model = grid_search.best_estimator_
                y_val_pred_proba = best_model.predict_proba(X_val_subset)[:, 1]
                roc_auc = roc_auc_score(y_val, y_val_pred_proba)

                results[n_features] = {
                    'roc_auc': roc_auc,
                    'best_params': grid_search.best_params_,
                    'cv_score': grid_search.best_score_,
                    'feature_indices': top_features_idx
                }

                logger.info(f"Features: {n_features}, Val ROC AUC: {roc_auc:.4f}, CV Score: {grid_search.best_score_:.4f}")

            except Exception as e:
                # Best effort: one failing feature count must not abort the whole search.
                logger.warning(f"Error testing {n_features} features: {str(e)}")
                continue

        return results

    def _evaluate_scaler(self, scaler_name, scaler, X_train, X_val, X_test, y_train, y_val, feature_names):
        """
        Evaluate a single scaler configuration.

        Returns:
            dict describing the best feature count for this scaler (including the
            scaled train/val/test arrays), or None if no feature count could be evaluated.
        """
        logger.info(f"Testing {scaler_name}...")

        # Apply scaling
        X_train_scaled, X_val_scaled, X_test_scaled = self._apply_scaling(
            scaler, X_train, X_val, X_test
        )

        # Get feature importance
        feature_importance = self._get_feature_importance(X_train_scaled, y_train)

        # Test different feature counts
        results = self._test_feature_counts(
            X_train_scaled, X_val_scaled, y_train, y_val,
            feature_importance, len(feature_names)
        )

        if not results:
            return None

        # Find best configuration for this scaler
        best_n_features = max(results, key=lambda x: results[x]['roc_auc'])
        best_result = results[best_n_features]

        logger.info(f"Best for {scaler_name}: {best_result['roc_auc']:.4f} with {best_n_features} features")

        return {
            'scaler_name': scaler_name,
            'scaler': scaler,
            'best_n_features': best_n_features,
            'best_result': best_result,
            'feature_importance': feature_importance,
            'scaled_data': (X_train_scaled, X_val_scaled, X_test_scaled)
        }

    def fit(self, X_train, y_train, X_val, y_val, X_test, feature_names=None) -> FeatureSelectionResult:
        """
        Main method to perform optimized feature selection

        Args:
            X_train, y_train: Training data
            X_val, y_val: Validation data
            X_test: Test data
            feature_names: Feature names (optional)

        Returns:
            FeatureSelectionResult with optimized model and features

        Raises:
            ValueError: if no scaler produced a usable configuration.
        """
        logger.info("Starting optimized feature selection pipeline...")

        # Validate inputs
        X_train, y_train, X_val, y_val, X_test, feature_names = self._validate_inputs(
            X_train, y_train, X_val, y_val, X_test, feature_names
        )

        # -inf rather than 0, so the first successful configuration is always accepted.
        best_overall_score = float('-inf')
        best_config = None

        # Test each scaler
        for scaler_name, scaler in self.scalers.items():
            config = self._evaluate_scaler(
                scaler_name, scaler, X_train, X_val, X_test,
                y_train, y_val, feature_names
            )

            if config is not None and config['best_result']['roc_auc'] > best_overall_score:
                best_overall_score = config['best_result']['roc_auc']
                best_config = config

        if best_config is None:
            raise ValueError("No successful configuration found!")

        logger.info(f"Best overall: {best_config['scaler_name']} with ROC AUC {best_overall_score:.4f}")

        # Train final model
        return self._train_final_model(best_config, y_train, y_val, feature_names)

    def _train_final_model(self, best_config, y_train, y_val, feature_names) -> FeatureSelectionResult:
        """
        Train the final model with best configuration.

        Refits a Random Forest with the winning hyperparameters on the winning scaled
        feature subset, scores it on the validation data and predicts the test data.
        """
        # Get scaled data and feature indices
        X_train_scaled, X_val_scaled, X_test_scaled = best_config['scaled_data']
        feature_indices = best_config['best_result']['feature_indices']

        # Select final features
        X_train_final = X_train_scaled[:, feature_indices]
        X_val_final = X_val_scaled[:, feature_indices]
        X_test_final = X_test_scaled[:, feature_indices]
        final_feature_names = feature_names[feature_indices]

        # Train final model
        final_model = RandomForestClassifier(
            **best_config['best_result']['best_params'],
            random_state=self.random_state,
            n_jobs=self.n_jobs
        )
        final_model.fit(X_train_final, y_train)

        # Final evaluation
        y_val_pred_proba = final_model.predict_proba(X_val_final)[:, 1]
        final_roc_auc = roc_auc_score(y_val, y_val_pred_proba)

        # Generate test predictions (probability of the positive class)
        test_predictions = final_model.predict_proba(X_test_final)[:, 1]

        logger.info(f"Final model ROC AUC: {final_roc_auc:.4f}")
        logger.info(f"Selected {len(feature_indices)} features: {final_feature_names[:5]}...")

        return FeatureSelectionResult(
            best_scaler_name=best_config['scaler_name'],
            best_scaler=best_config['scaler'],
            feature_indices=feature_indices,
            final_model=final_model,
            feature_names=final_feature_names,
            val_roc_auc=final_roc_auc,
            best_n_features=best_config['best_n_features'],
            best_params=best_config['best_result']['best_params'],
            test_predictions=test_predictions
        )

# Usage function
def optimized_feature_selection(X_train, y_train, X_val, y_val, X_test, feature_names=None,
                                cv_folds=5, n_jobs=-1, random_state=42) -> FeatureSelectionResult:
    """
    Convenience wrapper: build an OptimizedFeatureSelector and run its `fit`.

    Args:
        X_train, y_train: Training data and labels
        X_val, y_val: Validation data and labels
        X_test: Test data
        feature_names: Optional feature names
        cv_folds: Forwarded to the OptimizedFeatureSelector constructor
        n_jobs: Forwarded to the constructor (number of parallel jobs, -1 for all cores)
        random_state: Forwarded to the constructor (random seed)

    Returns:
        The FeatureSelectionResult produced by `OptimizedFeatureSelector.fit`
    """
    selector_options = {
        'cv_folds': cv_folds,
        'n_jobs': n_jobs,
        'random_state': random_state,
    }
    return OptimizedFeatureSelector(**selector_options).fit(
        X_train, y_train, X_val, y_val, X_test, feature_names
    )


In [376]:
# Run the search on the prepared splits; the training frame's column labels
# are passed as the feature names.
results: FeatureSelectionResult = optimized_feature_selection(
    X_train, y_train, X_val, y_val, X_test,
    feature_names=X_train.columns,
)

2025-06-04 20:29:38,082 - INFO - Starting optimized feature selection pipeline...
2025-06-04 20:29:38,121 - INFO - Testing StandardScaler...
2025-06-04 20:29:40,164 - INFO - Testing 110 features...
2025-06-04 20:32:14,146 - INFO - Features: 110, Val ROC AUC: 0.8247, CV Score: 0.8160
2025-06-04 20:32:14,146 - INFO - Testing 115 features...
2025-06-04 20:34:29,192 - INFO - Features: 115, Val ROC AUC: 0.8237, CV Score: 0.8153
2025-06-04 20:34:29,193 - INFO - Testing 120 features...
2025-06-04 20:36:58,010 - INFO - Features: 120, Val ROC AUC: 0.8235, CV Score: 0.8160
2025-06-04 20:36:58,011 - INFO - Testing 125 features...
2025-06-04 20:39:04,981 - INFO - Features: 125, Val ROC AUC: 0.8229, CV Score: 0.8161
2025-06-04 20:39:04,982 - INFO - Testing 130 features...
2025-06-04 20:41:19,054 - INFO - Features: 130, Val ROC AUC: 0.8236, CV Score: 0.8154
2025-06-04 20:41:19,055 - INFO - Testing 135 features...
2025-06-04 20:43:28,902 - INFO - Features: 135, Val ROC AUC: 0.8239, CV Score: 0.8156
2

KeyboardInterrupt: 

In [None]:
# Take the test-set predictions stored on the result object and display them.
# NOTE(review): `results` only exists if the search cell above ran to completion.
y_test_pred = results.test_predictions
y_test_pred

# """Data class to hold feature selection results"""
# best_scaler_name: str
# best_scaler: Any
# feature_indices: np.ndarray
# final_model: RandomForestClassifier
# feature_names: np.ndarray
# val_roc_auc: float
# best_n_features: int
# best_params: Dict
# test_predictions: np.ndarray


In [None]:
import numpy as np
# Build the submission column by column so each column keeps its own dtype.
# Stacking ID and the float probabilities into one NumPy array forces a common
# dtype: integer IDs would be written as floats such as 12.0, and string IDs would
# turn the probabilities into strings.
submission_df = pd.DataFrame({
    'ID': test_features['ID'].values,
    'radiant_win': y_test_pred,
})
# index=False keeps the row index out of the file.
submission_df.to_csv('simple_baseline.csv', index=False)