In [34]:
import warnings
from typing import Any

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV  # noqa
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

In [35]:
train_data = pd.read_csv('train_data.CSV')
test_data = pd.read_csv('test_data.CSV')

In [36]:
player_numbers = ['1', '2', '3', '4', '5']

In [37]:
def get_hero_win_rates(df):
    # Create empty list to store all hero-game combinations
    hero_games = []

    # Process Radiant players (r1-r5)
    for i in player_numbers:
        radiant_col = f'r{i}_hero_id'
        if radiant_col in df.columns:
            # For radiant players, they win when radiant_win == 1
            radiant_data = df[[radiant_col, 'radiant_win']].copy()
            radiant_data.columns = ['hero_id', 'won']
            radiant_data = radiant_data.dropna(subset=['hero_id'])  # Add this line
            hero_games.append(radiant_data)

    # Process Dire players (d1-d5)
    for i in player_numbers:
        dire_col = f'd{i}_hero_id'
        if dire_col in df.columns:
            # For dire players, they win when radiant_win == 0
            dire_data = df[[dire_col, 'radiant_win']].copy()
            dire_data.columns = ['hero_id', 'won']
            dire_data['won'] = 1 - dire_data['won']  # Flip for dire side
            dire_data = dire_data.dropna(subset=['hero_id'])  # Add this line
            hero_games.append(dire_data)

    # Combine all hero games
    all_hero_games = pd.concat(hero_games, ignore_index=True)

    # Calculate win rates
    win_rates = all_hero_games.groupby('hero_id').agg({
        'won': ['sum', 'count']
    }).round(4)

    win_rates.columns = ['wins', 'total_games']
    win_rates['win_rate'] = (win_rates['wins'] / win_rates['total_games']).round(4)

    return win_rates.sort_values('win_rate', ascending=False)



In [38]:

for df in [train_data, test_data]:
    df.drop(columns=['chat_len'], inplace=True)
    df['radiant_carry_%_team_gold'] = df['r1_gold'] / (df['r1_gold'] + df['r2_gold'] + df['r3_gold'] + df['r4_gold'] + df['r5_gold'])
    df['radiant_carry_%_total_gold'] = df['r1_gold'] / (
            df['r1_gold'] + df['r2_gold'] + df['r3_gold'] + df['r4_gold'] + df['r5_gold'] + df['d1_gold'] + df['d2_gold'] + df[
        'd3_gold'] + df['d4_gold'] + df['d5_gold'])

    # Team Stats
    for stat in ['gold', 'xp']:
        dire_team_stat = df[[f'd{number}_{stat}' for number in player_numbers]].sum(axis=1)
        df[f'radiant_team_{stat}'] = df[[f'r{number}_{stat}' for number in player_numbers]].sum(axis=1)
        df[f'radiant_team_{stat}_lead'] = dire_team_stat - df[f'radiant_team_{stat}']
        df[f'radiant_carries_{stat}_lead'] = df[f'r1_{stat}'] - df[f'd1_{stat}'] + df[f'r2_{stat}'] - df[f'd2_{stat}']



In [39]:
# Calculate win rates from training data only
hero_win_rates = get_hero_win_rates(train_data)
hero_winrate_dict = hero_win_rates['win_rate'].to_dict()
overall_avg_winrate = hero_win_rates['win_rate'].mean()

# Rest of your code with one key change - handle missing heroes:
radiant_players = ['r1', 'r2', 'r3', 'r4', 'r5']
dire_players = ['d1', 'd2', 'd3', 'd4', 'd5']
radiant_players_and_dire_players = zip(radiant_players, dire_players)
player_stats = ['kills', 'deaths', 'assists', 'denies', 'gold', 'lh', 'xp', 'health', 'max_health', 'max_mana', 'level', 'stuns',
                'creeps_stacked', 'camps_stacked', 'rune_pickups', 'firstblood_claimed', 'teamfight_participation', 'towers_killed',
                'roshans_killed', 'obs_placed', 'sen_placed']


def process_hero_stats(df):
    for radiant, dire in zip(radiant_players, dire_players):
        # Get rid of the coordinates x,y
        df = df.drop([f'{radiant}_x', f'{radiant}_y', f'{dire}_x', f'{dire}_y'], axis=1)
        # Map win rates with fallback to average for unknown heroes
        df[f'{radiant}_hero_winrate'] = df[f'{radiant}_hero_id'].map(hero_winrate_dict).fillna(overall_avg_winrate)
        df[f'{dire}_hero_winrate'] = df[f'{dire}_hero_id'].map(hero_winrate_dict).fillna(overall_avg_winrate)
        df = df.drop([f'{radiant}_hero_id', f'{dire}_hero_id'], axis=1)
        for stat in player_stats:
            # Get the radiant lead for the stat
            df[f'{radiant}_{stat}_lead'] = df[f'{radiant}_{stat}'] - df[f'{dire}_{stat}']
            # Drop the individual player stats
            df = df.drop([f'{radiant}_{stat}', f'{dire}_{stat}'], axis=1)

    return df


In [40]:
train_data = process_hero_stats(train_data)
test_data = process_hero_stats(test_data)

In [41]:
test_data

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,ID,radiant_carry_%_team_gold,radiant_carry_%_total_gold,radiant_team_gold,radiant_team_gold_lead,radiant_carries_gold_lead,...,r5_stuns_lead,r5_creeps_stacked_lead,r5_camps_stacked_lead,r5_rune_pickups_lead,r5_firstblood_claimed_lead,r5_teamfight_participation_lead,r5_towers_killed_lead,r5_roshans_killed_lead,r5_obs_placed_lead,r5_sen_placed_lead
0,155,22,7,1,a400b8f29dece5f4d266f49f1ae2e98a,0.266568,0.090727,2037,1911,-828,...,0.000000,0,0,0,0,0.000000,0,0,0,0
1,1362,4,0,6,34c81a8faede0d8f1f87dcc6ee824658,0.221980,0.105656,38098,3847,98,...,-10.064224,-9,-3,-12,0,0.420635,0,0,0,0
2,2388,4,0,16,5feece770ca79e5e8cd8052198b3f533,0.208383,0.108253,87320,-6553,-5909,...,0.110524,2,1,12,0,-0.149689,1,0,0,0
3,2043,22,0,15,8f56cc2468ba5c37edb79f3a7b4af6e6,0.228447,0.094634,53229,22037,-9972,...,-39.865319,0,0,7,0,-0.445946,-2,0,4,0
4,840,22,7,2,44cdded6d3311134563f743eb77685b2,0.187484,0.090807,19015,1229,-431,...,0.000000,0,0,-1,0,-0.017857,-1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2525,22,0,15,308faee28efee2e66b39f9f2ba6ea9cf,0.161932,0.087170,85925,-12231,-6012,...,0.000000,-6,-2,13,0,0.107212,0,0,0,11
9996,1002,4,0,4,6066cc7417b43c749d551e123d00f0c8,0.212522,0.096385,21706,4448,-1330,...,12.596962,0,0,-10,-1,0.158730,0,0,0,0
9997,643,22,7,1,e2ca68ac1a6847f4a37f6c9c8ee8695b,0.153894,0.060538,11352,6154,-2278,...,2.532715,0,0,-3,-1,-0.500000,0,0,0,1
9998,2405,22,7,12,47ad6454ede66c1c78fdaa9391dfc556,0.161861,0.084374,88811,-7250,-4096,...,17.480328,-3,-1,6,0,0.041667,3,1,0,0


In [42]:
train_data

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,radiant_win,ID,radiant_carry_%_team_gold,radiant_carry_%_total_gold,radiant_team_gold,radiant_team_gold_lead,...,r5_stuns_lead,r5_creeps_stacked_lead,r5_camps_stacked_lead,r5_rune_pickups_lead,r5_firstblood_claimed_lead,r5_teamfight_participation_lead,r5_towers_killed_lead,r5_roshans_killed_lead,r5_obs_placed_lead,r5_sen_placed_lead
0,871,22,0,4,True,a363534a6344f1b0be1d7ba2c4047d9a,0.124252,0.073763,26575,-8385,...,-17.662320,0,0,-7,-1,-0.602273,0,0,-6,-2
1,2549,22,0,17,True,a0ba4ef0965f56d2eba69c2b9ef33353,0.317245,0.168983,89590,-10986,...,-33.003062,0,0,6,0,-0.123950,-1,0,12,5
2,1841,22,0,8,True,18873e56c2142af326b4e08ca41df63a,0.157368,0.077559,57769,1676,...,-3.932761,3,1,-2,0,-0.318182,0,0,0,0
3,2211,22,7,11,True,c143931a6a8b3fb55a8ef6b9f30c6933,0.253589,0.138347,72720,-12145,...,11.344666,2,1,2,0,-0.014493,1,0,0,2
4,458,22,7,1,True,5a324d8b37522e9f9684493465720023,0.296747,0.154515,9284,-738,...,0.000000,0,0,-2,0,0.166667,0,0,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29670,1664,3,0,8,False,db7050396622d08c6e8a5d6aeedf5f44,0.235066,0.101131,45451,14743,...,30.359493,10,3,-1,0,0.053030,-1,0,1,0
29671,2898,22,7,26,True,b1259dbd78c0e36cba45c2f567c9d3de,0.207528,0.111176,118485,-15799,...,-34.718493,0,0,4,0,0.157956,4,1,0,0
29672,1246,23,0,6,True,9bae67d14d950856a9693b7efd9bce3f,0.216844,0.109415,79458,-1442,...,-3.298476,0,0,1,0,0.048148,1,0,0,0
29673,2620,22,7,15,False,de2e339479cf8e5697aff693d190e411,0.284716,0.137672,96496,6570,...,33.677744,2,1,-4,0,-0.125315,-1,0,1,1


In [43]:
X_train, X_val = train_test_split(train_data, test_size=0.2)

y_train = X_train['radiant_win']
X_train = X_train.drop(['radiant_win', 'ID'], axis=1)

y_val = X_val['radiant_win']
X_val = X_val.drop(['radiant_win', 'ID'], axis=1)

X_test = test_data.drop('ID', axis=1)

In [44]:
def to_numpy(data) -> np.ndarray:
    """Convert data to numpy array"""
    if hasattr(data, 'values'):
        return data.values
    return np.array(data) if not isinstance(data, np.ndarray) else data


def validate_feature_names(feature_names, n_features: int) -> np.ndarray:
    """Validate and convert feature names"""
    if feature_names is None:
        return np.array([f'feature_{i}' for i in range(n_features)])

    if hasattr(feature_names, 'values'):
        return feature_names.values
    elif hasattr(feature_names, '__iter__') and not isinstance(feature_names, str):
        return np.array(list(feature_names))
    else:
        return np.array(feature_names)


def validate_inputs(X_train, y_train, X_val, y_val, X_test) -> tuple[np.ndarray, ...]:
    """Validate and convert inputs to consistent format"""
    # Validate feature names
    feature_names = validate_feature_names(X_train.columns, X_train.shape[1])

    # Convert to numpy arrays
    X_train = to_numpy(X_train)
    X_val = to_numpy(X_val)
    X_test = to_numpy(X_test)
    y_train = to_numpy(y_train).ravel()
    y_val = to_numpy(y_val).ravel()

    # Basic validation
    assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], "Feature dimension mismatch"
    assert len(feature_names) == X_train.shape[1], "Feature names length mismatch"

    return X_train, y_train, X_val, y_val, X_test, feature_names

In [45]:
@dataclass
class FeatureSelectionResult:
    best_scaler_name: str
    best_scaler: Any
    final_model: RandomForestClassifier | LogisticRegression
    feature_names: np.ndarray
    val_roc_auc: float
    best_model_cv_score: float
    best_n_features: int
    best_params: dict
    test_predictions: np.ndarray


@dataclass
class Config:
    scalers: dict[str, Any]
    models_and_param_grid: list[tuple[Any]]
    cv_folds: int = 3
    n_jobs: int = -1
    random_state: int = 42


@dataclass
class ModelScalerEvaluationResult:
    estimator_name: str
    scaler_name: str
    scaler: Any
    best_n_features: int
    best_result: dict
    scaled_data: tuple[np.ndarray, np.ndarray, np.ndarray]


In [46]:
def create_default_config(cv_folds: int = 3, n_jobs: int = -1, random_state: int = 42) -> Config:
    """Create default configuration for feature selection"""
    scalers = {
        'RobustScaler': RobustScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'StandardScaler': StandardScaler(),
        'None': None
    }

    rf_param_grid = {
        'max_depth': [10, 15, 20],
        'max_features': ['sqrt', 0.5],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [2, 5],
        'class_weight': ['balanced', 'balanced_subsample'],
    }

    # Logistic Regression Param Grid
    lr_param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['liblinear', 'saga', 'lbfgs'],
        'class_weight': ['balanced', None, {0: 1, 1: 2}, {0: 1, 1: 3}],
        'max_iter': [2000, 5000],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]  # Only used with elasticnet
    }

    # Test each scaler
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=400)
    lr = LogisticRegression(random_state=42, n_jobs=-1, max_iter=1000)

    models_and_param_grid = [
        # (rf, rf_param_grid),
        (lr, lr_param_grid)
    ]
    return Config(
        models_and_param_grid=models_and_param_grid,
        scalers=scalers,
        cv_folds=cv_folds,
        n_jobs=n_jobs,
        random_state=random_state
    )

In [47]:
from sklearn.linear_model import LogisticRegression


def apply_scaling(scaler, X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray) -> tuple[
    np.ndarray, np.ndarray, np.ndarray]:
    """Apply scaling transformation"""
    if scaler is None:
        return X_train.copy(), X_val.copy(), X_test.copy()

    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_val_scaled, X_test_scaled


def get_feature_importance(X_train: np.ndarray, y_train: np.ndarray, random_state: int = 42, n_jobs: int = -1) -> np.ndarray:
    """Get feature importance using Random Forest"""
    rf = RandomForestClassifier(
        n_estimators=1000,
        n_jobs=n_jobs,
        random_state=random_state,
        class_weight='balanced'
    )
    rf.fit(X_train, y_train)
    return rf.feature_importances_


def optimize_hyperparameters(X_train: np.ndarray, y_train: np.ndarray, estimator, param_grid: dict) -> Any:
    """Optimize hyperparameters using GridSearchCV"""
    random_search = HalvingGridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42
    )
    random_search.fit(X_train, y_train)
    return random_search


def test_different_numbers_of_features(X_train: np.ndarray, X_val: np.ndarray, y_train: np.ndarray, y_val: np.ndarray, estimator,
                                       param_grid: dict, max_features: int = 150) -> dict:
    """Test different numbers of features efficiently"""
    # # Get feature importance
    result_dict: dict = {}

    # Smart feature range selection
    min_features = 50
    feature_steps = 5
    feature_range = range(min_features, max_features + 1, feature_steps)
    for n_features in feature_range:
        print(f"Testing {n_features} features...")

        # Select top N features
        top_features_idx = np.argsort(get_feature_importance(X_train, y_train))[-n_features:]
        X_train_subset_of_features = X_train[:, top_features_idx]
        X_val_subset_of_features = X_val[:, top_features_idx]

        # Optimize hyperparameters
        grid_search = optimize_hyperparameters(X_train_subset_of_features, y_train, estimator, param_grid)

        # Evaluate on validation set
        y_val_pred_proba = grid_search.best_estimator_.predict_proba(X_val_subset_of_features)[:, 1]
        roc_auc = roc_auc_score(y_val, y_val_pred_proba)

        result_dict[n_features] = {
            'roc_auc': roc_auc,
            'best_params': grid_search.best_params_,
            'best_model': grid_search.best_estimator_,
            'cv_score': grid_search.best_score_,
            'feature_indices': top_features_idx
        }
        print(f"Features: {n_features}, Val ROC AUC: {roc_auc:.4f}, CV Score: {grid_search.best_score_:.4f}")

    return result_dict


def evaluate_model(estimator, scaler_name: str, scaler: Any, X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray,
                   y_train: np.ndarray, y_val: np.ndarray,
                   param_grid: dict, max_features: int = 150) -> ModelScalerEvaluationResult:
    """Evaluate a single scaler configuration"""
    estimator_name = estimator.__class__.__name__
    print(f"Testing {scaler_name} for {estimator_name}...")

    # Apply scaling
    X_train_scaled, X_val_scaled, X_test_scaled = apply_scaling(scaler, X_train, X_val, X_test)

    # Test different feature counts
    results: dict = test_different_numbers_of_features(
        X_train_scaled, X_val_scaled, y_train, y_val,
        estimator, param_grid, max_features
    )
    # Find best configuration for this scaler
    best_n_features = max(results, key=lambda x: results[x]['roc_auc'])
    best_result = results[best_n_features]

    print(f"Best for {scaler_name} with {estimator_name}: {best_result['roc_auc']:.4f} with {best_n_features} features")
    data = {
        'estimator_name': estimator_name,
        'scaler_name': scaler_name,
        'scaler': scaler,
        'best_n_features': best_n_features,
        'best_result': best_result,
        'scaled_data': (X_train_scaled, X_val_scaled, X_test_scaled)
    }
    return ModelScalerEvaluationResult(**data)


all_results = []


def find_best_configuration(X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray,
                            y_train: np.ndarray, y_val: np.ndarray, config: Config) -> ModelScalerEvaluationResult:
    """Find the best scaler and feature configuration"""
    best_overall_score = 0
    best_config: ModelScalerEvaluationResult | None = None
    for scaler_name, scaler in config.scalers.items():
        for estimator, param_grid in config.models_and_param_grid:
            result: ModelScalerEvaluationResult = evaluate_model(
                estimator, scaler_name, scaler, X_train, X_val, X_test, y_train, y_val, param_grid
            )
            all_results.append(result)

            best_result = result.best_result
            best_cv_score = best_result['cv_score']
            if best_result and best_cv_score > best_overall_score:
                best_overall_score = best_cv_score
                best_config = result

    if best_config is None:
        raise ValueError("No successful configuration found!")

    print(f"Best overall: {best_config.scaler_name} with CV mean ROC AUC: {best_overall_score:.4f}")
    return best_config


def select_final_features(X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray, feature_indices: np.ndarray) -> tuple[
    np.ndarray, np.ndarray, np.ndarray]:
    """Select final features based on indices"""
    X_train_final = X_train[:, feature_indices]
    X_val_final = X_val[:, feature_indices]
    X_test_final = X_test[:, feature_indices]
    return X_train_final, X_val_final, X_test_final


def train_final_model(best_config: ModelScalerEvaluationResult, y_train: np.ndarray, y_val: np.ndarray,
                      feature_names: np.ndarray) -> FeatureSelectionResult:
    best_result: dict = best_config.best_result

    # Get scaled data and feature indices
    X_train_scaled, X_val_scaled, X_test_scaled = best_config.scaled_data
    feature_indices = best_result['feature_indices']
    # Select final features
    X_train_final, X_val_final, X_test_final = select_final_features(X_train_scaled, X_val_scaled, X_test_scaled, feature_indices)

    final_model = best_result['best_model']
    final_model.fit(X_train_final, y_train)

    # Final evaluation
    y_val_pred_proba = final_model.predict_proba(X_val_final)[:, 1]
    val_roc_auc = roc_auc_score(y_val, y_val_pred_proba)
    test_predictions = final_model.predict_proba(X_test_final)[:, 1]

    return FeatureSelectionResult(
        final_model=final_model,
        val_roc_auc=val_roc_auc,
        best_model_cv_score=best_result['cv_score'],
        best_scaler=best_config.scaler,
        best_scaler_name=best_config.scaler_name,
        best_n_features=best_config.best_n_features,
        feature_names=feature_names[feature_indices],
        best_params=best_result['best_params'],
        test_predictions=test_predictions
    )


def optimized_feature_selection(X_train, y_train, X_val, y_val, X_test, config) -> FeatureSelectionResult:
    # Validate inputs
    X_train, y_train, X_val, y_val, X_test, feature_names = validate_inputs(
        X_train, y_train, X_val, y_val, X_test
    )
    # Find best configuration
    best_config: ModelScalerEvaluationResult = find_best_configuration(
        X_train, X_val, X_test, y_train, y_val, config
    )
    # Train final model
    result: FeatureSelectionResult = train_final_model(best_config, y_train, y_val, feature_names)
    return result

In [None]:
config: Config = create_default_config()

results = optimized_feature_selection(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    config=config
)

Testing RobustScaler for LogisticRegression...
Feature importance calculated.
[0.00880981 0.00219125 0.00103709 0.00416636 0.00834001 0.00952225
 0.00875668 0.09500384 0.02719632 0.00834181 0.06856115 0.01836442
 0.00965015 0.00969122 0.0047623  0.00859799 0.00653451 0.00638893
 0.01253786 0.00677956 0.00943526 0.00860816 0.00770811 0.00844359
 0.0045922  0.00689459 0.00272804 0.00206796 0.00582748 0.00129197
 0.00705693 0.00205597 0.00036951 0.00374552 0.00282665 0.00951991
 0.01037742 0.00462823 0.00873151 0.0079931  0.00658692 0.01222882
 0.00685266 0.00936874 0.00895539 0.00784958 0.00849491 0.00462264
 0.00694899 0.00293657 0.00214745 0.00609244 0.00128774 0.00708994
 0.00220077 0.00046495 0.00368005 0.00291016 0.00995359 0.00978705
 0.00574854 0.00998021 0.00643594 0.00680492 0.01544705 0.007388
 0.01200578 0.00958996 0.00809856 0.00874557 0.00603926 0.00716243
 0.00293972 0.00217403 0.00598528 0.00129054 0.00740976 0.00263035
 0.00054427 0.00384338 0.00294289 0.01001176 0.009863

In [None]:
print(results)

In [None]:
print('Best_Scaler:', results.best_scaler_name)
print('Best_Params:', results.best_params)
print('Best_N_Features:', results.best_n_features)
print('Val ROC AUC:', results.val_roc_auc)
print('CV Score:', results.best_model_cv_score)


In [None]:
submission_array = np.vstack([test_data['ID'].values, results.test_predictions]).T
submission_df = pd.DataFrame(submission_array, columns=['ID', 'radiant_win']).reset_index(drop=True)
submission_df.to_csv('simple_baseline.csv', index=None)