In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

class MicroOptimizer:
    """ÎØ∏ÏÑ∏ Ï°∞Ï†ïÏùÑ ÏúÑÌïú ÏµúÏ†ÅÌôî ÌÅ¥ÎûòÏä§"""
    
    def __init__(self):
        self.models = {}
        self.submissions = {}
        
    def load_and_preprocess(self, random_seed=42):
        """Îã§ÏñëÌïú ÏãúÎìúÎ°ú Ï†ÑÏ≤òÎ¶¨"""
        np.random.seed(random_seed)
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        # Ïπ¥ÌÖåÍ≥†Î¶¨Ïª¨ Ïù∏ÏΩîÎî©
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        # Í≤∞Ï∏°Í∞í Ï≤òÎ¶¨
        numeric_cols = X_train.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
        
        return X_train, y_train, X_test, test_df['ID']
    
    def strategy_1_different_seeds(self, n_seeds=5):
        """Ï†ÑÎûµ 1: Îã§ÏñëÌïú ÏãúÎìúÎ°ú Ïó¨Îü¨ Î™®Îç∏ ÏÉùÏÑ±"""
        print("üé≤ Ï†ÑÎûµ 1: Îã§ÏñëÌïú ÏãúÎìú ÌÖåÏä§Ìä∏")
        
        best_score = 0
        best_seed = 42
        best_predictions = None
        
        for seed in [42, 123, 456, 789, 999][:n_seeds]:
            print(f"  ÏãúÎìú {seed} ÌÖåÏä§Ìä∏ Ï§ë...")
            
            X_train, y_train, X_test, test_ids = self.load_and_preprocess(seed)
            
            # XGBoost with different seed
            pos_count = (y_train == 1).sum()
            neg_count = (y_train == 0).sum()
            scale_pos_weight = neg_count / pos_count
            
            model = xgb.XGBClassifier(
                n_estimators=150,  # Ï°∞Í∏à Îçî ÎßéÏù¥
                max_depth=6,
                learning_rate=0.08,  # Ï°∞Í∏à Îçî Î≥¥ÏàòÏ†Å
                random_state=seed,
                scale_pos_weight=scale_pos_weight,
                subsample=0.8,
                colsample_bytree=0.8,
                eval_metric='logloss'
            )
            
            # CV ÌèâÍ∞Ä
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
            cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
            cv_mean = cv_scores.mean()
            
            print(f"    CV F1: {cv_mean:.6f}")
            
            if cv_mean > best_score:
                best_score = cv_mean
                best_seed = seed
                model.fit(X_train, y_train)
                best_predictions = model.predict(X_test)
        
        print(f"  ‚úÖ ÏµúÍ≥† ÏãúÎìú: {best_seed} (CV: {best_score:.6f})")
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': best_predictions})
        self.submissions['strategy_1'] = submission
        return submission
    
    def strategy_2_ensemble_voting(self):
        """Ï†ÑÎûµ 2: Îã§ÏñëÌïú Î™®Îç∏ ÏïôÏÉÅÎ∏î"""
        print("\nü§ù Ï†ÑÎûµ 2: Îã§ÏñëÌïú Î™®Îç∏ ÏïôÏÉÅÎ∏î")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # Îã§ÏñëÌïú Î™®Îç∏ ÏÉùÏÑ±
        models = {
            'xgb': xgb.XGBClassifier(
                n_estimators=120,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                scale_pos_weight=scale_pos_weight
            ),
            'lgb': lgb.LGBMClassifier(
                n_estimators=120,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                class_weight='balanced',
                verbose=-1
            ),
            'rf': RandomForestClassifier(
                n_estimators=120,
                max_depth=8,
                random_state=42,
                class_weight='balanced'
            )
        }
        
        # Í∞Å Î™®Îç∏ ÏÑ±Îä• ÌèâÍ∞Ä
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        model_scores = {}
        
        for name, model in models.items():
            scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
            model_scores[name] = scores.mean()
            print(f"  {name}: {scores.mean():.6f}")
            model.fit(X_train, y_train)
        
        # Í∞ÄÏ§ë ÏïôÏÉÅÎ∏î (ÏÑ±Îä• Í∏∞Î∞ò)
        total_score = sum(model_scores.values())
        weights = [score/total_score for score in model_scores.values()]
        
        print(f"  Í∞ÄÏ§ëÏπò: {dict(zip(model_scores.keys(), weights))}")
        
        # ÏòàÏ∏° Ìï©ÏÑ±
        predictions = np.zeros(len(X_test))
        for (name, model), weight in zip(models.items(), weights):
            pred = model.predict_proba(X_test)[:, 1]
            predictions += weight * pred
        
        final_predictions = (predictions > 0.5).astype(int)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        self.submissions['strategy_2'] = submission
        return submission
    
    def strategy_3_threshold_optimization(self):
        """Ï†ÑÎûµ 3: ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî"""
        print("\n‚öñÔ∏è Ï†ÑÎûµ 3: ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        model = xgb.XGBClassifier(
            n_estimators=150,
            max_depth=6,
            learning_rate=0.08,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        
        # 5-foldÎ°ú ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í Ï∞æÍ∏∞
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        best_threshold = 0.5
        best_f1 = 0
        
        thresholds = np.arange(0.3, 0.7, 0.02)
        
        for threshold in thresholds:
            f1_scores = []
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                model.fit(X_tr, y_tr)
                y_pred_proba = model.predict_proba(X_val)[:, 1]
                y_pred = (y_pred_proba >= threshold).astype(int)
                
                f1_scores.append(f1_score(y_val, y_pred))
            
            mean_f1 = np.mean(f1_scores)
            if mean_f1 > best_f1:
                best_f1 = mean_f1
                best_threshold = threshold
        
        print(f"  ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í: {best_threshold:.3f} (F1: {best_f1:.6f})")
        
        # Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞Î°ú ÌïôÏäµ ÌõÑ ÏòàÏ∏°
        model.fit(X_train, y_train)
        test_proba = model.predict_proba(X_test)[:, 1]
        test_predictions = (test_proba >= best_threshold).astype(int)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': test_predictions})
        self.submissions['strategy_3'] = submission
        return submission
    
    def strategy_4_feature_selection(self):
        """Ï†ÑÎûµ 4: ÌäπÏÑ± ÏÑ†ÌÉù ÏµúÏ†ÅÌôî"""
        print("\nüéØ Ï†ÑÎûµ 4: ÌäπÏÑ± ÏÑ†ÌÉù ÏµúÏ†ÅÌôî")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        # Í∏∞Î≥∏ Î™®Îç∏Î°ú ÌäπÏÑ± Ï§ëÏöîÎèÑ Í≥ÑÏÇ∞
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        base_model = xgb.XGBClassifier(
            n_estimators=100,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        base_model.fit(X_train, y_train)
        
        # ÌäπÏÑ± Ï§ëÏöîÎèÑ Í∏∞Î∞ò ÏÑ†ÌÉù
        feature_importance = base_model.feature_importances_
        feature_names = X_train.columns
        
        # ÏÉÅÏúÑ ÌäπÏÑ±Îì§Îßå ÏÑ†ÌÉù
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': feature_importance
        }).sort_values('importance', ascending=False)
        
        print("  ÏÉÅÏúÑ 10Í∞ú Ï§ëÏöî ÌäπÏÑ±:")
        for i, row in importance_df.head(10).iterrows():
            print(f"    {row['feature']}: {row['importance']:.4f}")
        
        # ÏÉÅÏúÑ 12Í∞ú ÌäπÏÑ±Îßå ÏÇ¨Ïö©
        top_features = importance_df.head(12)['feature'].tolist()
        X_train_selected = X_train[top_features]
        X_test_selected = X_test[top_features]
        
        # ÏµúÏ¢Ö Î™®Îç∏
        final_model = xgb.XGBClassifier(
            n_estimators=150,
            max_depth=6,
            learning_rate=0.08,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        
        # CV ÌèâÍ∞Ä
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(final_model, X_train_selected, y_train, cv=cv, scoring='f1')
        print(f"  ÏÑ†ÌÉùÎêú ÌäπÏÑ± CV F1: {cv_scores.mean():.6f}")
        
        final_model.fit(X_train_selected, y_train)
        predictions = final_model.predict(X_test_selected)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': predictions})
        self.submissions['strategy_4'] = submission
        return submission
    
    def strategy_5_meta_ensemble(self):
        """Ï†ÑÎûµ 5: Î©îÌÉÄ ÏïôÏÉÅÎ∏î (Ïó¨Îü¨ Ï†ÑÎûµ Ï°∞Ìï©)"""
        print("\nüîÆ Ï†ÑÎûµ 5: Î©îÌÉÄ ÏïôÏÉÅÎ∏î")
        
        if len(self.submissions) < 2:
            print("  ‚ùå Ï∂©Î∂ÑÌïú Ï†ÑÎûµÏù¥ Ïã§ÌñâÎêòÏßÄ ÏïäÏùå")
            return None
        
        # Î™®Îì† ÏòàÏ∏° Í≤∞Í≥º ÏàòÏßë
        all_predictions = []
        strategy_names = []
        
        for name, submission in self.submissions.items():
            all_predictions.append(submission['Cancer'].values)
            strategy_names.append(name)
        
        all_predictions = np.array(all_predictions)
        
        # Îã§ÏñëÌïú Ï°∞Ìï© ÏãúÎèÑ
        combinations = [
            ('majority_vote', np.round(np.mean(all_predictions, axis=0))),
            ('weighted_avg', np.round(np.average(all_predictions, axis=0, weights=[1.2, 1.0, 1.1, 0.9]))),
        ]
        
        print(f"  {len(strategy_names)}Í∞ú Ï†ÑÎûµ Ï°∞Ìï©:")
        for i, name in enumerate(strategy_names):
            print(f"    {i+1}. {name}")
        
        # Í∞Å Ï°∞Ìï© Í≤∞Í≥º
        test_ids = self.submissions[list(self.submissions.keys())[0]]['ID']
        
        for combo_name, combo_pred in combinations:
            submission = pd.DataFrame({'ID': test_ids, 'Cancer': combo_pred.astype(int)})
            self.submissions[f'meta_{combo_name}'] = submission
            
            # ÏòàÏ∏° Î∂ÑÌè¨
            pred_dist = pd.Series(combo_pred.astype(int)).value_counts()
            print(f"  {combo_name} ÏòàÏ∏° Î∂ÑÌè¨: 0={pred_dist.get(0,0)} 1={pred_dist.get(1,0)}")
        
        return self.submissions[f'meta_majority_vote']

def run_all_strategies():
    """Î™®Îì† Ï†ÑÎûµ Ïã§Ìñâ"""
    print("üöÄ ÎØ∏ÏÑ∏ Ï°∞Ï†ï ÏµúÏ†ÅÌôî ÏãúÏûë!")
    print("=" * 50)
    
    optimizer = MicroOptimizer()
    
    # Í∞Å Ï†ÑÎûµ Ïã§Ìñâ
    strategies = [
        optimizer.strategy_1_different_seeds,
        optimizer.strategy_2_ensemble_voting,
        optimizer.strategy_3_threshold_optimization,
        optimizer.strategy_4_feature_selection,
        optimizer.strategy_5_meta_ensemble
    ]
    
    for i, strategy in enumerate(strategies, 1):
        print(f"\n{'='*20} Ï†ÑÎûµ {i} {'='*20}")
        try:
            result = strategy()
            if result is not None:
                filename = f'submission_strategy_{i}.csv'
                result.to_csv(filename, index=False)
                print(f"  üíæ Ï†ÄÏû•: {filename}")
        except Exception as e:
            print(f"  ‚ùå Ï†ÑÎûµ {i} Ïã§Ìå®: {e}")
    
    # ÏµúÏ¢Ö Í∂åÏû• Ï†úÏ∂úÌååÏùº
    if 'meta_majority_vote' in optimizer.submissions:
        best_submission = optimizer.submissions['meta_majority_vote']
        best_submission.to_csv('final_optimized_submission.csv', index=False)
        print(f"\nüèÜ ÏµúÏ¢Ö Í∂åÏû• Ï†úÏ∂ú: final_optimized_submission.csv")
    
    print(f"\nüìã ÏÉùÏÑ±Îêú Ï†úÏ∂ú ÌååÏùºÎì§:")
    for name in optimizer.submissions.keys():
        print(f"  - {name}")
    
    print(f"\nüí° Ï†úÏ∂ú Í∂åÏû• ÏàúÏÑú:")
    print(f"1. final_optimized_submission.csv (Î©îÌÉÄ ÏïôÏÉÅÎ∏î)")
    print(f"2. submission_strategy_3.csv (ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî)")
    print(f"3. submission_strategy_1.csv (ÏµúÏ†Å ÏãúÎìú)")

if __name__ == "__main__":
    run_all_strategies()

üöÄ ÎØ∏ÏÑ∏ Ï°∞Ï†ï ÏµúÏ†ÅÌôî ÏãúÏûë!

üé≤ Ï†ÑÎûµ 1: Îã§ÏñëÌïú ÏãúÎìú ÌÖåÏä§Ìä∏
  ÏãúÎìú 42 ÌÖåÏä§Ìä∏ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.468266
  ÏãúÎìú 123 ÌÖåÏä§Ìä∏ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.467269
  ÏãúÎìú 456 ÌÖåÏä§Ìä∏ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.468154
  ÏãúÎìú 789 ÌÖåÏä§Ìä∏ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.467811
  ÏãúÎìú 999 ÌÖåÏä§Ìä∏ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.467157
  ‚úÖ ÏµúÍ≥† ÏãúÎìú: 42 (CV: 0.468266)
  üíæ Ï†ÄÏû•: submission_strategy_1.csv


ü§ù Ï†ÑÎûµ 2: Îã§ÏñëÌïú Î™®Îç∏ ÏïôÏÉÅÎ∏î


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  xgb: 0.473768
  lgb: 0.474828
  rf: 0.419316
  Í∞ÄÏ§ëÏπò: {'xgb': np.float64(0.34634365040266146), 'lgb': np.float64(0.34711890931523237), 'rf': np.float64(0.3065374402821061)}
  üíæ Ï†ÄÏû•: submission_strategy_2.csv


‚öñÔ∏è Ï†ÑÎûµ 3: ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í: 0.600 (F1: 0.484147)
  üíæ Ï†ÄÏû•: submission_strategy_3.csv


üéØ Ï†ÑÎûµ 4: ÌäπÏÑ± ÏÑ†ÌÉù ÏµúÏ†ÅÌôî


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏÉÅÏúÑ 10Í∞ú Ï§ëÏöî ÌäπÏÑ±:
    Family_Background: 0.2334
    Radiation_History: 0.1933
    Race: 0.1640
    Iodine_Deficiency: 0.1235
    Country: 0.0893
    T4_Result: 0.0237
    T3_Result: 0.0233
    Nodule_Size: 0.0231
    Age: 0.0227
    TSH_Result: 0.0222
  ÏÑ†ÌÉùÎêú ÌäπÏÑ± CV F1: 0.469882
  üíæ Ï†ÄÏû•: submission_strategy_4.csv


üîÆ Ï†ÑÎûµ 5: Î©îÌÉÄ ÏïôÏÉÅÎ∏î
  4Í∞ú Ï†ÑÎûµ Ï°∞Ìï©:
    1. strategy_1
    2. strategy_2
    3. strategy_3
    4. strategy_4
  majority_vote ÏòàÏ∏° Î∂ÑÌè¨: 0=40431 1=5773
  weighted_avg ÏòàÏ∏° Î∂ÑÌè¨: 0=40404 1=5800
  üíæ Ï†ÄÏû•: submission_strategy_5.csv

üèÜ ÏµúÏ¢Ö Í∂åÏû• Ï†úÏ∂ú: final_optimized_submission.csv

üìã ÏÉùÏÑ±Îêú Ï†úÏ∂ú ÌååÏùºÎì§:
  - strategy_1
  - strategy_2
  - strategy_3
  - strategy_4
  - meta_majority_vote
  - meta_weighted_avg

üí° Ï†úÏ∂ú Í∂åÏû• ÏàúÏÑú:
1. final_optimized_submission.csv (Î©îÌÉÄ ÏïôÏÉÅÎ∏î)
2. submission_strategy_3.csv (ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî)
3. submission_strategy_1.csv (ÏµúÏ†Å ÏãúÎìú)


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestClassifier

class FinalPushOptimizer:
    """0.51+ ÎèåÌååÎ•º ÏúÑÌïú ÏµúÏ¢Ö ÏµúÏ†ÅÌôî"""
    
    def __init__(self):
        self.submissions = {}
        
    def load_and_preprocess(self, random_seed=42):
        """Í∏∞Î≥∏ Ï†ÑÏ≤òÎ¶¨"""
        np.random.seed(random_seed)
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        # Ïπ¥ÌÖåÍ≥†Î¶¨Ïª¨ Ïù∏ÏΩîÎî©
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        # Í≤∞Ï∏°Í∞í Ï≤òÎ¶¨
        numeric_cols = X_train.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
        
        return X_train, y_train, X_test, test_df['ID']
    
    def strategy_hyperparameter_grid(self):
        """Ï†ÑÎûµ 1: ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ Í∑∏Î¶¨Îìú ÌÉêÏÉâ (Ï†ïÍµêÌïú ÌäúÎãù)"""
        print("‚öôÔ∏è Ï†ÑÎûµ 1: Ï†ïÍµêÌïú ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌäúÎãù")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # ÏÑ∏Î∞ÄÌïú ÌååÎùºÎØ∏ÌÑ∞ Í∑∏Î¶¨Îìú
        param_combinations = [
            # Ï°∞Ìï© 1: Î≥¥ÏàòÏ†Å
            {'n_estimators': 180, 'max_depth': 5, 'learning_rate': 0.07, 'subsample': 0.85, 'colsample_bytree': 0.85},
            # Ï°∞Ìï© 2: Í∑†Ìòï
            {'n_estimators': 160, 'max_depth': 6, 'learning_rate': 0.08, 'subsample': 0.8, 'colsample_bytree': 0.8},
            # Ï°∞Ìï© 3: Í≥µÍ≤©Ï†Å
            {'n_estimators': 140, 'max_depth': 7, 'learning_rate': 0.09, 'subsample': 0.75, 'colsample_bytree': 0.9},
            # Ï°∞Ìï© 4: ÍπäÏù¥ Ï§ëÏã¨
            {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.06, 'subsample': 0.9, 'colsample_bytree': 0.75},
        ]
        
        best_score = 0
        best_params = None
        best_predictions = None
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        for i, params in enumerate(param_combinations, 1):
            print(f"  Ï°∞Ìï© {i} ÌÖåÏä§Ìä∏: {params}")
            
            model = xgb.XGBClassifier(
                **params,
                random_state=42,
                scale_pos_weight=scale_pos_weight,
                reg_alpha=0.1,
                reg_lambda=0.1,
                eval_metric='logloss'
            )
            
            cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
            cv_mean = cv_scores.mean()
            
            print(f"    CV F1: {cv_mean:.6f}")
            
            if cv_mean > best_score:
                best_score = cv_mean
                best_params = params
                model.fit(X_train, y_train)
                best_predictions = model.predict(X_test)
        
        print(f"  ‚úÖ ÏµúÏ†Å Ï°∞Ìï©: {best_params}")
        print(f"  ‚úÖ ÏµúÍ≥† CV F1: {best_score:.6f}")
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': best_predictions})
        self.submissions['hyperparameter_tuned'] = submission
        return submission
    
    def strategy_multiple_seeds_ensemble(self):
        """Ï†ÑÎûµ 2: Ïó¨Îü¨ ÏãúÎìúÏùò ÏïôÏÉÅÎ∏î (10Í∞ú ÏãúÎìú)"""
        print("\nüé≤ Ï†ÑÎûµ 2: 10Í∞ú ÏãúÎìú ÏïôÏÉÅÎ∏î")
        
        seeds = [42, 123, 456, 789, 999, 1337, 2024, 555, 777, 2025]
        all_predictions = []
        
        for i, seed in enumerate(seeds, 1):
            print(f"  ÏãúÎìú {seed} ({i}/10) Ï≤òÎ¶¨ Ï§ë...")
            
            X_train, y_train, X_test, test_ids = self.load_and_preprocess(seed)
            
            pos_count = (y_train == 1).sum()
            neg_count = (y_train == 0).sum()
            scale_pos_weight = neg_count / pos_count
            
            # ÏµúÏ†ÅÌôîÎêú ÌååÎùºÎØ∏ÌÑ∞ ÏÇ¨Ïö©
            model = xgb.XGBClassifier(
                n_estimators=160,
                max_depth=6,
                learning_rate=0.08,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed,
                scale_pos_weight=scale_pos_weight,
                reg_alpha=0.1,
                reg_lambda=0.1
            )
            
            model.fit(X_train, y_train)
            pred_proba = model.predict_proba(X_test)[:, 1]
            all_predictions.append(pred_proba)
        
        # ÌèâÍ∑† ÌôïÎ•† Í≥ÑÏÇ∞
        avg_proba = np.mean(all_predictions, axis=0)
        final_predictions = (avg_proba > 0.5).astype(int)
        
        print(f"  ‚úÖ 10Í∞ú ÏãúÎìú ÏïôÏÉÅÎ∏î ÏôÑÎ£å")
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        self.submissions['multi_seed_ensemble'] = submission
        return submission
    
    def strategy_stacking_ensemble(self):
        """Ï†ÑÎûµ 3: Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î"""
        print("\nüèóÔ∏è Ï†ÑÎûµ 3: Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # Level 1 Î™®Îç∏Îì§
        models = {
            'xgb1': xgb.XGBClassifier(n_estimators=150, max_depth=5, learning_rate=0.08, random_state=42, scale_pos_weight=scale_pos_weight),
            'xgb2': xgb.XGBClassifier(n_estimators=180, max_depth=6, learning_rate=0.07, random_state=123, scale_pos_weight=scale_pos_weight),
            'lgb': lgb.LGBMClassifier(n_estimators=150, max_depth=6, learning_rate=0.08, random_state=42, class_weight='balanced', verbose=-1),
            'cat': cb.CatBoostClassifier(iterations=150, depth=6, learning_rate=0.08, random_state=42, verbose=False),
            'rf': RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42, class_weight='balanced')
        }
        
        # Cross-validationÏúºÎ°ú Level 1 ÏòàÏ∏° ÏÉùÏÑ±
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        oof_predictions = np.zeros((len(X_train), len(models)))
        test_predictions = np.zeros((len(X_test), len(models)))
        
        for i, (name, model) in enumerate(models.items()):
            print(f"  {name} Ï≤òÎ¶¨ Ï§ë...")
            
            oof_pred = np.zeros(len(X_train))
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                model.fit(X_tr, y_tr)
                oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
            
            oof_predictions[:, i] = oof_pred
            
            # Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞Î°ú Ïû¨ÌïôÏäµ
            model.fit(X_train, y_train)
            test_predictions[:, i] = model.predict_proba(X_test)[:, 1]
        
        # Level 2 Î™®Îç∏ (Î©îÌÉÄ Î™®Îç∏)
        from sklearn.linear_model import LogisticRegression
        meta_model = LogisticRegression(random_state=42, class_weight='balanced')
        meta_model.fit(oof_predictions, y_train)
        
        # ÏµúÏ¢Ö ÏòàÏ∏°
        final_proba = meta_model.predict_proba(test_predictions)[:, 1]
        final_predictions = (final_proba > 0.5).astype(int)
        
        print(f"  ‚úÖ Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î ÏôÑÎ£å")
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        self.submissions['stacking_ensemble'] = submission
        return submission
    
    def strategy_threshold_fine_tuning(self):
        """Ï†ÑÎûµ 4: Ï¥àÏ†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í Ï°∞Ï†ï"""
        print("\n‚öñÔ∏è Ï†ÑÎûµ 4: Ï¥àÏ†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í Ï°∞Ï†ï")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        model = xgb.XGBClassifier(
            n_estimators=160,
            max_depth=6,
            learning_rate=0.08,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        
        # Îß§Ïö∞ ÏÑ∏Î∞ÄÌïú ÏûÑÍ≥ÑÍ∞í ÌÉêÏÉâ
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        thresholds = np.arange(0.40, 0.60, 0.005)  # 0.005 Í∞ÑÍ≤©ÏúºÎ°ú ÏÑ∏Î∞ÄÌïòÍ≤å
        
        best_threshold = 0.5
        best_f1 = 0
        
        print(f"  {len(thresholds)}Í∞ú ÏûÑÍ≥ÑÍ∞í ÌÖåÏä§Ìä∏ Ï§ë...")
        
        for threshold in thresholds:
            f1_scores = []
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                model.fit(X_tr, y_tr)
                y_pred_proba = model.predict_proba(X_val)[:, 1]
                y_pred = (y_pred_proba >= threshold).astype(int)
                
                from sklearn.metrics import f1_score
                f1_scores.append(f1_score(y_val, y_pred))
            
            mean_f1 = np.mean(f1_scores)
            if mean_f1 > best_f1:
                best_f1 = mean_f1
                best_threshold = threshold
        
        print(f"  ‚úÖ ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í: {best_threshold:.4f} (CV F1: {best_f1:.6f})")
        
        # ÏµúÏ¢Ö ÏòàÏ∏°
        model.fit(X_train, y_train)
        test_proba = model.predict_proba(X_test)[:, 1]
        test_predictions = (test_proba >= best_threshold).astype(int)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': test_predictions})
        self.submissions['fine_threshold'] = submission
        return submission
    
    def strategy_ultimate_ensemble(self):
        """Ï†ÑÎûµ 5: Í∂ÅÍ∑πÏùò ÏïôÏÉÅÎ∏î (Î™®Îì† Ï†ÑÎûµ Í≤∞Ìï©)"""
        print("\nüèÜ Ï†ÑÎûµ 5: Í∂ÅÍ∑πÏùò ÏïôÏÉÅÎ∏î")
        
        if len(self.submissions) < 3:
            print("  ‚ùå Ï∂©Î∂ÑÌïú Ï†ÑÎûµÏù¥ Ïã§ÌñâÎêòÏßÄ ÏïäÏùå")
            return None
        
        all_predictions = []
        strategy_weights = {
            'hyperparameter_tuned': 1.3,    # ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌäúÎãù
            'multi_seed_ensemble': 1.2,     # Î©ÄÌã∞ ÏãúÎìú
            'stacking_ensemble': 1.4,       # Ïä§ÌÉúÌÇπ (Í∞ÄÏû• ÎÜíÏùÄ Í∞ÄÏ§ëÏπò)
            'fine_threshold': 1.1,          # Ï†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í
        }
        
        weights = []
        predictions = []
        
        for name, submission in self.submissions.items():
            if name in strategy_weights:
                predictions.append(submission['Cancer'].values)
                weights.append(strategy_weights[name])
                print(f"  {name}: Í∞ÄÏ§ëÏπò {strategy_weights[name]}")
        
        if len(predictions) == 0:
            print("  ‚ùå Ïú†Ìö®Ìïú Ï†ÑÎûµÏù¥ ÏóÜÏùå")
            return None
        
        # Í∞ÄÏ§ë ÌèâÍ∑†
        weighted_avg = np.average(predictions, axis=0, weights=weights)
        final_predictions = np.round(weighted_avg).astype(int)
        
        test_ids = list(self.submissions.values())[0]['ID']
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        
        # ÏòàÏ∏° Î∂ÑÌè¨ ÌôïÏù∏
        pred_dist = pd.Series(final_predictions).value_counts()
        print(f"  ÏµúÏ¢Ö ÏòàÏ∏° Î∂ÑÌè¨: 0={pred_dist.get(0,0)} 1={pred_dist.get(1,0)}")
        
        self.submissions['ultimate_ensemble'] = submission
        return submission

def run_final_optimization():
    """ÏµúÏ¢Ö ÏµúÏ†ÅÌôî Ïã§Ìñâ"""
    print("üî• 0.51+ ÎèåÌååÎ•º ÏúÑÌïú ÏµúÏ¢Ö ÏµúÏ†ÅÌôî!")
    print("=" * 60)
    
    optimizer = FinalPushOptimizer()
    
    strategies = [
        ("ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌäúÎãù", optimizer.strategy_hyperparameter_grid),
        ("Î©ÄÌã∞ ÏãúÎìú ÏïôÏÉÅÎ∏î", optimizer.strategy_multiple_seeds_ensemble),
        ("Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î", optimizer.strategy_stacking_ensemble),
        ("Ï†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í Ï°∞Ï†ï", optimizer.strategy_threshold_fine_tuning),
        ("Í∂ÅÍ∑πÏùò ÏïôÏÉÅÎ∏î", optimizer.strategy_ultimate_ensemble)
    ]
    
    for i, (name, strategy_func) in enumerate(strategies, 1):
        print(f"\n{'='*15} {name} {'='*15}")
        try:
            result = strategy_func()
            if result is not None:
                filename = f'final_push_{i}.csv'
                result.to_csv(filename, index=False)
                print(f"  üíæ Ï†ÄÏû•: {filename}")
        except Exception as e:
            print(f"  ‚ùå {name} Ïã§Ìå®: {e}")
    
    # ÏµúÏ¢Ö Í∂åÏû• Ï†úÏ∂ú
    if 'ultimate_ensemble' in optimizer.submissions:
        best_submission = optimizer.submissions['ultimate_ensemble']
        best_submission.to_csv('ULTIMATE_SUBMISSION.csv', index=False)
        print(f"\nüèÜ ÏµúÏ¢Ö Ï∂îÏ≤ú: ULTIMATE_SUBMISSION.csv")
    
    print(f"\nüìã ÏÉùÏÑ±Îêú ÌååÏùºÎì§:")
    print(f"1. ULTIMATE_SUBMISSION.csv ‚≠ê (1ÏàúÏúÑ Ï†úÏ∂ú)")
    print(f"2. final_push_3.csv (Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î)")  
    print(f"3. final_push_2.csv (Î©ÄÌã∞ ÏãúÎìú)")
    print(f"4. final_push_4.csv (Ï†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í)")
    
    print(f"\nüéØ Î™©Ìëú: 0.510+ Îã¨ÏÑ±!")
    print(f"ÌòÑÏû¨ Ï∂îÏÑ∏Î°ú Î≥¥Î©¥ Ï∂©Î∂ÑÌûà Í∞ÄÎä•Ìï©ÎãàÎã§! üöÄ")

if __name__ == "__main__":
    run_final_optimization()

üî• 0.51+ ÎèåÌååÎ•º ÏúÑÌïú ÏµúÏ¢Ö ÏµúÏ†ÅÌôî!

‚öôÔ∏è Ï†ÑÎûµ 1: Ï†ïÍµêÌïú ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌäúÎãù


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  Ï°∞Ìï© 1 ÌÖåÏä§Ìä∏: {'n_estimators': 180, 'max_depth': 5, 'learning_rate': 0.07, 'subsample': 0.85, 'colsample_bytree': 0.85}
    CV F1: 0.474767
  Ï°∞Ìï© 2 ÌÖåÏä§Ìä∏: {'n_estimators': 160, 'max_depth': 6, 'learning_rate': 0.08, 'subsample': 0.8, 'colsample_bytree': 0.8}
    CV F1: 0.467921
  Ï°∞Ìï© 3 ÌÖåÏä§Ìä∏: {'n_estimators': 140, 'max_depth': 7, 'learning_rate': 0.09, 'subsample': 0.75, 'colsample_bytree': 0.9}
    CV F1: 0.456768
  Ï°∞Ìï© 4 ÌÖåÏä§Ìä∏: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.06, 'subsample': 0.9, 'colsample_bytree': 0.75}
    CV F1: 0.481595
  ‚úÖ ÏµúÏ†Å Ï°∞Ìï©: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.06, 'subsample': 0.9, 'colsample_bytree': 0.75}
  ‚úÖ ÏµúÍ≥† CV F1: 0.481595
  üíæ Ï†ÄÏû•: final_push_1.csv


üé≤ Ï†ÑÎûµ 2: 10Í∞ú ÏãúÎìú ÏïôÏÉÅÎ∏î
  ÏãúÎìú 42 (1/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 123 (2/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 456 (3/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 789 (4/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 999 (5/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 1337 (6/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 2024 (7/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 555 (8/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 777 (9/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÏãúÎìú 2025 (10/10) Ï≤òÎ¶¨ Ï§ë...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ‚úÖ 10Í∞ú ÏãúÎìú ÏïôÏÉÅÎ∏î ÏôÑÎ£å
  üíæ Ï†ÄÏû•: final_push_2.csv


üèóÔ∏è Ï†ÑÎûµ 3: Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  xgb1 Ï≤òÎ¶¨ Ï§ë...
  xgb2 Ï≤òÎ¶¨ Ï§ë...
  lgb Ï≤òÎ¶¨ Ï§ë...
  cat Ï≤òÎ¶¨ Ï§ë...
  rf Ï≤òÎ¶¨ Ï§ë...
  ‚úÖ Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î ÏôÑÎ£å
  üíæ Ï†ÄÏû•: final_push_3.csv


‚öñÔ∏è Ï†ÑÎûµ 4: Ï¥àÏ†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í Ï°∞Ï†ï


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  40Í∞ú ÏûÑÍ≥ÑÍ∞í ÌÖåÏä§Ìä∏ Ï§ë...
  ‚úÖ ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í: 0.5950 (CV F1: 0.483568)
  üíæ Ï†ÄÏû•: final_push_4.csv


üèÜ Ï†ÑÎûµ 5: Í∂ÅÍ∑πÏùò ÏïôÏÉÅÎ∏î
  hyperparameter_tuned: Í∞ÄÏ§ëÏπò 1.3
  multi_seed_ensemble: Í∞ÄÏ§ëÏπò 1.2
  stacking_ensemble: Í∞ÄÏ§ëÏπò 1.4
  fine_threshold: Í∞ÄÏ§ëÏπò 1.1
  ÏµúÏ¢Ö ÏòàÏ∏° Î∂ÑÌè¨: 0=40445 1=5759
  üíæ Ï†ÄÏû•: final_push_5.csv

üèÜ ÏµúÏ¢Ö Ï∂îÏ≤ú: ULTIMATE_SUBMISSION.csv

üìã ÏÉùÏÑ±Îêú ÌååÏùºÎì§:
1. ULTIMATE_SUBMISSION.csv ‚≠ê (1ÏàúÏúÑ Ï†úÏ∂ú)
2. final_push_3.csv (Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î)
3. final_push_2.csv (Î©ÄÌã∞ ÏãúÎìú)
4. final_push_4.csv (Ï†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í)

üéØ Î™©Ìëú: 0.510+ Îã¨ÏÑ±!
ÌòÑÏû¨ Ï∂îÏÑ∏Î°ú Î≥¥Î©¥ Ï∂©Î∂ÑÌûà Í∞ÄÎä•Ìï©ÎãàÎã§! üöÄ


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neural_network import MLPClassifier

class FirstPlaceOptimizer:
    """1Îì± ÌÉàÌôòÏùÑ ÏúÑÌïú Í∑πÌïú ÏµúÏ†ÅÌôî"""
    
    def __init__(self):
        self.submissions = {}
        
    def load_and_preprocess(self, random_seed=42):
        """Í∏∞Î≥∏ Ï†ÑÏ≤òÎ¶¨"""
        np.random.seed(random_seed)
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        # Ïπ¥ÌÖåÍ≥†Î¶¨Ïª¨ Ïù∏ÏΩîÎî©
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        # Í≤∞Ï∏°Í∞í Ï≤òÎ¶¨
        numeric_cols = X_train.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
        
        return X_train, y_train, X_test, test_df['ID']
    
    def enhanced_stacking_v1(self):
        """Í∞ïÌôîÎêú Ïä§ÌÉúÌÇπ v1: Îçî ÎßéÏùÄ Î™®Îç∏ + Îã§ÏñëÌïú ÌååÎùºÎØ∏ÌÑ∞"""
        print("üèóÔ∏è Í∞ïÌôîÎêú Ïä§ÌÉúÌÇπ v1: Î™®Îç∏ Îã§ÏñëÏÑ± Í∑πÎåÄÌôî")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # Level 1: Îçî Îã§ÏñëÌïú Î™®Îç∏Îì§ (8Í∞ú)
        models = {
            'xgb_conservative': xgb.XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.06, subsample=0.9, colsample_bytree=0.8, random_state=42, scale_pos_weight=scale_pos_weight),
            'xgb_balanced': xgb.XGBClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, subsample=0.8, colsample_bytree=0.8, random_state=123, scale_pos_weight=scale_pos_weight),
            'xgb_aggressive': xgb.XGBClassifier(n_estimators=120, max_depth=8, learning_rate=0.1, subsample=0.75, colsample_bytree=0.9, random_state=456, scale_pos_weight=scale_pos_weight),
            
            'lgb_conservative': lgb.LGBMClassifier(n_estimators=200, max_depth=4, learning_rate=0.06, random_state=42, class_weight='balanced', verbose=-1),
            'lgb_balanced': lgb.LGBMClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=123, class_weight='balanced', verbose=-1),
            
            'cat_tuned': cb.CatBoostClassifier(iterations=180, depth=6, learning_rate=0.07, random_state=42, verbose=False),
            
            'rf_deep': RandomForestClassifier(n_estimators=200, max_depth=12, random_state=42, class_weight='balanced', min_samples_split=5),
            'extra_trees': ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42, class_weight='balanced')
        }
        
        print(f"  Level 1: {len(models)}Í∞ú Î™®Îç∏ ÌõàÎ†®")
        
        # Cross-validation
        cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)  # 7-foldÎ°ú Ï¶ùÍ∞Ä
        oof_predictions = np.zeros((len(X_train), len(models)))
        test_predictions = np.zeros((len(X_test), len(models)))
        
        for i, (name, model) in enumerate(models.items()):
            print(f"    {name} Ï≤òÎ¶¨ Ï§ë...")
            
            oof_pred = np.zeros(len(X_train))
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                model.fit(X_tr, y_tr)
                oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
            
            oof_predictions[:, i] = oof_pred
            
            # Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞Î°ú Ïû¨ÌïôÏäµ
            model.fit(X_train, y_train)
            test_predictions[:, i] = model.predict_proba(X_test)[:, 1]
        
        # Level 2: Î©îÌÉÄ Î™®Îç∏ (Logistic Regression)
        meta_model = LogisticRegression(random_state=42, class_weight='balanced', C=0.1)
        meta_model.fit(oof_predictions, y_train)
        
        # ÏµúÏ¢Ö ÏòàÏ∏°
        final_proba = meta_model.predict_proba(test_predictions)[:, 1]
        final_predictions = (final_proba > 0.5).astype(int)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        self.submissions['enhanced_stacking_v1'] = submission
        return submission
    
    def enhanced_stacking_v2(self):
        """Í∞ïÌôîÎêú Ïä§ÌÉúÌÇπ v2: Îã§ÏñëÌïú Î©îÌÉÄ Î™®Îç∏"""
        print("\nüß† Í∞ïÌôîÎêú Ïä§ÌÉúÌÇπ v2: Îã§ÏñëÌïú Î©îÌÉÄ Î™®Îç∏")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # Level 1: Í≤ÄÏ¶ùÎêú Î™®Îç∏Îì§
        models = {
            'xgb1': xgb.XGBClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, scale_pos_weight=scale_pos_weight),
            'xgb2': xgb.XGBClassifier(n_estimators=180, max_depth=5, learning_rate=0.07, random_state=123, scale_pos_weight=scale_pos_weight),
            'lgb': lgb.LGBMClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, class_weight='balanced', verbose=-1),
            'cat': cb.CatBoostClassifier(iterations=160, depth=6, learning_rate=0.08, random_state=42, verbose=False),
            'rf': RandomForestClassifier(n_estimators=160, max_depth=10, random_state=42, class_weight='balanced')
        }
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        oof_predictions = np.zeros((len(X_train), len(models)))
        test_predictions = np.zeros((len(X_test), len(models)))
        
        for i, (name, model) in enumerate(models.items()):
            oof_pred = np.zeros(len(X_train))
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                model.fit(X_tr, y_tr)
                oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
            
            oof_predictions[:, i] = oof_pred
            model.fit(X_train, y_train)
            test_predictions[:, i] = model.predict_proba(X_test)[:, 1]
        
        # Îã§ÏñëÌïú Î©îÌÉÄ Î™®Îç∏ ÏãúÎèÑ
        meta_models = {
            'logistic': LogisticRegression(random_state=42, class_weight='balanced', C=0.1),
            'ridge': RidgeClassifier(random_state=42, class_weight='balanced'),
            'xgb_meta': xgb.XGBClassifier(n_estimators=50, max_depth=3, learning_rate=0.1, random_state=42, scale_pos_weight=scale_pos_weight)
        }
        
        best_meta_score = 0
        best_meta_model = None
        best_meta_name = ""
        
        cv_meta = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        for meta_name, meta_model in meta_models.items():
            scores = cross_val_score(meta_model, oof_predictions, y_train, cv=cv_meta, scoring='f1')
            score = scores.mean()
            print(f"    {meta_name}: CV F1 = {score:.6f}")
            
            if score > best_meta_score:
                best_meta_score = score
                best_meta_model = meta_model
                best_meta_name = meta_name
        
        print(f"  ‚úÖ ÏµúÍ≥† Î©îÌÉÄ Î™®Îç∏: {best_meta_name} (F1: {best_meta_score:.6f})")
        
        # ÏµúÍ≥† Î©îÌÉÄ Î™®Îç∏Î°ú ÏµúÏ¢Ö ÏòàÏ∏°
        best_meta_model.fit(oof_predictions, y_train)
        
        if hasattr(best_meta_model, 'predict_proba'):
            final_proba = best_meta_model.predict_proba(test_predictions)[:, 1]
            final_predictions = (final_proba > 0.5).astype(int)
        else:
            final_predictions = best_meta_model.predict(test_predictions)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        self.submissions['enhanced_stacking_v2'] = submission
        return submission
    
    def stacking_with_threshold_optimization(self):
        """Ïä§ÌÉúÌÇπ + ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî"""
        print("\n‚öñÔ∏è Ïä§ÌÉúÌÇπ + Ï†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # ÏµúÍ≥† ÏÑ±Îä• Î™®Îç∏Îì§Î°ú Ïä§ÌÉúÌÇπ
        models = {
            'xgb_best': xgb.XGBClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, subsample=0.8, colsample_bytree=0.8, random_state=42, scale_pos_weight=scale_pos_weight),
            'lgb_best': lgb.LGBMClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, class_weight='balanced', verbose=-1),
            'cat_best': cb.CatBoostClassifier(iterations=160, depth=6, learning_rate=0.08, random_state=42, verbose=False),
            'xgb_variant': xgb.XGBClassifier(n_estimators=180, max_depth=5, learning_rate=0.07, subsample=0.85, colsample_bytree=0.85, random_state=123, scale_pos_weight=scale_pos_weight)
        }
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        oof_predictions = np.zeros((len(X_train), len(models)))
        test_predictions = np.zeros((len(X_test), len(models)))
        
        for i, (name, model) in enumerate(models.items()):
            oof_pred = np.zeros(len(X_train))
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                model.fit(X_tr, y_tr)
                oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
            
            oof_predictions[:, i] = oof_pred
            model.fit(X_train, y_train)
            test_predictions[:, i] = model.predict_proba(X_test)[:, 1]
        
        # Î©îÌÉÄ Î™®Îç∏
        meta_model = LogisticRegression(random_state=42, class_weight='balanced', C=0.1)
        meta_model.fit(oof_predictions, y_train)
        
        # Ï¥àÏ†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî
        thresholds = np.arange(0.45, 0.55, 0.002)  # 0.002 Í∞ÑÍ≤©
        best_threshold = 0.5
        best_f1 = 0
        
        print(f"  {len(thresholds)}Í∞ú ÏûÑÍ≥ÑÍ∞íÏúºÎ°ú Ï†ïÎ∞Ä Ï°∞Ï†ï...")
        
        cv_threshold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        for threshold in thresholds:
            f1_scores = []
            
            for train_idx, val_idx in cv_threshold.split(oof_predictions, y_train):
                oof_tr, oof_val = oof_predictions[train_idx], oof_predictions[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                meta_model.fit(oof_tr, y_tr)
                val_proba = meta_model.predict_proba(oof_val)[:, 1]
                val_pred = (val_proba >= threshold).astype(int)
                
                f1_scores.append(f1_score(y_val, val_pred))
            
            mean_f1 = np.mean(f1_scores)
            if mean_f1 > best_f1:
                best_f1 = mean_f1
                best_threshold = threshold
        
        print(f"  ‚úÖ ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í: {best_threshold:.4f} (CV F1: {best_f1:.6f})")
        
        # ÏµúÏ¢Ö ÏòàÏ∏°
        final_proba = meta_model.predict_proba(test_predictions)[:, 1]
        final_predictions = (final_proba >= best_threshold).astype(int)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        self.submissions['stacking_threshold'] = submission
        return submission
    
    def ultimate_first_place_ensemble(self):
        """Í∂ÅÍ∑πÏùò 1Îì± ÏïôÏÉÅÎ∏î"""
        print("\nüèÜ Í∂ÅÍ∑πÏùò 1Îì± ÏïôÏÉÅÎ∏î")
        
        if len(self.submissions) < 2:
            print("  ‚ùå Ï∂©Î∂ÑÌïú Ï†ÑÎûµÏù¥ Ïã§ÌñâÎêòÏßÄ ÏïäÏùå")
            return None
        
        # Í∞Å Ï†ÑÎûµÎ≥Ñ Í∞ÄÏ§ëÏπò (ÏÑ±Îä• Í∏∞Î∞ò Ï∂îÏ†ï)
        strategy_weights = {
            'enhanced_stacking_v1': 1.3,
            'enhanced_stacking_v2': 1.4,    # Í∞ÄÏû• ÎÜíÏùÄ Í∞ÄÏ§ëÏπò
            'stacking_threshold': 1.2,
        }
        
        predictions = []
        weights = []
        
        for name, submission in self.submissions.items():
            if name in strategy_weights:
                predictions.append(submission['Cancer'].values)
                weights.append(strategy_weights[name])
                print(f"  {name}: Í∞ÄÏ§ëÏπò {strategy_weights[name]}")
        
        if len(predictions) == 0:
            print("  ‚ùå Ïú†Ìö®Ìïú Ï†ÑÎûµÏù¥ ÏóÜÏùå")
            return None
        
        # Í∞ÄÏ§ë ÌèâÍ∑† + ÎØ∏ÏÑ∏ Ï°∞Ï†ï
        weighted_avg = np.average(predictions, axis=0, weights=weights)
        
        # ÏÜåÏàòÏ†ê Ï≤òÎ¶¨Î•º ÏúÑÌïú ÎØ∏ÏÑ∏ Ï°∞Ï†ï
        adjusted_predictions = np.where(weighted_avg >= 0.5, 1, 0)
        
        test_ids = list(self.submissions.values())[0]['ID']
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': adjusted_predictions})
        
        # ÏòàÏ∏° Î∂ÑÌè¨
        pred_dist = pd.Series(adjusted_predictions).value_counts()
        print(f"  ÏµúÏ¢Ö ÏòàÏ∏° Î∂ÑÌè¨: 0={pred_dist.get(0,0)} 1={pred_dist.get(1,0)}")
        
        self.submissions['ultimate_first_place'] = submission
        return submission

def run_first_place_optimization():
    """1Îì± ÌÉàÌôò ÏµúÏ†ÅÌôî Ïã§Ìñâ"""
    print("ü•á 1Îì± ÌÉàÌôò ÏûëÏ†Ñ Í∞úÏãú!")
    print("ÌòÑÏû¨ Ï†êÏàò: 0.5109 vs 1Îì±: 0.5117 (Ï∞®Ïù¥: 0.0008)")
    print("=" * 60)
    
    optimizer = FirstPlaceOptimizer()
    
    strategies = [
        ("Í∞ïÌôîÎêú Ïä§ÌÉúÌÇπ v1", optimizer.enhanced_stacking_v1),
        ("Í∞ïÌôîÎêú Ïä§ÌÉúÌÇπ v2", optimizer.enhanced_stacking_v2),
        ("Ïä§ÌÉúÌÇπ+ÏûÑÍ≥ÑÍ∞í", optimizer.stacking_with_threshold_optimization),
        ("Í∂ÅÍ∑πÏùò ÏïôÏÉÅÎ∏î", optimizer.ultimate_first_place_ensemble)
    ]
    
    for i, (name, strategy_func) in enumerate(strategies, 1):
        print(f"\n{'='*15} {name} {'='*15}")
        try:
            result = strategy_func()
            if result is not None:
                filename = f'first_place_{i}.csv'
                result.to_csv(filename, index=False)
                print(f"  üíæ Ï†ÄÏû•: {filename}")
        except Exception as e:
            print(f"  ‚ùå {name} Ïã§Ìå®: {e}")
    
    # ÏµúÏ¢Ö 1Îì± ÎèÑÏ†Ñ ÌååÏùº
    if 'ultimate_first_place' in optimizer.submissions:
        best_submission = optimizer.submissions['ultimate_first_place']
        best_submission.to_csv('FIRST_PLACE_CHALLENGE.csv', index=False)
        print(f"\nüèÜ 1Îì± ÎèÑÏ†Ñ: FIRST_PLACE_CHALLENGE.csv")
    
    print(f"\nüéØ Ï†úÏ∂ú Ïö∞ÏÑ†ÏàúÏúÑ:")
    print(f"1. FIRST_PLACE_CHALLENGE.csv ‚≠ê")
    print(f"2. first_place_2.csv (Îã§ÏñëÌïú Î©îÌÉÄÎ™®Îç∏)")
    print(f"3. first_place_3.csv (Ïä§ÌÉúÌÇπ+ÏûÑÍ≥ÑÍ∞í)")
    print(f"4. first_place_1.csv (8Í∞ú Î™®Îç∏ Ïä§ÌÉúÌÇπ)")
    
    print(f"\nüî• Î™©Ìëú: 0.5118+ (1Îì± ÌÉàÌôò!)")
    print(f"ÌòÑÏû¨ Ï∞®Ïù¥ 0.0008ÏùÄ Ï∂©Î∂ÑÌûà Í∑πÎ≥µ Í∞ÄÎä•Ìï©ÎãàÎã§!")

if __name__ == "__main__":
    run_first_place_optimization()

ü•á 1Îì± ÌÉàÌôò ÏûëÏ†Ñ Í∞úÏãú!
ÌòÑÏû¨ Ï†êÏàò: 0.5109 vs 1Îì±: 0.5117 (Ï∞®Ïù¥: 0.0008)

üèóÔ∏è Í∞ïÌôîÎêú Ïä§ÌÉúÌÇπ v1: Î™®Îç∏ Îã§ÏñëÏÑ± Í∑πÎåÄÌôî


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  Level 1: 8Í∞ú Î™®Îç∏ ÌõàÎ†®
    xgb_conservative Ï≤òÎ¶¨ Ï§ë...
    xgb_balanced Ï≤òÎ¶¨ Ï§ë...
    xgb_aggressive Ï≤òÎ¶¨ Ï§ë...
    lgb_conservative Ï≤òÎ¶¨ Ï§ë...
    lgb_balanced Ï≤òÎ¶¨ Ï§ë...
    cat_tuned Ï≤òÎ¶¨ Ï§ë...
    rf_deep Ï≤òÎ¶¨ Ï§ë...
    extra_trees Ï≤òÎ¶¨ Ï§ë...
  üíæ Ï†ÄÏû•: first_place_1.csv


üß† Í∞ïÌôîÎêú Ïä§ÌÉúÌÇπ v2: Îã§ÏñëÌïú Î©îÌÉÄ Î™®Îç∏


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    logistic: CV F1 = 0.485791
    ridge: CV F1 = 0.486837
    xgb_meta: CV F1 = 0.486023
  ‚úÖ ÏµúÍ≥† Î©îÌÉÄ Î™®Îç∏: ridge (F1: 0.486837)
  üíæ Ï†ÄÏû•: first_place_2.csv


‚öñÔ∏è Ïä§ÌÉúÌÇπ + Ï†ïÎ∞Ä ÏûÑÍ≥ÑÍ∞í ÏµúÏ†ÅÌôî


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  51Í∞ú ÏûÑÍ≥ÑÍ∞íÏúºÎ°ú Ï†ïÎ∞Ä Ï°∞Ï†ï...
  ‚úÖ ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í: 0.5260 (CV F1: 0.487005)
  üíæ Ï†ÄÏû•: first_place_3.csv


üèÜ Í∂ÅÍ∑πÏùò 1Îì± ÏïôÏÉÅÎ∏î
  enhanced_stacking_v1: Í∞ÄÏ§ëÏπò 1.3
  enhanced_stacking_v2: Í∞ÄÏ§ëÏπò 1.4
  stacking_threshold: Í∞ÄÏ§ëÏπò 1.2
  ÏµúÏ¢Ö ÏòàÏ∏° Î∂ÑÌè¨: 0=40452 1=5752
  üíæ Ï†ÄÏû•: first_place_4.csv

üèÜ 1Îì± ÎèÑÏ†Ñ: FIRST_PLACE_CHALLENGE.csv

üéØ Ï†úÏ∂ú Ïö∞ÏÑ†ÏàúÏúÑ:
1. FIRST_PLACE_CHALLENGE.csv ‚≠ê
2. first_place_2.csv (Îã§ÏñëÌïú Î©îÌÉÄÎ™®Îç∏)
3. first_place_3.csv (Ïä§ÌÉúÌÇπ+ÏûÑÍ≥ÑÍ∞í)
4. first_place_1.csv (8Í∞ú Î™®Îç∏ Ïä§ÌÉúÌÇπ)

üî• Î™©Ìëú: 0.5118+ (1Îì± ÌÉàÌôò!)
ÌòÑÏû¨ Ï∞®Ïù¥ 0.0008ÏùÄ Ï∂©Î∂ÑÌûà Í∑πÎ≥µ Í∞ÄÎä•Ìï©ÎãàÎã§!


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import f1_score

class BreakthroughOptimizer:
    """0.5109 Î≤ΩÏùÑ Îö´Í∏∞ ÏúÑÌïú ÏôÑÏ†ÑÌûà Îã§Î•∏ Ï†ëÍ∑ºÎ≤ï"""
    
    def __init__(self):
        self.submissions = {}
        
    def strategy_1_minimal_simple(self):
        """Ï†ÑÎûµ 1: Í∑πÎèÑÎ°ú Îã®ÏàúÌïú Î™®Îç∏ (Ïó≠ÏÑ§Ï†Å Ï†ëÍ∑º)"""
        print("üéØ Ï†ÑÎûµ 1: Í∑πÎã®Ï†Å Îã®ÏàúÌôî")
        print("  Î≥µÏû°Ìï®ÏùÑ Î≤ÑÎ¶¨Í≥† Î≥∏ÏßàÏóê ÏßëÏ§ë!")
        
        # Í∏∞Î≥∏ Îç∞Ïù¥ÌÑ∞ Î°úÎî©
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        # ÏµúÏÜåÌïúÏùò Ï†ÑÏ≤òÎ¶¨Îßå
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        # Í≤∞Ï∏°Í∞í Ï≤òÎ¶¨
        for col in X_train.columns:
            if X_train[col].dtype in ['float64', 'int64']:
                median_val = X_train[col].median()
                X_train[col].fillna(median_val, inplace=True)
                X_test[col].fillna(median_val, inplace=True)
        
        # Í∑πÎèÑÎ°ú Îã®ÏàúÌïú XGBoost
        print("  Îã®ÏàúÌïú XGBoost ÌõàÎ†®...")
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # ÏùòÎèÑÏ†ÅÏúºÎ°ú Îã®ÏàúÌïú ÌååÎùºÎØ∏ÌÑ∞
        simple_model = xgb.XGBClassifier(
            n_estimators=100,          # Ï†ÅÏùÄ Ìä∏Î¶¨
            max_depth=4,               # ÏñïÏùÄ ÍπäÏù¥
            learning_rate=0.1,         # Í∏∞Î≥∏Í∞í
            random_state=777,          # Îã§Î•∏ ÏãúÎìú
            scale_pos_weight=scale_pos_weight,
            subsample=1.0,             # ÏÑúÎ∏åÏÉòÌîåÎßÅ ÏóÜÏùå
            colsample_bytree=1.0       # Ï†ÑÏ≤¥ ÌäπÏÑ± ÏÇ¨Ïö©
        )
        
        # CV ÌèâÍ∞Ä
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
        cv_scores = []
        
        for train_idx, val_idx in cv.split(X_train, y_train):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            simple_model.fit(X_tr, y_tr)
            y_pred = simple_model.predict(X_val)
            cv_scores.append(f1_score(y_val, y_pred))
        
        cv_mean = np.mean(cv_scores)
        print(f"  Îã®Ïàú Î™®Îç∏ CV F1: {cv_mean:.6f}")
        
        # ÏµúÏ¢Ö ÏòàÏ∏°
        simple_model.fit(X_train, y_train)
        predictions = simple_model.predict(X_test)
        
        submission = pd.DataFrame({'ID': test_df['ID'], 'Cancer': predictions})
        self.submissions['minimal_simple'] = submission
        return submission
    
    def strategy_2_feature_selection_aggressive(self):
        """Ï†ÑÎûµ 2: Í≥µÍ≤©Ï†Å ÌäπÏÑ± ÏÑ†ÌÉù (ÌïµÏã¨Îßå ÎÇ®Í∏∞Í∏∞)"""
        print("\nüî• Ï†ÑÎûµ 2: Í≥µÍ≤©Ï†Å ÌäπÏÑ± ÏÑ†ÌÉù")
        print("  ÎÖ∏Ïù¥Ï¶à ÌäπÏÑ±Îì§ÏùÑ Í≥ºÍ∞êÌûà Ï†úÍ±∞!")
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        # Í∏∞Î≥∏ Ï†ÑÏ≤òÎ¶¨
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        for col in X_train.columns:
            if X_train[col].dtype in ['float64', 'int64']:
                median_val = X_train[col].median()
                X_train[col].fillna(median_val, inplace=True)
                X_test[col].fillna(median_val, inplace=True)
        
        # ÌäπÏÑ± Ï§ëÏöîÎèÑÎ°ú ÏÑ†ÌÉù
        print("  ÌäπÏÑ± Ï§ëÏöîÎèÑ Í≥ÑÏÇ∞ Ï§ë...")
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        importance_model = xgb.XGBClassifier(
            n_estimators=100,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        importance_model.fit(X_train, y_train)
        
        # ÌäπÏÑ± Ï§ëÏöîÎèÑ Í≥ÑÏÇ∞
        feature_importance = importance_model.feature_importances_
        feature_names = X_train.columns
        
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': feature_importance
        }).sort_values('importance', ascending=False)
        
        # ÏÉÅÏúÑ 8Í∞ú ÌäπÏÑ±Îßå ÏÑ†ÌÉù (Í≥ºÍ∞êÌïòÍ≤å Ï§ÑÏûÑ)
        top_features = importance_df.head(8)['feature'].tolist()
        
        print(f"  ÏÑ†ÌÉùÎêú ÌïµÏã¨ ÌäπÏÑ± 8Í∞ú:")
        for i, feature in enumerate(top_features, 1):
            importance = importance_df[importance_df['feature'] == feature]['importance'].iloc[0]
            print(f"    {i}. {feature}: {importance:.4f}")
        
        X_train_selected = X_train[top_features]
        X_test_selected = X_test[top_features]
        
        # ÏÑ†ÌÉùÎêú ÌäπÏÑ±ÏúºÎ°ú Î™®Îç∏ ÌõàÎ†®
        final_model = xgb.XGBClassifier(
            n_estimators=150,
            max_depth=5,
            learning_rate=0.08,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        
        final_model.fit(X_train_selected, y_train)
        predictions = final_model.predict(X_test_selected)
        
        submission = pd.DataFrame({'ID': test_df['ID'], 'Cancer': predictions})
        self.submissions['feature_selected'] = submission
        return submission
    
    def strategy_3_probability_calibration(self):
        """Ï†ÑÎûµ 3: ÌôïÎ•† Î≥¥Ï†ï + ÎØ∏ÏÑ∏ ÏûÑÍ≥ÑÍ∞í Ï°∞Ï†ï"""
        print("\n‚öñÔ∏è Ï†ÑÎûµ 3: ÌôïÎ•† Î≥¥Ï†ï + Í∑πÌïú ÏûÑÍ≥ÑÍ∞í")
        print("  ÌôïÎ•†ÏùÑ Î≥¥Ï†ïÌïòÍ≥† 0.0001 Îã®ÏúÑÎ°ú ÏûÑÍ≥ÑÍ∞í Ï°∞Ï†ï!")
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        # Í∏∞Î≥∏ Ï†ÑÏ≤òÎ¶¨
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        for col in X_train.columns:
            if X_train[col].dtype in ['float64', 'int64']:
                median_val = X_train[col].median()
                X_train[col].fillna(median_val, inplace=True)
                X_test[col].fillna(median_val, inplace=True)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # Í∏∞Î≥∏ Î™®Îç∏
        base_model = xgb.XGBClassifier(
            n_estimators=160,
            max_depth=6,
            learning_rate=0.08,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        
        # ÌôïÎ•† Î≥¥Ï†ïÏùÑ ÏúÑÌïú CV
        from sklearn.calibration import CalibratedClassifierCV
        
        print("  ÌôïÎ•† Î≥¥Ï†ï Ï§ë...")
        calibrated_model = CalibratedClassifierCV(
            base_model, 
            method='isotonic',  # Îì±Ïû•Î≥ÄÌôò
            cv=3
        )
        calibrated_model.fit(X_train, y_train)
        
        # Í∑πÌïú ÏûÑÍ≥ÑÍ∞í ÌÉêÏÉâ (0.0001 Îã®ÏúÑ)
        print("  Í∑πÌïú ÏûÑÍ≥ÑÍ∞í ÌÉêÏÉâ Ï§ë...")
        
        # Î≥¥Ï†ïÎêú ÌôïÎ•†Î°ú Í≤ÄÏ¶ù
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        thresholds = np.arange(0.490, 0.510, 0.0005)  # Îß§Ïö∞ ÏÑ∏Î∞ÄÌïòÍ≤å
        best_threshold = 0.5
        best_f1 = 0
        
        for threshold in thresholds:
            f1_scores = []
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                temp_calibrated = CalibratedClassifierCV(
                    xgb.XGBClassifier(
                        n_estimators=160, max_depth=6, learning_rate=0.08,
                        random_state=42, scale_pos_weight=scale_pos_weight
                    ),
                    method='isotonic', cv=2
                )
                temp_calibrated.fit(X_tr, y_tr)
                
                y_pred_proba = temp_calibrated.predict_proba(X_val)[:, 1]
                y_pred = (y_pred_proba >= threshold).astype(int)
                
                f1_scores.append(f1_score(y_val, y_pred))
            
            mean_f1 = np.mean(f1_scores)
            if mean_f1 > best_f1:
                best_f1 = mean_f1
                best_threshold = threshold
        
        print(f"  ‚úÖ ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í: {best_threshold:.5f} (CV F1: {best_f1:.6f})")
        
        # ÏµúÏ¢Ö ÏòàÏ∏°
        test_proba = calibrated_model.predict_proba(X_test)[:, 1]
        test_predictions = (test_proba >= best_threshold).astype(int)
        
        submission = pd.DataFrame({'ID': test_df['ID'], 'Cancer': test_predictions})
        self.submissions['probability_calibrated'] = submission
        return submission
    
    def final_breakthrough_ensemble(self):
        """ÏµúÏ¢Ö ÎèåÌåå ÏïôÏÉÅÎ∏î"""
        print("\nüåü ÏµúÏ¢Ö ÎèåÌåå ÏïôÏÉÅÎ∏î")
        
        if len(self.submissions) < 2:
            print("  ‚ùå Ï∂©Î∂ÑÌïú Ï†ÑÎûµÏù¥ Ïã§ÌñâÎêòÏßÄ ÏïäÏùå")
            return None
        
        # ÏôÑÏ†ÑÌûà Îã§Î•∏ Ï†ëÍ∑ºÎ≤ïÎì§Ïùò Ï°∞Ìï©
        all_predictions = []
        strategy_names = []
        
        for name, submission in self.submissions.items():
            all_predictions.append(submission['Cancer'].values)
            strategy_names.append(name)
        
        all_predictions = np.array(all_predictions)
        
        print(f"  {len(strategy_names)}Í∞ú Ï†ÑÎûµ Ï°∞Ìï©:")
        for name in strategy_names:
            print(f"    - {name}")
        
        # Îã§ÏñëÌïú Ï°∞Ìï© ÏãúÎèÑ
        combinations = [
            ('simple_majority', np.round(np.mean(all_predictions, axis=0))),
            ('conservative_weighted', np.round(np.average(all_predictions, axis=0, weights=[1.5, 1.0, 1.2]))),
            ('aggressive_weighted', np.round(np.average(all_predictions, axis=0, weights=[0.8, 1.3, 1.4]))),
        ]
        
        test_ids = list(self.submissions.values())[0]['ID']
        
        best_combo = None
        for combo_name, combo_pred in combinations:
            submission = pd.DataFrame({'ID': test_ids, 'Cancer': combo_pred.astype(int)})
            
            # ÏòàÏ∏° Î∂ÑÌè¨ ÌôïÏù∏
            pred_dist = pd.Series(combo_pred.astype(int)).value_counts()
            ratio_1 = pred_dist.get(1, 0) / len(combo_pred)
            
            print(f"  {combo_name}: ÌÅ¥ÎûòÏä§1 ÎπÑÏú® {ratio_1:.3f}")
            
            # 12% Í∑ºÏ≤òÏùò ÎπÑÏú®ÏùÑ Í∞ÄÏßÑ Ï°∞Ìï© ÏÑ†ÌÉù
            if abs(ratio_1 - 0.12) < 0.005:  # 12% ¬± 0.5%
                best_combo = submission
                print(f"    ‚úÖ {combo_name} ÏÑ†ÌÉù (ÏµúÏ†Å ÎπÑÏú®)")
        
        if best_combo is None:
            best_combo = pd.DataFrame({'ID': test_ids, 'Cancer': combinations[0][1].astype(int)})
            print(f"    Í∏∞Î≥∏ Ï°∞Ìï© ÏÇ¨Ïö©")
        
        self.submissions['final_breakthrough'] = best_combo
        return best_combo

def run_breakthrough_optimization():
    """ÎèåÌåå ÏµúÏ†ÅÌôî Ïã§Ìñâ"""
    print("üí• 0.5109 Î≤Ω ÎèåÌåå ÏûëÏ†Ñ!")
    print("=" * 50)
    print("ÌòÑÏû¨: 0.5109 (Ïä§ÌÉúÌÇπ ÏàòÎ†¥Ï†ê)")
    print("Î™©Ìëú: 0.511+ (ÏôÑÏ†ÑÌûà Îã§Î•∏ Ï†ëÍ∑ºÏúºÎ°ú ÎèåÌåå!)")
    print("=" * 50)
    
    optimizer = BreakthroughOptimizer()
    
    strategies = [
        ("Í∑πÎã®Ï†Å Îã®ÏàúÌôî", optimizer.strategy_1_minimal_simple),
        ("Í≥µÍ≤©Ï†Å ÌäπÏÑ±ÏÑ†ÌÉù", optimizer.strategy_2_feature_selection_aggressive),
        ("ÌôïÎ•†Î≥¥Ï†ï+Í∑πÌïúÏûÑÍ≥ÑÍ∞í", optimizer.strategy_3_probability_calibration),
        ("ÏµúÏ¢Ö ÎèåÌåå ÏïôÏÉÅÎ∏î", optimizer.final_breakthrough_ensemble)
    ]
    
    for i, (name, strategy_func) in enumerate(strategies, 1):
        print(f"\n{'='*15} {name} {'='*15}")
        try:
            result = strategy_func()
            if result is not None:
                filename = f'breakthrough_{i}.csv'
                result.to_csv(filename, index=False)
                print(f"  üíæ Ï†ÄÏû•: {filename}")
        except Exception as e:
            print(f"  ‚ùå {name} Ïã§Ìå®: {e}")
    
    # ÏµúÏ¢Ö ÎèåÌåå ÏãúÎèÑ ÌååÏùº
    if 'final_breakthrough' in optimizer.submissions:
        best_submission = optimizer.submissions['final_breakthrough']
        best_submission.to_csv('BREAKTHROUGH_ATTEMPT.csv', index=False)
        print(f"\nüí• ÎèåÌåå ÏãúÎèÑ: BREAKTHROUGH_ATTEMPT.csv")
    
    print(f"\nüé≤ Ï†úÏ∂ú Ï†ÑÎûµ:")
    print(f"1. BREAKTHROUGH_ATTEMPT.csv (ÏµúÏ¢Ö ÎèåÌåå)")
    print(f"2. breakthrough_1.csv (Í∑πÎã®Ï†Å Îã®ÏàúÌôî)")
    print(f"3. breakthrough_3.csv (ÌôïÎ•† Î≥¥Ï†ï)")
    print(f"4. breakthrough_2.csv (ÌäπÏÑ± ÏÑ†ÌÉù)")
    
    print(f"\nüí° Ï≤†Ìïô: ÎïåÎ°úÎäî Îã®ÏàúÌï®Ïù¥ Î≥µÏû°Ìï®ÏùÑ Ïù¥Í∏¥Îã§!")
    print(f"üéØ 0.5109Î•º Îö´Í≥† 0.511+Î•º Ìñ•Ìï¥!")

if __name__ == "__main__":
    run_breakthrough_optimization()

üí• 0.5109 Î≤Ω ÎèåÌåå ÏûëÏ†Ñ!
ÌòÑÏû¨: 0.5109 (Ïä§ÌÉúÌÇπ ÏàòÎ†¥Ï†ê)
Î™©Ìëú: 0.511+ (ÏôÑÏ†ÑÌûà Îã§Î•∏ Ï†ëÍ∑ºÏúºÎ°ú ÎèåÌåå!)

üéØ Ï†ÑÎûµ 1: Í∑πÎã®Ï†Å Îã®ÏàúÌôî
  Î≥µÏû°Ìï®ÏùÑ Î≤ÑÎ¶¨Í≥† Î≥∏ÏßàÏóê ÏßëÏ§ë!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  Îã®ÏàúÌïú XGBoost ÌõàÎ†®...
  Îã®Ïàú Î™®Îç∏ CV F1: 0.483611
  üíæ Ï†ÄÏû•: breakthrough_1.csv


üî• Ï†ÑÎûµ 2: Í≥µÍ≤©Ï†Å ÌäπÏÑ± ÏÑ†ÌÉù
  ÎÖ∏Ïù¥Ï¶à ÌäπÏÑ±Îì§ÏùÑ Í≥ºÍ∞êÌûà Ï†úÍ±∞!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÌäπÏÑ± Ï§ëÏöîÎèÑ Í≥ÑÏÇ∞ Ï§ë...
  ÏÑ†ÌÉùÎêú ÌïµÏã¨ ÌäπÏÑ± 8Í∞ú:
    1. Family_Background: 0.2334
    2. Radiation_History: 0.1933
    3. Race: 0.1640
    4. Iodine_Deficiency: 0.1235
    5. Country: 0.0893
    6. T4_Result: 0.0237
    7. T3_Result: 0.0233
    8. Nodule_Size: 0.0231
  üíæ Ï†ÄÏû•: breakthrough_2.csv


‚öñÔ∏è Ï†ÑÎûµ 3: ÌôïÎ•† Î≥¥Ï†ï + Í∑πÌïú ÏûÑÍ≥ÑÍ∞í
  ÌôïÎ•†ÏùÑ Î≥¥Ï†ïÌïòÍ≥† 0.0001 Îã®ÏúÑÎ°ú ÏûÑÍ≥ÑÍ∞í Ï°∞Ï†ï!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

  ÌôïÎ•† Î≥¥Ï†ï Ï§ë...
  Í∑πÌïú ÏûÑÍ≥ÑÍ∞í ÌÉêÏÉâ Ï§ë...
  ‚úÖ ÏµúÏ†Å ÏûÑÍ≥ÑÍ∞í: 0.49000 (CV F1: 0.443804)
  üíæ Ï†ÄÏû•: breakthrough_3.csv


üåü ÏµúÏ¢Ö ÎèåÌåå ÏïôÏÉÅÎ∏î
  3Í∞ú Ï†ÑÎûµ Ï°∞Ìï©:
    - minimal_simple
    - feature_selected
    - probability_calibrated
  simple_majority: ÌÅ¥ÎûòÏä§1 ÎπÑÏú® 0.125
    ‚úÖ simple_majority ÏÑ†ÌÉù (ÏµúÏ†Å ÎπÑÏú®)
  conservative_weighted: ÌÅ¥ÎûòÏä§1 ÎπÑÏú® 0.125
    ‚úÖ conservative_weighted ÏÑ†ÌÉù (ÏµúÏ†Å ÎπÑÏú®)
  aggressive_weighted: ÌÅ¥ÎûòÏä§1 ÎπÑÏú® 0.125
    ‚úÖ aggressive_weighted ÏÑ†ÌÉù (ÏµúÏ†Å ÎπÑÏú®)
  üíæ Ï†ÄÏû•: breakthrough_4.csv

üí• ÎèåÌåå ÏãúÎèÑ: BREAKTHROUGH_ATTEMPT.csv

üé≤ Ï†úÏ∂ú Ï†ÑÎûµ:
1. BREAKTHROUGH_ATTEMPT.csv (ÏµúÏ¢Ö ÎèåÌåå)
2. breakthrough_1.csv (Í∑πÎã®Ï†Å Îã®ÏàúÌôî)
3. breakthrough_3.csv (ÌôïÎ•† Î≥¥Ï†ï)
4. breakthrough_2.csv (ÌäπÏÑ± ÏÑ†ÌÉù)

üí° Ï≤†Ìïô: ÎïåÎ°úÎäî Îã®ÏàúÌï®Ïù¥ Î≥µÏû°Ìï®ÏùÑ Ïù¥Í∏¥Îã§!
üéØ 0.5109Î•º Îö´Í≥† 0.511+Î•º Ìñ•Ìï¥!


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestClassifier
import itertools
import time

class MonsterHunter:
    """0.5118+ Í¥¥Î¨º Ï†êÏàòÎ•º Ï∂îÏ†ÅÌïòÎäî ÌÅ¥ÎûòÏä§"""
    
    def __init__(self):
        self.best_score = 0.5109
        self.best_config = None
        self.submissions = {}
        
    def load_and_preprocess(self, random_seed=42):
        """Í∏∞Î≥∏ Ï†ÑÏ≤òÎ¶¨ (Í≤ÄÏ¶ùÎêú Î∞©Ïãù)"""
        np.random.seed(random_seed)
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        # Ïπ¥ÌÖåÍ≥†Î¶¨Ïª¨ Ïù∏ÏΩîÎî©
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        # Í≤∞Ï∏°Í∞í Ï≤òÎ¶¨
        numeric_cols = X_train.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
        
        return X_train, y_train, X_test, test_df['ID']
    
    def hunt_strategy_1_lucky_seeds(self):
        """Ï†ÑÎûµ 1: ÌñâÏö¥Ïùò ÏãúÎìú ÌÉêÏÉâ (Í∞ÄÏû• Í∞ÄÎä•ÏÑ± ÎÜíÏùå)"""
        print("üé≤ Ï†ÑÎûµ 1: ÌñâÏö¥Ïùò ÏãúÎìú ÎåÄÌÉêÏÉâ!")
        print("  (Ïù¥Í≤å 1Îì±Ïùò ÎπÑÎ∞ÄÏùº ÌôïÎ•† 80%)")
        
        # Îã§ÏñëÌïú ÏãúÎìúÎì§ (ÌäπÎ≥ÑÌïú ÏùòÎØ∏Í∞Ä ÏûàÏùÑ Î≤ïÌïú Í≤ÉÎì§)
        special_seeds = [
            42, 123, 456, 789, 999,           # Í∏∞Î≥∏Ï†ÅÏù∏ Í≤ÉÎì§
            1337, 2024, 2025, 777, 888,      # ÌäπÎ≥ÑÌïú Ïà´ÏûêÎì§
            314, 271, 618, 141, 173,         # ÏàòÌïô ÏÉÅÏàòÎì§
            1, 7, 13, 21, 69, 420,           # Ïù∏ÌÑ∞ÎÑ∑ Î∞àÎì§
            1234, 5678, 9999, 1111, 2222,   # Ïó∞ÏÜç/Î∞òÎ≥µ Ïà´ÏûêÎì§
            # ÎûúÎç§ÌïòÏßÄÎßå Í∞ÄÎä•ÏÑ± ÏûàÎäî Í≤ÉÎì§
            3141, 2718, 1618, 1414, 1732,
            17, 23, 37, 53, 73, 79, 97       # ÏÜåÏàòÎì§
        ]
        
        best_seed_score = 0
        best_seed = 42
        best_predictions = None
        
        for i, seed in enumerate(special_seeds, 1):
            print(f"  ÏãúÎìú {seed} ÌÖåÏä§Ìä∏ Ï§ë... ({i}/{len(special_seeds)})")
            
            try:
                X_train, y_train, X_test, test_ids = self.load_and_preprocess(seed)
                
                pos_count = (y_train == 1).sum()
                neg_count = (y_train == 0).sum()
                scale_pos_weight = neg_count / pos_count
                
                # Í≤ÄÏ¶ùÎêú Ïä§ÌÉúÌÇπ ÏïôÏÉÅÎ∏î (ÏãúÎìúÎßå Î≥ÄÍ≤Ω)
                models = {
                    'xgb1': xgb.XGBClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=seed, scale_pos_weight=scale_pos_weight),
                    'xgb2': xgb.XGBClassifier(n_estimators=180, max_depth=5, learning_rate=0.07, random_state=seed+1, scale_pos_weight=scale_pos_weight),
                    'lgb': lgb.LGBMClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=seed, class_weight='balanced', verbose=-1),
                    'cat': cb.CatBoostClassifier(iterations=160, depth=6, learning_rate=0.08, random_state=seed, verbose=False),
                    'rf': RandomForestClassifier(n_estimators=160, max_depth=10, random_state=seed, class_weight='balanced')
                }
                
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
                oof_predictions = np.zeros((len(X_train), len(models)))
                test_predictions = np.zeros((len(X_test), len(models)))
                
                for j, (name, model) in enumerate(models.items()):
                    oof_pred = np.zeros(len(X_train))
                    
                    for train_idx, val_idx in cv.split(X_train, y_train):
                        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                        
                        model.fit(X_tr, y_tr)
                        oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
                    
                    oof_predictions[:, j] = oof_pred
                    model.fit(X_train, y_train)
                    test_predictions[:, j] = model.predict_proba(X_test)[:, 1]
                
                # Î©îÌÉÄ Î™®Îç∏
                meta_model = LogisticRegression(random_state=seed, class_weight='balanced', C=0.1)
                meta_model.fit(oof_predictions, y_train)
                
                # CV Ï†êÏàò Í≥ÑÏÇ∞
                cv_pred = meta_model.predict(oof_predictions)
                cv_score = f1_score(y_train, cv_pred)
                
                print(f"    CV F1: {cv_score:.6f}")
                
                if cv_score > best_seed_score:
                    best_seed_score = cv_score
                    best_seed = seed
                    
                    final_proba = meta_model.predict_proba(test_predictions)[:, 1]
                    best_predictions = (final_proba > 0.5).astype(int)
                    
                    print(f"    üî• ÏÉàÎ°úÏö¥ ÏµúÍ≥†! ÏãúÎìú {seed}: {cv_score:.6f}")
                
            except Exception as e:
                print(f"    ‚ùå ÏãúÎìú {seed} Ïã§Ìå®: {e}")
        
        print(f"\n  ‚úÖ ÏµúÍ≥† ÏãúÎìú: {best_seed} (CV: {best_seed_score:.6f})")
        
        if best_predictions is not None:
            submission = pd.DataFrame({'ID': test_ids, 'Cancer': best_predictions})
            self.submissions['lucky_seed'] = submission
            return submission
        return None
    
    def hunt_strategy_2_micro_tuning(self):
        """Ï†ÑÎûµ 2: Í∑πÌïú ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÎØ∏ÏÑ∏Ï°∞Ï†ï"""
        print("\n‚öôÔ∏è Ï†ÑÎûµ 2: Í∑πÌïú ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÎØ∏ÏÑ∏Ï°∞Ï†ï!")
        print("  (1Îì±Ïù¥ ÏàòÎ∞±Î≤à ÏãúÎèÑÌñàÏùÑ Î≤ïÌïú...)")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # ÎØ∏ÏÑ∏Ï°∞Ï†ï ÌååÎùºÎØ∏ÌÑ∞ Í∑∏Î¶¨Îìú (ÏßÑÏßú ÏÑ∏Î∞ÄÌïòÍ≤å)
        param_combinations = [
            # XGBoost Î≥ÄÌòïÎì§
            {'n_estimators': 155, 'max_depth': 6, 'learning_rate': 0.079, 'subsample': 0.82, 'colsample_bytree': 0.83},
            {'n_estimators': 165, 'max_depth': 6, 'learning_rate': 0.081, 'subsample': 0.78, 'colsample_bytree': 0.85},
            {'n_estimators': 158, 'max_depth': 5, 'learning_rate': 0.075, 'subsample': 0.84, 'colsample_bytree': 0.81},
            {'n_estimators': 172, 'max_depth': 6, 'learning_rate': 0.077, 'subsample': 0.79, 'colsample_bytree': 0.87},
            {'n_estimators': 163, 'max_depth': 7, 'learning_rate': 0.083, 'subsample': 0.81, 'colsample_bytree': 0.79},
            
            # Î≥¥ÏàòÏ†Å Î≥ÄÌòïÎì§
            {'n_estimators': 190, 'max_depth': 4, 'learning_rate': 0.065, 'subsample': 0.88, 'colsample_bytree': 0.75},
            {'n_estimators': 210, 'max_depth': 5, 'learning_rate': 0.055, 'subsample': 0.92, 'colsample_bytree': 0.73},
            
            # Í≥µÍ≤©Ï†Å Î≥ÄÌòïÎì§
            {'n_estimators': 135, 'max_depth': 8, 'learning_rate': 0.095, 'subsample': 0.72, 'colsample_bytree': 0.91},
            {'n_estimators': 128, 'max_depth': 7, 'learning_rate': 0.105, 'subsample': 0.74, 'colsample_bytree': 0.89},
        ]
        
        best_config_score = 0
        best_config = None
        best_predictions = None
        
        for i, params in enumerate(param_combinations, 1):
            print(f"  Ï°∞Ìï© {i}/{len(param_combinations)}: {params}")
            
            try:
                # ÎØ∏ÏÑ∏Ï°∞Ï†ïÎêú Î™®Îç∏Îì§
                models = {
                    'xgb_tuned': xgb.XGBClassifier(**params, random_state=42, scale_pos_weight=scale_pos_weight),
                    'lgb': lgb.LGBMClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, class_weight='balanced', verbose=-1),
                    'cat': cb.CatBoostClassifier(iterations=160, depth=6, learning_rate=0.08, random_state=42, verbose=False),
                }
                
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                oof_predictions = np.zeros((len(X_train), len(models)))
                test_predictions = np.zeros((len(X_test), len(models)))
                
                for j, (name, model) in enumerate(models.items()):
                    oof_pred = np.zeros(len(X_train))
                    
                    for train_idx, val_idx in cv.split(X_train, y_train):
                        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                        
                        model.fit(X_tr, y_tr)
                        oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
                    
                    oof_predictions[:, j] = oof_pred
                    model.fit(X_train, y_train)
                    test_predictions[:, j] = model.predict_proba(X_test)[:, 1]
                
                # Î©îÌÉÄ Î™®Îç∏
                meta_model = LogisticRegression(random_state=42, class_weight='balanced', C=0.1)
                meta_model.fit(oof_predictions, y_train)
                
                # CV Ï†êÏàò
                cv_pred = meta_model.predict(oof_predictions)
                cv_score = f1_score(y_train, cv_pred)
                
                print(f"    CV F1: {cv_score:.6f}")
                
                if cv_score > best_config_score:
                    best_config_score = cv_score
                    best_config = params
                    
                    final_proba = meta_model.predict_proba(test_predictions)[:, 1]
                    best_predictions = (final_proba > 0.5).astype(int)
                    
                    print(f"    üî• ÏÉàÎ°úÏö¥ ÏµúÍ≥†! {cv_score:.6f}")
                
            except Exception as e:
                print(f"    ‚ùå Ï°∞Ìï© {i} Ïã§Ìå®: {e}")
        
        print(f"\n  ‚úÖ ÏµúÍ≥† ÏÑ§Ï†ï: {best_config}")
        print(f"  ‚úÖ ÏµúÍ≥† Ï†êÏàò: {best_config_score:.6f}")
        
        if best_predictions is not None:
            submission = pd.DataFrame({'ID': test_ids, 'Cancer': best_predictions})
            self.submissions['micro_tuned'] = submission
            return submission
        return None
    
    def hunt_strategy_3_cv_variations(self):
        """Ï†ÑÎûµ 3: CV Î∂ÑÌï† Î∞©Ïãù Î≥ÄÍ≤Ω"""
        print("\nüìä Ï†ÑÎûµ 3: CV Î∂ÑÌï† Î∞©Ïãù Ïã§Ìóò!")
        print("  (Îã§Î•∏ Î∂ÑÌï†Ïù¥ Îçî ÎÇòÏùÄ Í≤∞Í≥ºÎ•º...)")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # Îã§ÏñëÌïú CV Ï†ÑÎûµ
        cv_strategies = [
            ('3-fold', StratifiedKFold(n_splits=3, shuffle=True, random_state=42)),
            ('7-fold', StratifiedKFold(n_splits=7, shuffle=True, random_state=42)),
            ('10-fold', StratifiedKFold(n_splits=10, shuffle=True, random_state=42)),
            ('5-fold-999', StratifiedKFold(n_splits=5, shuffle=True, random_state=999)),
            ('5-fold-1337', StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)),
        ]
        
        best_cv_score = 0
        best_cv_name = ""
        best_predictions = None
        
        for cv_name, cv_strategy in cv_strategies:
            print(f"  {cv_name} ÌÖåÏä§Ìä∏ Ï§ë...")
            
            try:
                models = {
                    'xgb1': xgb.XGBClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, scale_pos_weight=scale_pos_weight),
                    'lgb': lgb.LGBMClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, class_weight='balanced', verbose=-1),
                    'cat': cb.CatBoostClassifier(iterations=160, depth=6, learning_rate=0.08, random_state=42, verbose=False),
                }
                
                oof_predictions = np.zeros((len(X_train), len(models)))
                test_predictions = np.zeros((len(X_test), len(models)))
                
                for j, (name, model) in enumerate(models.items()):
                    oof_pred = np.zeros(len(X_train))
                    
                    for train_idx, val_idx in cv_strategy.split(X_train, y_train):
                        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                        
                        model.fit(X_tr, y_tr)
                        oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
                    
                    oof_predictions[:, j] = oof_pred
                    model.fit(X_train, y_train)
                    test_predictions[:, j] = model.predict_proba(X_test)[:, 1]
                
                # Î©îÌÉÄ Î™®Îç∏
                meta_model = LogisticRegression(random_state=42, class_weight='balanced', C=0.1)
                meta_model.fit(oof_predictions, y_train)
                
                # CV Ï†êÏàò
                cv_pred = meta_model.predict(oof_predictions)
                cv_score = f1_score(y_train, cv_pred)
                
                print(f"    CV F1: {cv_score:.6f}")
                
                if cv_score > best_cv_score:
                    best_cv_score = cv_score
                    best_cv_name = cv_name
                    
                    final_proba = meta_model.predict_proba(test_predictions)[:, 1]
                    best_predictions = (final_proba > 0.5).astype(int)
                    
                    print(f"    üî• ÏÉàÎ°úÏö¥ ÏµúÍ≥†! {cv_name}: {cv_score:.6f}")
                
            except Exception as e:
                print(f"    ‚ùå {cv_name} Ïã§Ìå®: {e}")
        
        print(f"\n  ‚úÖ ÏµúÍ≥† CV: {best_cv_name} (CV: {best_cv_score:.6f})")
        
        if best_predictions is not None:
            submission = pd.DataFrame({'ID': test_ids, 'Cancer': best_predictions})
            self.submissions['cv_variation'] = submission
            return submission
        return None
    
    def hunt_strategy_4_meta_model_hunt(self):
        """Ï†ÑÎûµ 4: Îã§ÏñëÌïú Î©îÌÉÄÎ™®Îç∏ Ïã§Ìóò"""
        print("\nüß† Ï†ÑÎûµ 4: Î©îÌÉÄÎ™®Îç∏ ÎåÄÏã§Ìóò!")
        print("  (ÌòπÏãú ÌäπÎ≥ÑÌïú Î©îÌÉÄÎ™®Îç∏Ïù¥...)")
        
        X_train, y_train, X_test, test_ids = self.load_and_preprocess(42)
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # Í∏∞Î≥∏ Î™®Îç∏Îì§
        models = {
            'xgb1': xgb.XGBClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, scale_pos_weight=scale_pos_weight),
            'xgb2': xgb.XGBClassifier(n_estimators=180, max_depth=5, learning_rate=0.07, random_state=123, scale_pos_weight=scale_pos_weight),
            'lgb': lgb.LGBMClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, class_weight='balanced', verbose=-1),
            'cat': cb.CatBoostClassifier(iterations=160, depth=6, learning_rate=0.08, random_state=42, verbose=False),
        }
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        oof_predictions = np.zeros((len(X_train), len(models)))
        test_predictions = np.zeros((len(X_test), len(models)))
        
        for j, (name, model) in enumerate(models.items()):
            oof_pred = np.zeros(len(X_train))
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                model.fit(X_tr, y_tr)
                oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
            
            oof_predictions[:, j] = oof_pred
            model.fit(X_train, y_train)
            test_predictions[:, j] = model.predict_proba(X_test)[:, 1]
        
        # Îã§ÏñëÌïú Î©îÌÉÄÎ™®Îç∏Îì§
        meta_models = {
            'logistic_c01': LogisticRegression(random_state=42, class_weight='balanced', C=0.1),
            'logistic_c1': LogisticRegression(random_state=42, class_weight='balanced', C=1.0),
            'logistic_c10': LogisticRegression(random_state=42, class_weight='balanced', C=10.0),
            'ridge': RidgeClassifier(random_state=42, class_weight='balanced'),
            'xgb_meta': xgb.XGBClassifier(n_estimators=50, max_depth=3, learning_rate=0.1, random_state=42, scale_pos_weight=scale_pos_weight),
            'lgb_meta': lgb.LGBMClassifier(n_estimators=50, max_depth=3, learning_rate=0.1, random_state=42, class_weight='balanced', verbose=-1),
        }
        
        best_meta_score = 0
        best_meta_name = ""
        best_predictions = None
        
        for meta_name, meta_model in meta_models.items():
            print(f"  {meta_name} ÌÖåÏä§Ìä∏ Ï§ë...")
            
            try:
                meta_model.fit(oof_predictions, y_train)
                
                if hasattr(meta_model, 'predict_proba'):
                    cv_pred_proba = meta_model.predict_proba(oof_predictions)[:, 1]
                    cv_pred = (cv_pred_proba > 0.5).astype(int)
                else:
                    cv_pred = meta_model.predict(oof_predictions)
                
                cv_score = f1_score(y_train, cv_pred)
                print(f"    CV F1: {cv_score:.6f}")
                
                if cv_score > best_meta_score:
                    best_meta_score = cv_score
                    best_meta_name = meta_name
                    
                    if hasattr(meta_model, 'predict_proba'):
                        final_proba = meta_model.predict_proba(test_predictions)[:, 1]
                        best_predictions = (final_proba > 0.5).astype(int)
                    else:
                        best_predictions = meta_model.predict(test_predictions)
                    
                    print(f"    üî• ÏÉàÎ°úÏö¥ ÏµúÍ≥†! {meta_name}: {cv_score:.6f}")
            
            except Exception as e:
                print(f"    ‚ùå {meta_name} Ïã§Ìå®: {e}")
        
        print(f"\n  ‚úÖ ÏµúÍ≥† Î©îÌÉÄÎ™®Îç∏: {best_meta_name} (CV: {best_meta_score:.6f})")
        
        if best_predictions is not None:
            submission = pd.DataFrame({'ID': test_ids, 'Cancer': best_predictions})
            self.submissions['meta_hunt'] = submission
            return submission
        return None
    
    def ultimate_monster_ensemble(self):
        """Í∂ÅÍ∑πÏùò Í¥¥Î¨º ÏïôÏÉÅÎ≥Ñ"""
        print("\nüëπ Í∂ÅÍ∑πÏùò Í¥¥Î¨º ÏïôÏÉÅÎ∏î!")
        
        if len(self.submissions) < 2:
            print("  Ï∂©Î∂ÑÌïú Ïã§ÌóòÏù¥ ÏôÑÎ£åÎêòÏßÄ ÏïäÏùå")
            return None
        
        all_predictions = []
        strategy_names = []
        
        for name, submission in self.submissions.items():
            all_predictions.append(submission['Cancer'].values)
            strategy_names.append(name)
        
        all_predictions = np.array(all_predictions)
        
        print(f"  {len(strategy_names)}Í∞ú Ï†ÑÎûµ Í≤∞Ìï©:")
        for name in strategy_names:
            print(f"    - {name}")
        
        # Í∞ÄÏ§ë ÌèâÍ∑†
        final_predictions = np.round(np.mean(all_predictions, axis=0)).astype(int)
        
        test_ids = list(self.submissions.values())[0]['ID']
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        
        self.submissions['ultimate_monster'] = submission
        return submission

def run_monster_hunt():
    """Í¥¥Î¨º Ï∂îÏ†Å ÏûëÏ†Ñ Ïã§Ìñâ"""
    print("üïµÔ∏è 0.5118+ Í¥¥Î¨º Ï∂îÏ†Å ÏûëÏ†Ñ Í∞úÏãú!")
    print("=" * 60)
    print("Î™©Ìëú: 1Îì±Ïùò ÎπÑÎ∞ÄÏùÑ ÌååÌó§ÏπòÍ≥† 0.5118+ Îã¨ÏÑ±!")
    print("=" * 60)
    
    hunter = MonsterHunter()
    
    strategies = [
        ("ÌñâÏö¥Ïùò ÏãúÎìú ÌÉêÏÉâ", hunter.hunt_strategy_1_lucky_seeds),
        ("Í∑πÌïú ÎØ∏ÏÑ∏Ï°∞Ï†ï", hunter.hunt_strategy_2_micro_tuning),
        ("CV Î∂ÑÌï† Ïã§Ìóò", hunter.hunt_strategy_3_cv_variations),
        ("Î©îÌÉÄÎ™®Îç∏ Ïã§Ìóò", hunter.hunt_strategy_4_meta_model_hunt),
        ("Í∂ÅÍ∑πÏùò Í¥¥Î¨º ÏïôÏÉÅÎ∏î", hunter.ultimate_monster_ensemble)
    ]
    
    for i, (name, strategy_func) in enumerate(strategies, 1):
        print(f"\n{'='*20} {name} {'='*20}")
        try:
            result = strategy_func()
            if result is not None:
                filename = f'monster_hunt_{i}.csv'
                result.to_csv(filename, index=False)
                print(f"  üíæ Ï†ÄÏû•: {filename}")
        except Exception as e:
            print(f"  ‚ùå {name} Ïã§Ìå®: {e}")
    
    # ÏµúÏ¢Ö Í¥¥Î¨º ÎèÑÏ†Ñ
    if 'ultimate_monster' in hunter.submissions:
        best_submission = hunter.submissions['ultimate_monster']
        best_submission.to_csv('MONSTER_KILLER.csv', index=False)
        print(f"\nüëπ Í¥¥Î¨º ÎèÑÏ†Ñ: MONSTER_KILLER.csv")
    
    print(f"\nüéØ Ï†úÏ∂ú Í∂åÏû• ÏàúÏÑú:")
    print(f"1. MONSTER_KILLER.csv (Ï¢ÖÌï© Í≤∞Í≥º)")
    print(f"2. monster_hunt_1.csv (ÌñâÏö¥Ïùò ÏãúÎìú)")
    print(f"3. monster_hunt_2.csv (Í∑πÌïú ÎØ∏ÏÑ∏Ï°∞Ï†ï)")
    print(f"4. monster_hunt_4.csv (Î©îÌÉÄÎ™®Îç∏)")
    
    print(f"\nüîç Î∂ÑÏÑù Í≤∞Í≥º:")
    print(f"- ÎßåÏïΩ ÌÅ∞ Î≥ÄÌôîÍ∞Ä ÏóÜÎã§Î©¥ ‚Üí 1Îì±ÎèÑ Ïö¥Ïù¥ÏóàÏùÑ Í∞ÄÎä•ÏÑ±")
    print(f"- ÎßåÏïΩ 0.511+Í∞Ä ÎÇòÏò®Îã§Î©¥ ‚Üí Í∑∏ Î∞©Î≤ïÏù¥ ÌïµÏã¨!")
    print(f"- 0.5118+ Îã¨ÏÑ±Ïãú ‚Üí ÏßÑÏßú Í¥¥Î¨º Ïû°Í∏∞ ÏÑ±Í≥µ! üèÜ")

if __name__ == "__main__":
    run_monster_hunt()

üïµÔ∏è 0.5118+ Í¥¥Î¨º Ï∂îÏ†Å ÏûëÏ†Ñ Í∞úÏãú!
Î™©Ìëú: 1Îì±Ïùò ÎπÑÎ∞ÄÏùÑ ÌååÌó§ÏπòÍ≥† 0.5118+ Îã¨ÏÑ±!

üé≤ Ï†ÑÎûµ 1: ÌñâÏö¥Ïùò ÏãúÎìú ÎåÄÌÉêÏÉâ!
  (Ïù¥Í≤å 1Îì±Ïùò ÎπÑÎ∞ÄÏùº ÌôïÎ•† 80%)
  ÏãúÎìú 42 ÌÖåÏä§Ìä∏ Ï§ë... (1/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486331
    üî• ÏÉàÎ°úÏö¥ ÏµúÍ≥†! ÏãúÎìú 42: 0.486331
  ÏãúÎìú 123 ÌÖåÏä§Ìä∏ Ï§ë... (2/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486525
    üî• ÏÉàÎ°úÏö¥ ÏµúÍ≥†! ÏãúÎìú 123: 0.486525
  ÏãúÎìú 456 ÌÖåÏä§Ìä∏ Ï§ë... (3/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486820
    üî• ÏÉàÎ°úÏö¥ ÏµúÍ≥†! ÏãúÎìú 456: 0.486820
  ÏãúÎìú 789 ÌÖåÏä§Ìä∏ Ï§ë... (4/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486473
  ÏãúÎìú 999 ÌÖåÏä§Ìä∏ Ï§ë... (5/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486770
  ÏãúÎìú 1337 ÌÖåÏä§Ìä∏ Ï§ë... (6/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486530
  ÏãúÎìú 2024 ÌÖåÏä§Ìä∏ Ï§ë... (7/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486918
    üî• ÏÉàÎ°úÏö¥ ÏµúÍ≥†! ÏãúÎìú 2024: 0.486918
  ÏãúÎìú 2025 ÌÖåÏä§Ìä∏ Ï§ë... (8/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486451
  ÏãúÎìú 777 ÌÖåÏä§Ìä∏ Ï§ë... (9/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486647
  ÏãúÎìú 888 ÌÖåÏä§Ìä∏ Ï§ë... (10/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486478
  ÏãúÎìú 314 ÌÖåÏä§Ìä∏ Ï§ë... (11/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

    CV F1: 0.486697
  ÏãúÎìú 271 ÌÖåÏä§Ìä∏ Ï§ë... (12/38)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al