In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer, QuantileTransformer
from sklearn.metrics import f1_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

class FinalBreakthroughOptimizer:
    """마지막 돌파를 위한 데이터 레벨 최적화"""
    
    def __init__(self):
        self.submissions = {}
        self.transformers = {}
        
    def load_and_advanced_preprocess(self, strategy='standard'):
        """고급 전처리 전략들"""
        print(f"🔧 고급 전처리 전략: {strategy}")
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        # 카테고리컬 인코딩
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        # 결측값 처리
        numeric_cols = X_train.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
        
        # 고급 전처리 전략 적용
        if strategy == 'power_transform':
            # Power Transform (Yeo-Johnson)
            print("  Power Transform 적용...")
            pt = PowerTransformer(method='yeo-johnson', standardize=True)
            X_train[numeric_cols] = pt.fit_transform(X_train[numeric_cols])
            X_test[numeric_cols] = pt.transform(X_test[numeric_cols])
            
        elif strategy == 'quantile_transform':
            # Quantile Transform (Uniform distribution)
            print("  Quantile Transform 적용...")
            qt = QuantileTransformer(n_quantiles=1000, output_distribution='uniform', random_state=42)
            X_train[numeric_cols] = qt.fit_transform(X_train[numeric_cols])
            X_test[numeric_cols] = qt.transform(X_test[numeric_cols])
            
        elif strategy == 'robust_scaling':
            # Robust Scaling (이상치에 강함)
            print("  Robust Scaling 적용...")
            from sklearn.preprocessing import RobustScaler
            rs = RobustScaler()
            X_train[numeric_cols] = rs.fit_transform(X_train[numeric_cols])
            X_test[numeric_cols] = rs.transform(X_test[numeric_cols])
            
        elif strategy == 'log_transform':
            # Log Transform
            print("  Log Transform 적용...")
            for col in numeric_cols:
                if X_train[col].min() > 0:  # 양수만 가능
                    X_train[col] = np.log1p(X_train[col])
                    X_test[col] = np.log1p(X_test[col])
            
            # 그 후 StandardScaling
            scaler = StandardScaler()
            X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
            X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
            
        else:  # standard
            # 기본 StandardScaling
            scaler = StandardScaler()
            X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
            X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
        
        return X_train, y_train, X_test, test_df['ID']
    
    def hyperparameter_precision_tuning(self, X_train, y_train, X_test, test_ids):
        """초정밀 하이퍼파라미터 튜닝"""
        print("\n⚙️ 초정밀 하이퍼파라미터 튜닝...")
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # 매우 세밀한 파라미터 그리드
        param_grid = {
            'n_estimators': [140, 150, 160, 170, 180],
            'max_depth': [5, 6, 7],
            'learning_rate': [0.07, 0.075, 0.08, 0.085, 0.09],
            'subsample': [0.75, 0.8, 0.85],
            'colsample_bytree': [0.75, 0.8, 0.85, 0.9]
        }
        
        base_model = xgb.XGBClassifier(
            random_state=42,
            scale_pos_weight=scale_pos_weight,
            reg_alpha=0.1,
            reg_lambda=0.1,
            eval_metric='logloss'
        )
        
        # 3-fold로 빠른 탐색
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        grid_search = GridSearchCV(
            base_model, 
            param_grid,
            cv=cv,
            scoring='f1',
            n_jobs=-1,
            verbose=1
        )
        
        print("  그리드 서치 실행 중... (시간이 걸립니다)")
        grid_search.fit(X_train, y_train)
        
        print(f"  ✅ 최적 파라미터: {grid_search.best_params_}")
        print(f"  ✅ 최적 CV F1: {grid_search.best_score_:.6f}")
        
        # 최적 모델로 예측
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': predictions})
        self.submissions['precision_tuned'] = submission
        
        return submission, grid_search.best_params_
    
    def advanced_stacking_with_best_params(self, X_train, y_train, X_test, test_ids, best_params):
        """최적 파라미터로 고급 스태킹"""
        print("\n🏗️ 최적 파라미터 기반 고급 스태킹...")
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # Level 1: 최적화된 다양한 모델들
        models = {
            'xgb_optimal': xgb.XGBClassifier(**best_params, random_state=42, scale_pos_weight=scale_pos_weight, reg_alpha=0.1, reg_lambda=0.1),
            'xgb_variant1': xgb.XGBClassifier(**best_params, random_state=123, scale_pos_weight=scale_pos_weight, reg_alpha=0.05, reg_lambda=0.15),
            'xgb_variant2': xgb.XGBClassifier(**best_params, random_state=456, scale_pos_weight=scale_pos_weight, reg_alpha=0.15, reg_lambda=0.05),
            
            'lgb_optimal': lgb.LGBMClassifier(
                n_estimators=best_params['n_estimators'],
                max_depth=best_params['max_depth'], 
                learning_rate=best_params['learning_rate'],
                subsample=best_params['subsample'],
                colsample_bytree=best_params['colsample_bytree'],
                random_state=42, class_weight='balanced', verbose=-1
            ),
            
            'cat_optimal': cb.CatBoostClassifier(
                iterations=best_params['n_estimators'],
                depth=best_params['max_depth'],
                learning_rate=best_params['learning_rate'],
                subsample=best_params['subsample'],
                random_state=42, verbose=False
            )
        }
        
        print(f"  Level 1: {len(models)}개 최적화된 모델")
        
        # 7-fold Cross-validation
        cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
        oof_predictions = np.zeros((len(X_train), len(models)))
        test_predictions = np.zeros((len(X_test), len(models)))
        
        for i, (name, model) in enumerate(models.items()):
            print(f"    {name} 처리 중...")
            
            oof_pred = np.zeros(len(X_train))
            
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
                
                model.fit(X_tr, y_tr)
                oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
            
            oof_predictions[:, i] = oof_pred
            
            # 전체 데이터로 재학습
            model.fit(X_train, y_train)
            test_predictions[:, i] = model.predict_proba(X_test)[:, 1]
        
        # Level 2: 정밀 조정된 메타 모델
        meta_models = [
            LogisticRegression(random_state=42, class_weight='balanced', C=0.1),
            LogisticRegression(random_state=42, class_weight='balanced', C=1.0),
            LogisticRegression(random_state=42, class_weight='balanced', C=10.0)
        ]
        
        best_meta_score = 0
        best_meta_model = None
        
        cv_meta = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        for meta_model in meta_models:
            scores = cross_val_score(meta_model, oof_predictions, y_train, cv=cv_meta, scoring='f1')
            score = scores.mean()
            
            if score > best_meta_score:
                best_meta_score = score
                best_meta_model = meta_model
        
        print(f"  ✅ 최고 메타 모델 CV F1: {best_meta_score:.6f}")
        
        # 최종 예측
        best_meta_model.fit(oof_predictions, y_train)
        final_proba = best_meta_model.predict_proba(test_predictions)[:, 1]
        final_predictions = (final_proba > 0.5).astype(int)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': final_predictions})
        self.submissions['advanced_stacking'] = submission
        
        return submission
    
    def data_leakage_investigation(self):
        """데이터 누수 조사"""
        print("\n🔍 데이터 누수 및 패턴 조사...")
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        # ID 패턴 분석
        print("  ID 패턴 분석...")
        train_ids = train_df['ID'].astype(str)
        test_ids = test_df['ID'].astype(str)
        
        # ID에서 숫자 추출하여 패턴 확인
        train_id_nums = train_ids.str.extract('(\d+)').astype(float)
        test_id_nums = test_ids.str.extract('(\d+)').astype(float)
        
        print(f"    Train ID 범위: {train_id_nums.min().values[0]:.0f} ~ {train_id_nums.max().values[0]:.0f}")
        print(f"    Test ID 범위: {test_id_nums.min().values[0]:.0f} ~ {test_id_nums.max().values[0]:.0f}")
        
        # 특성 분포 비교
        print("\n  Train vs Test 분포 비교...")
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        for col in feature_cols:
            if train_df[col].dtype in ['int64', 'float64']:
                # 수치형 변수
                train_mean = train_df[col].mean()
                test_mean = test_df[col].mean()
                diff_pct = abs(train_mean - test_mean) / train_mean * 100
                
                if diff_pct > 5:  # 5% 이상 차이
                    print(f"    ⚠️  {col}: Train {train_mean:.3f} vs Test {test_mean:.3f} ({diff_pct:.1f}% 차이)")
            else:
                # 카테고리컬 변수
                train_unique = set(train_df[col].unique())
                test_unique = set(test_df[col].unique())
                
                only_in_test = test_unique - train_unique
                if only_in_test:
                    print(f"    ⚠️  {col}: 테스트에만 있는 값 {len(only_in_test)}개")
    
    def ensemble_all_strategies(self):
        """모든 전략의 최종 앙상별"""
        print("\n🎯 모든 전략 최종 앙상블...")
        
        if len(self.submissions) < 2:
            print("  ❌ 충분한 전략이 실행되지 않음")
            return None
        
        # 모든 예측 수집
        all_predictions = []
        strategy_names = []
        
        for name, submission in self.submissions.items():
            all_predictions.append(submission['Cancer'].values)
            strategy_names.append(name)
            print(f"  전략: {name}")
        
        # 단순 다수결
        all_predictions = np.array(all_predictions)
        majority_vote = np.round(np.mean(all_predictions, axis=0)).astype(int)
        
        # 가중 평균 (최신 전략에 높은 가중치)
        weights = np.linspace(0.8, 1.2, len(strategy_names))  # 최신일수록 높은 가중치
        weighted_avg = np.average(all_predictions, axis=0, weights=weights)
        weighted_predictions = np.round(weighted_avg).astype(int)
        
        # 두 가지 앙상블 결과
        test_ids = list(self.submissions.values())[0]['ID']
        
        majority_submission = pd.DataFrame({'ID': test_ids, 'Cancer': majority_vote})
        weighted_submission = pd.DataFrame({'ID': test_ids, 'Cancer': weighted_predictions})
        
        self.submissions['final_majority'] = majority_submission
        self.submissions['final_weighted'] = weighted_submission
        
        return weighted_submission  # 가중 평균을 기본으로 반환

def run_final_breakthrough():
    """마지막 돌파 시도 실행"""
    print("🚀 마지막 돌파 시도: 데이터 레벨 최적화!")
    print("목표: 0.5109 → 0.512+ (1등 탈환!)")
    print("=" * 60)
    
    optimizer = FinalBreakthroughOptimizer()
    
    # 1. 데이터 누수 조사
    optimizer.data_leakage_investigation()
    
    # 2. 다양한 전처리 전략 시도
    strategies = ['power_transform', 'quantile_transform', 'robust_scaling']
    
    for strategy in strategies:
        print(f"\n{'='*20} {strategy} {'='*20}")
        try:
            X_train, y_train, X_test, test_ids = optimizer.load_and_advanced_preprocess(strategy)
            
            # 초정밀 하이퍼파라미터 튜닝
            tuned_submission, best_params = optimizer.hyperparameter_precision_tuning(
                X_train, y_train, X_test, test_ids
            )
            
            # 최적 파라미터로 고급 스태킹
            stacking_submission = optimizer.advanced_stacking_with_best_params(
                X_train, y_train, X_test, test_ids, best_params
            )
            
            # 파일 저장
            tuned_submission.to_csv(f'breakthrough_tuned_{strategy}.csv', index=False)
            stacking_submission.to_csv(f'breakthrough_stacking_{strategy}.csv', index=False)
            
        except Exception as e:
            print(f"  ❌ {strategy} 실패: {e}")
    
    # 3. 최종 앙상별
    final_submission = optimizer.ensemble_all_strategies()
    if final_submission is not None:
        final_submission.to_csv('FINAL_BREAKTHROUGH.csv', index=False)
    
    print(f"\n🎯 생성된 파일들:")
    print(f"1. FINAL_BREAKTHROUGH.csv ⭐ (최종 돌파 시도)")
    print(f"2. breakthrough_stacking_*.csv (각 전처리별 스태킹)")
    print(f"3. breakthrough_tuned_*.csv (각 전처리별 튜닝)")
    
    print(f"\n💡 이번 시도의 핵심:")
    print(f"- 고급 전처리로 데이터 품질 향상")
    print(f"- 초정밀 하이퍼파라미터 튜닝")
    print(f"- 데이터 누수 가능성 조사")
    print(f"- 최적화된 스태킹 앙상별")

if __name__ == "__main__":
    run_final_breakthrough()

  train_id_nums = train_ids.str.extract('(\d+)').astype(float)
  test_id_nums = test_ids.str.extract('(\d+)').astype(float)


🚀 마지막 돌파 시도: 데이터 레벨 최적화!
목표: 0.5109 → 0.512+ (1등 탈환!)

🔍 데이터 누수 및 패턴 조사...
  ID 패턴 분석...
    Train ID 범위: 0 ~ 87158
    Test ID 범위: 0 ~ 46203

  Train vs Test 분포 비교...

🔧 고급 전처리 전략: power_transform
  Power Transform 적용...

⚙️ 초정밀 하이퍼파라미터 튜닝...
  그리드 서치 실행 중... (시간이 걸립니다)
Fitting 3 folds for each of 900 candidates, totalling 2700 fits
  ✅ 최적 파라미터: {'colsample_bytree': 0.8, 'learning_rate': 0.07, 'max_depth': 5, 'n_estimators': 140, 'subsample': 0.85}
  ✅ 최적 CV F1: 0.474495

🏗️ 최적 파라미터 기반 고급 스태킹...
  Level 1: 5개 최적화된 모델
    xgb_optimal 처리 중...
    xgb_variant1 처리 중...
    xgb_variant2 처리 중...
    lgb_optimal 처리 중...
    cat_optimal 처리 중...
  ✅ 최고 메타 모델 CV F1: 0.486957

🔧 고급 전처리 전략: quantile_transform
  Quantile Transform 적용...

⚙️ 초정밀 하이퍼파라미터 튜닝...
  그리드 서치 실행 중... (시간이 걸립니다)
Fitting 3 folds for each of 900 candidates, totalling 2700 fits
  ✅ 최적 파라미터: {'colsample_bytree': 0.8, 'learning_rate': 0.07, 'max_depth': 5, 'n_estimators': 140, 'subsample': 0.85}
  ✅ 최적 CV F1: 0.474495

🏗️ 최적 파라미