In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

class FeatureImportanceAnalyzer:
    """특성 중요도 분석 및 선택 기반 최적화"""
    
    def __init__(self):
        self.feature_importances = {}
        self.selected_features = {}
        self.submissions = {}
        
    def load_and_preprocess(self, random_seed=42):
        """기본 전처리"""
        np.random.seed(random_seed)
        
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        feature_cols = [col for col in train_df.columns if col not in ['ID', 'Cancer']]
        
        X_train = train_df[feature_cols].copy()
        y_train = train_df['Cancer'].copy()
        X_test = test_df[feature_cols].copy()
        
        print(f"📊 원본 특성 개수: {len(feature_cols)}")
        print(f"   특성 목록: {feature_cols}")
        
        # 카테고리컬 인코딩
        categorical_cols = X_train.select_dtypes(include=['object']).columns
        label_encoders = {}
        
        for col in categorical_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            label_encoders[col] = le
            
            test_values = X_test[col].astype(str)
            test_encoded = []
            for val in test_values:
                if val in le.classes_:
                    test_encoded.append(le.transform([val])[0])
                else:
                    test_encoded.append(0)
            X_test[col] = test_encoded
        
        # 결측값 처리
        numeric_cols = X_train.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
        
        return X_train, y_train, X_test, test_df['ID'], feature_cols
    
    def analyze_feature_importance(self, X_train, y_train, feature_cols):
        """다양한 방법으로 특성 중요도 분석"""
        print("\n🔍 특성 중요도 분석 중...")
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # 1. XGBoost 중요도
        print("  XGBoost 중요도 계산...")
        xgb_model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        xgb_model.fit(X_train, y_train)
        xgb_importance = xgb_model.feature_importances_
        
        # 2. LightGBM 중요도  
        print("  LightGBM 중요도 계산...")
        lgb_model = lgb.LGBMClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            class_weight='balanced',
            verbose=-1
        )
        lgb_model.fit(X_train, y_train)
        lgb_importance = lgb_model.feature_importances_
        
        # 3. Random Forest 중요도
        print("  Random Forest 중요도 계산...")
        rf_model = RandomForestClassifier(
            n_estimators=100,
            max_depth=8,
            random_state=42,
            class_weight='balanced'
        )
        rf_model.fit(X_train, y_train)
        rf_importance = rf_model.feature_importances_
        
        # 4. 통계적 방법 (F-test)
        print("  F-test 중요도 계산...")
        f_selector = SelectKBest(f_classif, k='all')
        f_selector.fit(X_train, y_train)
        f_scores = f_selector.scores_
        f_importance = f_scores / f_scores.max()  # 정규화
        
        # 5. Mutual Information
        print("  Mutual Information 계산...")
        mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
        mi_importance = mi_scores / mi_scores.max()  # 정규화
        
        # 모든 중요도 저장
        self.feature_importances = {
            'XGBoost': xgb_importance,
            'LightGBM': lgb_importance,
            'RandomForest': rf_importance,
            'F_test': f_importance,
            'MutualInfo': mi_importance
        }
        
        # 중요도 데이터프레임 생성
        importance_df = pd.DataFrame({
            'Feature': feature_cols,
            'XGBoost': xgb_importance,
            'LightGBM': lgb_importance,
            'RandomForest': rf_importance,
            'F_test': f_importance,
            'MutualInfo': mi_importance
        })
        
        # 평균 중요도 계산
        importance_df['Average'] = importance_df[['XGBoost', 'LightGBM', 'RandomForest', 'F_test', 'MutualInfo']].mean(axis=1)
        importance_df = importance_df.sort_values('Average', ascending=False)
        
        print("\n📊 특성 중요도 순위:")
        for i, row in importance_df.iterrows():
            print(f"  {row['Feature']:20s}: {row['Average']:.4f} (XGB: {row['XGBoost']:.3f}, LGB: {row['LightGBM']:.3f}, RF: {row['RandomForest']:.3f})")
        
        return importance_df
    
    def test_feature_combinations(self, X_train, y_train, X_test, test_ids, importance_df):
        """다양한 특성 조합 테스트"""
        print("\n🧪 다양한 특성 조합 테스트...")
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # 테스트할 특성 개수들
        feature_counts = [5, 7, 8, 10, 12]
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        best_score = 0
        best_features = None
        best_count = 0
        best_predictions = None
        
        for k in feature_counts:
            print(f"\n  상위 {k}개 특성으로 테스트...")
            
            # 상위 k개 특성 선택
            top_features = importance_df.head(k)['Feature'].tolist()
            print(f"    선택된 특성: {top_features}")
            
            X_train_selected = X_train[top_features]
            X_test_selected = X_test[top_features]
            
            # XGBoost로 성능 테스트
            model = xgb.XGBClassifier(
                n_estimators=150,
                max_depth=6,
                learning_rate=0.08,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                scale_pos_weight=scale_pos_weight,
                reg_alpha=0.1,
                reg_lambda=0.1
            )
            
            # Cross Validation
            cv_scores = cross_val_score(model, X_train_selected, y_train, cv=cv, scoring='f1')
            cv_mean = cv_scores.mean()
            cv_std = cv_scores.std()
            
            print(f"    CV F1: {cv_mean:.6f} ± {cv_std:.6f}")
            
            if cv_mean > best_score:
                best_score = cv_mean
                best_features = top_features
                best_count = k
                
                # 최고 성능 모델로 예측
                model.fit(X_train_selected, y_train)
                best_predictions = model.predict(X_test_selected)
        
        print(f"\n✅ 최고 성능: {best_count}개 특성 (CV F1: {best_score:.6f})")
        print(f"   선택된 특성: {best_features}")
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': best_predictions})
        self.submissions['feature_selected'] = submission
        self.selected_features['best'] = best_features
        
        return submission, best_features
    
    def ensemble_with_selected_features(self, X_train, y_train, X_test, test_ids, selected_features):
        """선택된 특성으로 앙상블"""
        print(f"\n🤖 선택된 {len(selected_features)}개 특성으로 앙상블...")
        
        X_train_selected = X_train[selected_features]
        X_test_selected = X_test[selected_features]
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # 다양한 모델로 앙상블
        models = {
            'xgb_conservative': xgb.XGBClassifier(n_estimators=180, max_depth=5, learning_rate=0.07, random_state=42, scale_pos_weight=scale_pos_weight),
            'xgb_balanced': xgb.XGBClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=123, scale_pos_weight=scale_pos_weight),
            'lgb': lgb.LGBMClassifier(n_estimators=160, max_depth=6, learning_rate=0.08, random_state=42, class_weight='balanced', verbose=-1),
            'rf': RandomForestClassifier(n_estimators=160, max_depth=8, random_state=42, class_weight='balanced')
        }
        
        # Cross-validation으로 각 모델 성능 확인
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        model_scores = {}
        model_predictions = {}
        
        for name, model in models.items():
            cv_scores = cross_val_score(model, X_train_selected, y_train, cv=cv, scoring='f1')
            model_scores[name] = cv_scores.mean()
            print(f"  {name}: CV F1 = {cv_scores.mean():.6f}")
            
            # 전체 데이터로 학습
            model.fit(X_train_selected, y_train)
            model_predictions[name] = model.predict_proba(X_test_selected)[:, 1]
        
        # 성능 기반 가중 앙상별
        total_score = sum(model_scores.values())
        weights = [score/total_score for score in model_scores.values()]
        
        print(f"  가중치: {dict(zip(model_scores.keys(), weights))}")
        
        # 가중 평균
        ensemble_proba = np.average(list(model_predictions.values()), axis=0, weights=weights)
        ensemble_predictions = (ensemble_proba > 0.5).astype(int)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': ensemble_predictions})
        self.submissions['feature_ensemble'] = submission
        
        return submission
    
    def ultra_simple_model(self, X_train, y_train, X_test, test_ids, selected_features):
        """극단적으로 단순한 모델"""
        print(f"\n🎯 극단적 단순화: 상위 5개 특성만 사용")
        
        # 상위 5개만 선택
        top5_features = selected_features[:5]
        print(f"   사용 특성: {top5_features}")
        
        X_train_simple = X_train[top5_features]
        X_test_simple = X_test[top5_features]
        
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count
        
        # 매우 단순한 XGBoost
        simple_model = xgb.XGBClassifier(
            n_estimators=100,  # 적은 트리
            max_depth=4,       # 얕은 깊이
            learning_rate=0.1, 
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        
        # CV 평가
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(simple_model, X_train_simple, y_train, cv=cv, scoring='f1')
        
        print(f"   CV F1: {cv_scores.mean():.6f} ± {cv_scores.std():.6f}")
        
        # 학습 및 예측
        simple_model.fit(X_train_simple, y_train)
        simple_predictions = simple_model.predict(X_test_simple)
        
        submission = pd.DataFrame({'ID': test_ids, 'Cancer': simple_predictions})
        self.submissions['ultra_simple'] = submission
        
        return submission

def run_feature_selection_optimization():
    """특성 선택 기반 최적화 실행"""
    print("🎯 특성 중요도 기반 단순화 전략!")
    print("복잡한 것보다 단순한 것이 때로는 더 좋습니다!")
    print("=" * 60)
    
    analyzer = FeatureImportanceAnalyzer()
    
    # 1. 데이터 로딩 및 전처리
    X_train, y_train, X_test, test_ids, feature_cols = analyzer.load_and_preprocess()
    
    # 2. 특성 중요도 분석
    importance_df = analyzer.analyze_feature_importance(X_train, y_train, feature_cols)
    
    # 3. 다양한 특성 조합 테스트
    best_submission, best_features = analyzer.test_feature_combinations(
        X_train, y_train, X_test, test_ids, importance_df
    )
    
    # 4. 선택된 특성으로 앙상블
    ensemble_submission = analyzer.ensemble_with_selected_features(
        X_train, y_train, X_test, test_ids, best_features
    )
    
    # 5. 극단적 단순화
    simple_submission = analyzer.ultra_simple_model(
        X_train, y_train, X_test, test_ids, best_features
    )
    
    # 6. 파일 저장
    submissions_to_save = [
        ('feature_selected_best.csv', best_submission, "최적 특성 조합"),
        ('feature_ensemble.csv', ensemble_submission, "선택된 특성 앙상블"),
        ('ultra_simple.csv', simple_submission, "극단적 단순화")
    ]
    
    print(f"\n💾 제출 파일 저장...")
    for filename, submission, description in submissions_to_save:
        submission.to_csv(filename, index=False)
        
        # 예측 분포 확인
        pred_dist = submission['Cancer'].value_counts()
        print(f"  {filename}: {description}")
        print(f"    예측 분포 - 0: {pred_dist.get(0, 0)}, 1: {pred_dist.get(1, 0)}")
    
    print(f"\n🎯 제출 우선순위:")
    print(f"1. feature_ensemble.csv ⭐ (선택된 특성 앙상블)")
    print(f"2. feature_selected_best.csv (최적 특성 조합)")
    print(f"3. ultra_simple.csv (극단적 단순화)")
    
    print(f"\n💡 핵심 아이디어:")
    print(f"- 중요하지 않은 특성들이 노이즈를 만들 수 있음")
    print(f"- 단순한 모델이 일반화 성능이 더 좋을 수 있음")
    print(f"- 과적합을 줄여서 실제 성능 향상 가능")
    
    # 특성 중요도 시각화 (선택사항)
    print(f"\n📊 특성 중요도 분석 완료!")
    print(f"상위 특성들: {best_features}")

if __name__ == "__main__":
    run_feature_selection_optimization()

🎯 특성 중요도 기반 단순화 전략!
복잡한 것보다 단순한 것이 때로는 더 좋습니다!
📊 원본 특성 개수: 14
   특성 목록: ['Age', 'Gender', 'Country', 'Race', 'Family_Background', 'Radiation_History', 'Iodine_Deficiency', 'Smoke', 'Weight_Risk', 'Diabetes', 'Nodule_Size', 'TSH_Result', 'T4_Result', 'T3_Result']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al


🔍 특성 중요도 분석 중...
  XGBoost 중요도 계산...
  LightGBM 중요도 계산...
  Random Forest 중요도 계산...
  F-test 중요도 계산...
  Mutual Information 계산...

📊 특성 중요도 순위:
  T4_Result           : 94.0161 (XGB: 0.015, LGB: 470.000, RF: 0.032)
  Nodule_Size         : 88.0271 (XGB: 0.015, LGB: 440.000, RF: 0.028)
  T3_Result           : 85.6091 (XGB: 0.014, LGB: 428.000, RF: 0.030)
  TSH_Result          : 84.2101 (XGB: 0.014, LGB: 421.000, RF: 0.031)
  Age                 : 70.2214 (XGB: 0.013, LGB: 351.000, RF: 0.023)
  Country             : 63.0974 (XGB: 0.097, LGB: 315.000, RF: 0.130)
  Race                : 40.9458 (XGB: 0.201, LGB: 203.000, RF: 0.315)
  Iodine_Deficiency   : 19.9541 (XGB: 0.109, LGB: 98.000, RF: 0.108)
  Radiation_History   : 17.5171 (XGB: 0.166, LGB: 86.000, RF: 0.098)
  Family_Background   : 13.7938 (XGB: 0.303, LGB: 67.000, RF: 0.193)
  Weight_Risk         : 8.0395 (XGB: 0.014, LGB: 40.000, RF: 0.003)
  Gender              : 7.0771 (XGB: 0.014, LGB: 35.000, RF: 0.003)
  Diabetes            