<a href="https://colab.research.google.com/github/Donggeon2960/LGAIMER-PRACTICE/blob/main/Untitled7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# 필요한 패키지 설치 (category_encoders 제외)
!pip install imbalanced-learn lightgbm xgboost

# 주요 라이브러리
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler, LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
import lightgbm as lgb
import xgboost as xgb

# 데이터 로드 및 결측치 높은 열 제거
try:
    train = pd.read_csv('/content/train.csv').drop('ID', axis=1)
    test = pd.read_csv('/content/test.csv').drop('ID', axis=1)
except FileNotFoundError:
    print("CSV 파일을 찾을 수 없습니다. 파일 경로를 확인해주세요.")
    # 파일이 현재 디렉토리에 있다면:
    # train = pd.read_csv('train.csv').drop('ID', axis=1)
    # test = pd.read_csv('test.csv').drop('ID', axis=1)

X_raw = train.drop('임신 성공 여부', axis=1)
y_raw = train['임신 성공 여부']

# 결측치가 많은 열 제거
high_missing_cols = ['난자 해동 경과일', 'PGS 시술 여부', 'PGD 시술 여부',
    '착상 전 유전 검사 사용 여부', '임신 시도 또는 마지막 임신 경과 연수', '배아 해동 경과일']

X_clean = X_raw.drop(columns=[col for col in high_missing_cols if col in X_raw.columns])
test_clean = test.drop(columns=[col for col in high_missing_cols if col in test.columns])

class PreprocessingPipeline:
    def __init__(self):
        self.categorical_cols = []
        self.numerical_cols = []
        self.preprocessors = {}

    def identify_column_types(self, X):
        """컬럼 타입 식별"""
        self.categorical_cols = []
        self.numerical_cols = []

        for col in X.columns:
            if X[col].dtype == 'object':
                self.categorical_cols.append(col)
            elif X[col].nunique() <= 10 and col.endswith(('여부', '원인', '유형', '횟수', '출처', '나이')):
                self.categorical_cols.append(col)
            else:
                self.numerical_cols.append(col)

    def preprocess_data(self, X_train, X_val, y_train, fit=True):
        """데이터 전처리 (LabelEncoder 사용)"""
        X_train = X_train.copy()
        X_val = X_val.copy()

        if fit:
            self.identify_column_types(X_train)

        # 범주형 변수 결측치 처리 (최빈값)
        for col in self.categorical_cols:
            if fit:
                mode = X_train[col].mode()
                mode_val = mode[0] if len(mode) > 0 else 'Unknown'
                self.preprocessors[f'{col}_mode'] = mode_val
            else:
                mode_val = self.preprocessors[f'{col}_mode']

            X_train[col].fillna(mode_val, inplace=True)
            X_val[col].fillna(mode_val, inplace=True)

        # 수치형 변수 결측치 처리 (중앙값)
        for col in self.numerical_cols:
            if fit:
                median_val = X_train[col].median()
                self.preprocessors[f'{col}_median'] = median_val
            else:
                median_val = self.preprocessors[f'{col}_median']

            X_train[col].fillna(median_val, inplace=True)
            X_val[col].fillna(median_val, inplace=True)

        # 피처 엔지니어링
        self.add_features(X_train)
        self.add_features(X_val)

        # 범주형 변수 인코딩 (LabelEncoder 사용)
        for col in self.categorical_cols:
            if fit:
                le = LabelEncoder()
                # 모든 가능한 값들을 포함시키기 위해 train과 val을 합쳐서 fit
                all_values = pd.concat([X_train[col], X_val[col]]).unique()
                le.fit(all_values)
                self.preprocessors[f'{col}_encoder'] = le
            else:
                le = self.preprocessors[f'{col}_encoder']

            # 새로운 값이 있을 경우 처리
            X_train[col] = X_train[col].apply(lambda x: x if x in le.classes_ else le.classes_[0])
            X_val[col] = X_val[col].apply(lambda x: x if x in le.classes_ else le.classes_[0])

            X_train[col] = le.transform(X_train[col])
            X_val[col] = le.transform(X_val[col])

        # 스케일링
        if fit:
            scaler = RobustScaler()
            self.preprocessors['scaler'] = scaler.fit(X_train)

        scaler = self.preprocessors['scaler']
        X_train_scaled = scaler.transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
        X_val = pd.DataFrame(X_val_scaled, columns=X_val.columns, index=X_val.index)

        return X_train, X_val

    def add_features(self, df):
        """피처 엔지니어링"""
        # 배아 생성 효율
        if '총 생성 배아 수' in df.columns and '수집된 신선 난자 수' in df.columns:
            df['배아_생성_효율'] = df['총 생성 배아 수'] / (df['수집된 신선 난자 수'] + 1)

        # 배아 이식 비율
        if '이식된 배아 수' in df.columns and '총 생성 배아 수' in df.columns:
            df['배아_이식_비율'] = df['이식된 배아 수'] / (df['총 생성 배아 수'] + 1)

        # 미세주입 성공률
        if '미세주입에서 생성된 배아 수' in df.columns and '미세주입된 난자 수' in df.columns:
            df['미세주입_성공률'] = df['미세주입에서 생성된 배아 수'] / (df['미세주입된 난자 수'] + 1)

        # 나이 그룹 점수
        if '시술 당시 나이' in df.columns:
            age_map = {'만18-34세': 1, '만35-37세': 2, '만38-39세': 3,
                       '만40-42세': 4, '만43-44세': 5, '만45-50세': 6, '알 수 없음': 0}
            df['나이_그룹_점수'] = df['시술 당시 나이'].map(age_map).fillna(0)

# 모델 정의
models = {
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        num_leaves=63,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        min_child_samples=20,
        random_state=42,
        verbose=-1
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=3,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        eval_metric='logloss'
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=500,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42
    )
}

# 교차 검증 및 모델 평가
print("모델 훈련 및 평가를 시작합니다...")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model_scores = {}

for name, model in models.items():
    print(f"\n{name} 모델 훈련 중...")
    scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_clean, y_raw)):
        print(f"  Fold {fold + 1}/5 처리 중...")

        # 데이터 분할
        X_train, X_val = X_clean.iloc[train_idx], X_clean.iloc[val_idx]
        y_train, y_val = y_raw.iloc[train_idx], y_raw.iloc[val_idx]

        # 전처리 (각 fold마다 새로운 pipeline 인스턴스 생성)
        pipeline = PreprocessingPipeline()
        X_train_proc, X_val_proc = pipeline.preprocess_data(X_train, X_val, y_train, fit=True)

        # SMOTE를 사용한 클래스 균형 맞추기
        smote = SMOTE(random_state=42)
        X_train_bal, y_train_bal = smote.fit_resample(X_train_proc, y_train)

        # 모델 훈련
        model.fit(X_train_bal, y_train_bal)

        # 예측 및 평가
        y_pred = model.predict_proba(X_val_proc)[:, 1]
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)

    model_scores[name] = {
        'mean': np.mean(scores),
        'std': np.std(scores),
        'scores': scores
    }

# 결과 출력
print("\n" + "="*50)
print("모델 성능 (ROC-AUC):")
print("="*50)

for name, results in sorted(model_scores.items(), key=lambda x: x[1]['mean'], reverse=True):
    mean_score = results['mean']
    std_score = results['std']
    print(f"{name}: {mean_score:.4f} (±{std_score:.4f})")

print("\n최고 성능 모델:", max(model_scores.items(), key=lambda x: x[1]['mean'])[0])

모델 훈련 및 평가를 시작합니다...

LightGBM 모델 훈련 중...
  Fold 1/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 2/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 3/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 4/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 5/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 


XGBoost 모델 훈련 중...
  Fold 1/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 2/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 3/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 4/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 5/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 


RandomForest 모델 훈련 중...
  Fold 1/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 2/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 3/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 4/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

  Fold 5/5 처리 중...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 


모델 성능 (ROC-AUC):
LightGBM: 0.7363 (±0.0017)
XGBoost: 0.7355 (±0.0020)
RandomForest: 0.7320 (±0.0019)

최고 성능 모델: LightGBM
