In [1]:
import warnings

import numpy as np
import pandas as pd

from LG_Aimers_6th.cal_auc import calculate_auc
from LG_Aimers_6th.lgbm_process import lgbm_process

warnings.simplefilter(action='ignore', category=FutureWarning)

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight

# Input CSV locations, given relative to the notebook's working directory.
train_path, test_path = (
    '../data/custom_train_1.csv',
    '../data/custom_test_1.csv',
)

## LGBM

In [2]:
# Run the LightGBM preprocessing once on the full train/test files (ID column
# dropped first) and print the resulting shapes.
train, test = lgbm_process(
    pd.read_csv(train_path).drop(columns=["ID"]),
    pd.read_csv(test_path).drop(columns=["ID"]),
)
print(train.shape, test.shape)

(205080, 67) (51271, 66)


In [3]:
# Cross-validation settings: one StratifiedKFold run per (seed, n_splits) pair.
# Output files are named after seed_list[0], so run with a single seed at a time
# (333, then 777) to produce the files read in the ensemble section.
seed_list, folds = [777], [10]

In [4]:
# Stratified K-fold cross-validation of LightGBM.
# For every (seed, fold) the model is scored on the held-out fold and also
# predicts the test set; the test predictions are collected in test_preds_lgbm.
target_col = '임신 성공 여부'
rule = "-" * 80

# Hyperparameters found with Optuna; the same set is used for every seed and fold.
lgbm_params = {
    'n_estimators': 1134,
    'learning_rate': 0.009183378614268902,
    'max_depth': 15,
    'num_leaves': 59,
    'min_child_samples': 56,
    'subsample': 0.5894604069264655,
    'colsample_bytree': 0.6305670256882752,
    'reg_alpha': 7.47936987466662,
    'reg_lambda': 0.0010986427203281623,
}

# Start again from the raw CSVs; lgbm_process is applied separately inside each fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

total_auc, total_acc, total_f1 = [], [], []  # one entry per (seed, fold)
test_preds_lgbm = []  # one test-set probability vector per (seed, fold)

for seed in seed_list:
    for k in folds:
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        splits = skf.split(train.drop(columns=[target_col]), train[target_col])
        auc_scores, acc_scores, f1_scores = [], [], []

        for fold, (train_idx, valid_idx) in enumerate(splits):
            # Two independent copies of the training rows: one is processed together
            # with the validation fold, the other together with the test set.
            fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_train2 = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
            fold_test = test.copy()

            fold_train, fold_valid = lgbm_process(fold_train, fold_valid, seed=seed)
            fold_train2, fold_test = lgbm_process(fold_train2, fold_test, seed=seed)

            X_train, y_train = fold_train.drop(columns=[target_col]), fold_train[target_col]
            X_valid, y_valid = fold_valid.drop(columns=[target_col]), fold_valid[target_col]

            model_lgb = LGBMClassifier(
                **lgbm_params,
                verbosity=-1,
                n_jobs=-1,
                random_state=seed,
                early_stopping_rounds=100,
            )

            # Early stopping monitors the same validation fold that the metrics
            # below are computed on.
            model_lgb.fit(X_train, y_train, eval_set=(X_valid, y_valid))

            valid_preds_proba = model_lgb.predict_proba(X_valid)[:, 1]
            valid_preds_class = model_lgb.predict(X_valid)  # hard class labels (0 or 1)

            # AUC uses the probabilities; accuracy and F1 use the hard labels.
            auc_ = roc_auc_score(y_valid, valid_preds_proba)
            acc_ = accuracy_score(y_valid, valid_preds_class)
            f1_ = f1_score(y_valid, valid_preds_class)

            print(f"Seed[{seed:<3}] Fold {fold + 1} | AUC: {auc_:.7f} | Acc: {acc_:.7f} | F1: {f1_:.7f}")

            # Record each metric both for this (seed, k) run and for the overall summary.
            for run_scores, all_scores, value in (
                (auc_scores, total_auc, auc_),
                (acc_scores, total_acc, acc_),
                (f1_scores, total_f1, f1_),
            ):
                run_scores.append(value)
                all_scores.append(value)

            test_pred = model_lgb.predict_proba(fold_test)[:, 1]
            test_preds_lgbm.append(test_pred)

        # Mean performance over the folds of this (seed, k) run.
        avg_auc, avg_acc, avg_f1 = (
            np.mean(scores) for scores in (auc_scores, acc_scores, f1_scores)
        )

        print(rule)
        print(f"Seed[{seed:<3}] Average Metrics | AUC: {avg_auc:.7f} | Acc: {avg_acc:.7f} | F1: {avg_f1:.7f}")
        print(rule)

# Mean performance over every seed and fold.
val_auc, val_acc, val_f1 = (
    np.mean(scores) for scores in (total_auc, total_acc, total_f1)
)

print(rule)
print(f"Validation Average Metrics | AUC: {val_auc:.7f} | Acc: {val_acc:.7f} | F1: {val_f1:.7f}")

Seed[777] Fold 1 | AUC: 0.7438852 | Acc: 0.7495611 | F1: 0.2100892
Seed[777] Fold 2 | AUC: 0.7395967 | Acc: 0.7453189 | F1: 0.1918614
Seed[777] Fold 3 | AUC: 0.7436206 | Acc: 0.7472694 | F1: 0.1968077
Seed[777] Fold 4 | AUC: 0.7356095 | Acc: 0.7426370 | F1: 0.1854938
Seed[777] Fold 5 | AUC: 0.7399971 | Acc: 0.7485372 | F1: 0.1988504
Seed[777] Fold 6 | AUC: 0.7402280 | Acc: 0.7443437 | F1: 0.1852370
Seed[777] Fold 7 | AUC: 0.7416847 | Acc: 0.7482933 | F1: 0.2041320
Seed[777] Fold 8 | AUC: 0.7366418 | Acc: 0.7480008 | F1: 0.2078479
Seed[777] Fold 9 | AUC: 0.7410521 | Acc: 0.7440999 | F1: 0.1911221
Seed[777] Fold 10 | AUC: 0.7375110 | Acc: 0.7462454 | F1: 0.2011053
--------------------------------------------------------------------------------
Seed[777] Average Metrics | AUC: 0.7399827 | Acc: 0.7464307 | F1: 0.1972547
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Validation

In [5]:
# Average the test-set probabilities over every collected (seed, fold) model.
# The column is labelled with the first seed only, which matches the
# one-seed-per-run workflow.
lgbm_col = f'lgbm_{seed_list[0]}'
tmp_submission = pd.DataFrame({lgbm_col: np.mean(test_preds_lgbm, axis=0)})
tmp_submission

Unnamed: 0,lgbm_777
0,0.227250
1,0.222004
2,0.001864
3,0.202638
4,0.388557
...,...
51266,0.001403
51267,0.246478
51268,0.112169
51269,0.000793


In [7]:
# Score the averaged LightGBM test probabilities with the project's calculate_auc helper.
# NOTE(review): calculate_auc is defined outside this notebook; what `seed=1` selects is
# not visible here — confirm against LG_Aimers_6th.cal_auc.
score = calculate_auc(tmp_submission, seed=1)
score

0.7397153564923226

## CatBoost

In [17]:
# NOTE(review): this import sits mid-notebook and uses a different module path than the
# LG_Aimers_6th helpers imported in the first cell — confirm the intended location.
from cat_process import cb_all_process

# Run the CatBoost preprocessing once on the full train/test files (ID column
# dropped first) and print the resulting shapes.
train, test = cb_all_process(
    pd.read_csv(train_path).drop(columns=["ID"]),
    pd.read_csv(test_path).drop(columns=["ID"]),
)

print(train.shape, test.shape)

(256351, 65) (90067, 64)


In [18]:
# Stratified K-fold cross-validation of CatBoost.
# For every (seed, fold) the model is scored on the held-out fold and also
# predicts the test set; the test predictions are collected in test_preds_cat.
# Requires CatBoostClassifier and compute_class_weight (imported in the first cell).

# Start again from the raw CSVs; cb_all_process is applied separately inside each fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

total_auc, total_acc, total_f1 = [], [], []  # one entry per (seed, fold)
test_preds_cat = []  # one test-set probability vector per (seed, fold)

is_first = True  # print the feature summary for the very first fold only
for seed in seed_list:
    for k in folds:
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        auc_scores, acc_scores, f1_scores = [], [], []

        for fold, (train_idx, valid_idx) in enumerate(skf.split(train.drop(columns=['임신 성공 여부']), train["임신 성공 여부"])):
            # Two independent copies of the training rows: one is processed together
            # with the validation fold, the other together with the test set.
            fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_train2 = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
            fold_test = test.copy()

            fold_train, fold_valid = cb_all_process(fold_train, fold_valid)
            fold_train2, fold_test = cb_all_process(fold_train2, fold_test)

            X_train = fold_train.drop(columns=['임신 성공 여부'])
            X_valid = fold_valid.drop(columns=['임신 성공 여부'])
            y_train = fold_train['임신 성공 여부']
            y_valid = fold_valid['임신 성공 여부']

            # Columns with object/category dtype are handed to CatBoost as categorical features.
            cat_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

            if is_first:
                print(f'범주형 변수: {len(cat_features)}개 \n {cat_features} \n')

                # The target is already dropped from X_train, so every remaining
                # non-categorical column is numeric.
                num_features = [col for col in X_train.columns if col not in cat_features]
                print(f'수치형 변수: {len(num_features)}개 \n {num_features} \n')
                is_first = False

            # Square root of sklearn's "balanced" class weights, ordered by sorted class
            # label; the square root pulls both weights towards 1.
            weights_sqrt = np.sqrt(compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train))

            # Initialise the CatBoost model.
            model_cat = CatBoostClassifier(iterations=2000, learning_rate=0.05, random_seed=seed,
                                           loss_function='Logloss', eval_metric='Logloss', class_weights=weights_sqrt,
                                           cat_features=cat_features, thread_count=-1)

            # Early stopping monitors the same validation fold that the metrics
            # below are computed on.
            model_cat.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=100, verbose=1000)

            valid_preds_proba = model_cat.predict_proba(X_valid)[:, 1]
            valid_preds_class = model_cat.predict(X_valid)  # hard class labels (0 or 1)

            # AUC uses the probabilities; accuracy and F1 use the hard labels.
            auc_ = roc_auc_score(y_valid, valid_preds_proba)
            acc_ = accuracy_score(y_valid, valid_preds_class)
            f1_ = f1_score(y_valid, valid_preds_class)

            print(f"Seed[{seed:<3}] Fold {fold + 1} | AUC: {auc_:.7f} | Acc: {acc_:.7f} | F1: {f1_:.7f}")

            auc_scores.append(auc_)
            acc_scores.append(acc_)
            f1_scores.append(f1_)

            total_auc.append(auc_)
            total_acc.append(acc_)
            total_f1.append(f1_)

            test_pred = model_cat.predict_proba(fold_test)[:, 1]
            test_preds_cat.append(test_pred)

        # Mean performance over the folds of this (seed, k) run.
        avg_auc = np.mean(auc_scores)
        avg_acc = np.mean(acc_scores)
        avg_f1 = np.mean(f1_scores)

        print("-" * 80)
        print(f"Seed[{seed:<3}] Average Metrics | AUC: {avg_auc:.7f} | Acc: {avg_acc:.7f} | F1: {avg_f1:.7f}")
        print("-" * 80)

# Mean performance over every seed and fold.
val_auc = np.mean(total_auc)
val_acc = np.mean(total_acc)
val_f1 = np.mean(total_f1)

print("-" * 80)
print(f"Validation Average Metrics | AUC: {val_auc:.7f} | Acc: {val_acc:.7f} | F1: {val_f1:.7f}")

범주형 변수: 18개 
 ['시술 시기 코드', '시술 당시 나이', '배란 유도 유형', '배아 생성 주요 이유', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '시술유형_통합'] 

수치형 변수: 46개 
 ['임신 시도 또는 마지막 임신 경과 연수', '배란 자극 여부', '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부', '남성 주 불임 원인', '남성 부 불임 원인', '여성 주 불임 원인', '여성 부 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인', '불명확 불임 원인', '불임 원인 - 난관 질환', '불임 원인 - 남성 요인', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증', '불임 원인 - 정자 농도', '불임 원인 - 정자 운동성', '불임 원인 - 정자 형태', '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수', '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수', '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '대리모 여부', 'PGD 시술 여부', 'PGS 시술 여부', '난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일', '배아 이식 경과일', '배아 해동 경과일', '시술_임신'] 

0:	learn: 0.6708799	test: 0.6709346	best: 0.6709346 (

In [19]:
# Add the CatBoost test-set probabilities, averaged over every collected
# (seed, fold) model, next to the LightGBM column.
cat_col = f'cat_{seed_list[0]}'
tmp_submission[cat_col] = np.mean(test_preds_cat, axis=0)
tmp_submission

Unnamed: 0,lgbm_777,cat_777
0,0.001776,0.002551
1,0.002018,0.013353
2,0.156306,0.233818
3,0.097642,0.166973
4,0.524963,0.630286
...,...,...
90062,0.001469,0.002552
90063,0.294831,0.461483
90064,0.467621,0.545617
90065,0.172226,0.325369


In [26]:
# Save this run's per-model test probabilities; the file is named after the first seed.
proba_file = f'./probability_{seed_list[0]}.csv'
tmp_submission.to_csv(proba_file, index=False)

## Ensemble

In [42]:
# Load the per-seed probability files.
# NOTE(review): presumably produced by running the cells above once with
# seed_list = [333] and once with seed_list = [777] — confirm both files exist.
proba_333, proba_777 = map(pd.read_csv, ('./probability_333.csv', './probability_777.csv'))

print(proba_333.shape, proba_777.shape)

(90067, 2) (90067, 2)


In [43]:
# Put the per-seed probability columns side by side and add their unweighted
# row-wise mean as `row_avg`.
tmp = pd.concat([proba_333, proba_777], axis=1).assign(
    row_avg=lambda frame: frame.mean(axis=1)
)
tmp

Unnamed: 0,lgbm_333,cat_333,lgbm_777,cat_777,row_avg
0,0.001763,0.002647,0.001776,0.002551,0.002184
1,0.002393,0.014727,0.002018,0.013353,0.008123
2,0.153479,0.231817,0.156306,0.233818,0.193855
3,0.096653,0.167206,0.097642,0.166973,0.132118
4,0.525038,0.631011,0.524963,0.630286,0.577825
...,...,...,...,...,...
90062,0.001625,0.002789,0.001469,0.002552,0.002109
90063,0.297328,0.457107,0.294831,0.461483,0.377687
90064,0.468525,0.547831,0.467621,0.545617,0.507399
90065,0.173623,0.340826,0.172226,0.325369,0.253011


In [44]:
# Build the submission frame from the sample file and the ensemble average.
# NOTE(review): sample_path is not defined in any visible cell — define it next to
# train_path / test_path before running.
submit = pd.read_csv(sample_path)
final_pred = tmp['row_avg']
# The assignment aligns on the row index, so `submit` and `tmp` must share the same index.
submit['probability'] = final_pred
submit

Unnamed: 0,ID,probability
0,TEST_00000,0.002184
1,TEST_00001,0.008123
2,TEST_00002,0.193855
3,TEST_00003,0.132118
4,TEST_00004,0.577825
...,...,...
90062,TEST_90062,0.002109
90063,TEST_90063,0.377687
90064,TEST_90064,0.507399
90065,TEST_90065,0.253011


In [45]:
# Write the submission frame (ID + ensembled probability) built in the previous cell.
# The original wrote `tmp`, the intermediate per-model table, which has no ID column.
# utf-8-sig prefixes the file with a byte-order mark.
submit.to_csv('0730_ensemble_3.csv', index=False, encoding='utf-8-sig')