In [1]:
import pandas as pd
import numpy as np
import warnings

from LG_Aimers_6th.cal_auc import calculate_auc
from LG_Aimers_6th.lgbm_process import lgbm_process

# Ignore FutureWarning-category warnings from this point on. This statement runs
# before the lightgbm / scikit-learn imports that follow it in this cell.
warnings.simplefilter(action='ignore', category=FutureWarning)

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

## LGBM

In [2]:
# Run configuration used by the rest of the notebook:
#   seed  - selects the custom_train_/custom_test_ CSV pair and is passed as the
#           random seed to the CV splitter, the model and the project helpers.
#   folds - number of StratifiedKFold splits.
seed, folds = 8, 10

In [3]:
# Locations of the seed-specific train / test splits.
train_path = f'../data/custom_train_{seed}.csv'
test_path = f'../data/custom_test_{seed}.csv'

# Load both splits without the ID column.
train = pd.read_csv(train_path).drop(columns=["ID"])
test = pd.read_csv(test_path).drop(columns=["ID"])

# Sanity check of the preprocessing on the full frames. The seed is passed
# explicitly, exactly as the per-fold lgbm_process calls in the training loop do,
# so this check exercises the same configuration as the real pipeline.
train, test = lgbm_process(train, test, seed=seed)
print(train.shape, test.shape)

(205080, 67) (51271, 66)


In [None]:
# Reload the raw frames: the names `train` / `test` were rebound to the output of
# lgbm_process by the previous cell, whereas this loop runs lgbm_process itself
# inside every fold.
# NOTE(review): presumably this is so preprocessing is fitted per fold - confirm.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# Label column, named once instead of repeating the literal at every use.
target_col = '임신 성공 여부'

# Model hyperparameters. The original comment tags them "Optuna" - presumably the
# result of an Optuna search (confirm). They are identical for every fold, so the
# dict is built once here rather than on each loop iteration.
lgbm_params = {
    'n_estimators': 1134,
    'learning_rate': 0.009183378614268902,
    'max_depth': 15,
    'num_leaves': 59,
    'min_child_samples': 56,
    'subsample': 0.5894604069264655,
    'colsample_bytree': 0.6305670256882752,
    'reg_alpha': 7.47936987466662,
    'reg_lambda': 0.0010986427203281623,
}

# total_* receive the same per-fold values as the *_scores lists below.
total_auc, total_acc, total_f1 = [], [], []
# One array of test-set positive-class probabilities per fold.
test_preds_lgbm = []

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
auc_scores, acc_scores, f1_scores = [], [], []

for fold, (train_idx, valid_idx) in enumerate(skf.split(train.drop(columns=[target_col]), train[target_col])):
    # Independent copies, because each lgbm_process call below receives its own frames.
    fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
    fold_train2 = train.iloc[train_idx].copy().reset_index(drop=True)
    fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
    fold_test = test.copy()

    # Preprocess (fold train, validation) and (fold train, test) as two separate pairs.
    fold_train, fold_valid = lgbm_process(fold_train, fold_valid, seed=seed)
    fold_train2, fold_test = lgbm_process(fold_train2, fold_test, seed=seed)

    X_train = fold_train.drop(columns=[target_col])
    X_valid = fold_valid.drop(columns=[target_col])
    y_train = fold_train[target_col]
    y_valid = fold_valid[target_col]

    # The test frame comes out of a different lgbm_process call than X_train.
    # Select the training feature columns by name so that the model sees the same
    # features in the same order; a missing column raises a KeyError here instead
    # of silently producing misaligned predictions.
    X_test = fold_test[X_train.columns]

    model_lgb = LGBMClassifier(
        **lgbm_params,
        verbosity=-1,
        n_jobs=-1,
        random_state=seed,
        early_stopping_rounds=100,
    )

    # eval_set in its documented list-of-(X, y) form; the validation fold is the
    # evaluation data used for early stopping.
    model_lgb.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
    )

    valid_preds_proba = model_lgb.predict_proba(X_valid)[:, 1]
    valid_preds_class = model_lgb.predict(X_valid)  # predicted class labels (0 or 1)

    # Compute AUC, accuracy and F1-score on the validation fold.
    auc_ = roc_auc_score(y_valid, valid_preds_proba)
    acc_ = accuracy_score(y_valid, valid_preds_class)
    f1_ = f1_score(y_valid, valid_preds_class)

    print(f"Seed[{seed:<3}] Fold {fold + 1} | AUC: {auc_:.7f} | Acc: {acc_:.7f} | F1: {f1_:.7f}")

    auc_scores.append(auc_)
    acc_scores.append(acc_)
    f1_scores.append(f1_)

    total_auc.append(auc_)
    total_acc.append(acc_)
    total_f1.append(f1_)

    # Positive-class probabilities on the test set for this fold's model.
    test_pred = model_lgb.predict_proba(X_test)[:, 1]
    test_preds_lgbm.append(test_pred)

# Average performance across folds.
avg_auc = np.mean(auc_scores)
avg_acc = np.mean(acc_scores)
avg_f1 = np.mean(f1_scores)

print("-" * 80)
print(f"Seed[{seed:<3}] Average Metrics | AUC: {avg_auc:.7f} | Acc: {avg_acc:.7f} | F1: {avg_f1:.7f}")

In [5]:
# Stack the per-fold test probability arrays and average them element-wise
# (axis 0 runs over folds), giving one value per test row. The result is stored
# in a single column named after the seed.
tmp_submission = pd.DataFrame(
    {f'lgbm_{seed}': np.asarray(test_preds_lgbm).mean(axis=0)}
)
tmp_submission

Unnamed: 0,lgbm_7
0,0.308669
1,0.197637
2,0.395747
3,0.002093
4,0.001020
...,...
51266,0.463850
51267,0.421454
51268,0.002172
51269,0.579917


In [7]:
# Score the fold-averaged predictions with the project helper calculate_auc.
# NOTE(review): presumably this compares against the labels of the custom test
# split selected by `seed` - confirm in LG_Aimers_6th.cal_auc.
score = calculate_auc(tmp_submission, seed=seed)
score

0.7425017672460732