In [None]:
# Pinned package versions for this notebook. The lines are intentionally left
# commented out; uncomment them to install into the current environment.
# NOTE(review): pytorch_tabular and pytorch_lightning are imported further down
# but are not pinned in this list — consider adding them.
# !pip install numpy==1.26.4
# !pip install pandas==2.2.2
# !pip install scikit-learn==1.5.1
# !pip install scipy==1.14.1
# !pip install statsmodels==0.14.2
# !pip install joblib==1.4.2
# !pip install threadpoolctl==3.5.0
# !pip install lightgbm==4.6.0
# !pip install catboost==1.2.3

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
CategoryEmbeddingModelConfig,
FTTransformerConfig,
TabNetModelConfig,
GANDALFConfig,
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.stacking import StackingModelConfig
# from pytorch_tabular.utils import make_mixed_dataset

from sklearn.preprocessing import LabelEncoder, FunctionTransformer, QuantileTransformer, MultiLabelBinarizer

from sklearn.impute import SimpleImputer

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

import random

import preprocessing

from pytorch_lightning.loggers import WandbLogger

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight

from embedding import TabularPipeline

## CategoryEmbedding Model

In [2]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [4]:
import sys

# Put the directory two levels up on the import path so that the shared
# `lgbm_process` helper module can be imported.
sys.path.append("../../")
from lgbm_process import lgbm_process

data_seed = 1  # selects which custom train/test CSV pair is read
seed = 333

train_path = f'../../data/custom_train_{data_seed}.csv'
test_path = f'../../data/custom_test_{data_seed}.csv'

# Read both splits, drop the "ID" column, and run the shared preprocessing.
train, test = lgbm_process(
    pd.read_csv(train_path).drop(columns=["ID"]),
    pd.read_csv(test_path).drop(columns=["ID"]),
)
print(train.shape, test.shape)

(205080, 67) (51271, 66)


In [5]:
# Split the preprocessed columns into two groups:
#   * categorical -> every column stored with object dtype
#   * numeric     -> every remaining column except the target
cat_cols = [
    name
    for name in train.columns
    if pd.api.types.is_object_dtype(train[name])
]
numeric_cols = [
    name
    for name in train.columns
    if name != '임신 성공 여부' and name not in cat_cols
]

print(f'수치형 변수: {len(numeric_cols)}개 \n{numeric_cols}')
print(f'범주형 변수: {len(cat_cols)}개 \n{cat_cols}')
print(train.shape, test.shape)

수치형 변수: 59개 
['임신 시도 또는 마지막 임신 경과 연수', '배란 자극 여부', '배란 유도 유형', '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부', '남성 주 불임 원인', '남성 부 불임 원인', '여성 주 불임 원인', '여성 부 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인', '불명확 불임 원인', '불임 원인 - 난관 질환', '불임 원인 - 남성 요인', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증', '불임 원인 - 정자 농도', '불임 원인 - 정자 운동성', '불임 원인 - 정자 형태', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수', '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수', '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '대리모 여부', 'PGD 시술 여부', 'PGS 시술 여부', '난자 채취 경과일', '난자 혼합 경과일', '배아 이식 경과일', '배아 해동 경과일', '시술_임신', '배아생성이유_기증용', '배아생성이유_난자 저장용', '배아생성이유_배아 저장용', '배아생성이유_현재 시술용']
범주형 변수: 7개 
['시술 시기 코드', '시술 당시 나이', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '시술유형_통합']
(205080, 67) (51271, 66)

## Categorical Embedding Transformer
- embedding_dim = min(50, (num_categories + 1) // 2)
    - 범주가 4개 → 임베딩 dim = 2
    - 범주가 10개 → 임베딩 dim = 5
    - 범주가 200개 → 임베딩 dim = 50 (최대값 제한)
    
## Embedding+LGBM
- nsplit=5 : 0.739812
- nsplit=10 : 

In [None]:
# Cross-validated "embedding + LightGBM" training.
#
# For every stratified fold this cell:
#   1. preprocesses the fold's train/validation split and the test set,
#   2. runs TabularPipeline to obtain the transformed train/valid/test frames,
#   3. fits a LightGBM classifier on the transformed training frame,
#   4. scores the validation part (AUC / accuracy / F1), and
#   5. predicts test-set probabilities.
# After the loop the per-fold test probabilities are averaged.
seed = 333
TARGET_COL = '임신 성공 여부'
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Reload the raw train/test data; preprocessing is redone inside each fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# Per-fold ROC information for each model (not populated by this cell).
roc_metrics = []

# Per-fold validation metrics; their means are printed after the loop.
auc_scores = []
acc_scores = []
f1_scores = []

# Per-fold test-set probabilities, averaged into the final prediction.
test_preds_lgbm = []

# The LightGBM hyperparameters are the same for every fold, so they are
# defined once here instead of being rebuilt on each iteration.
lgbm_params = {
    'n_estimators': 1134,
    'learning_rate': 0.009183378614268902,
    'max_depth': 15,
    'num_leaves': 59,
    'min_child_samples': 56,
    'subsample': 0.5894604069264655,
    'colsample_bytree': 0.6305670256882752,
    'reg_alpha': 7.47936987466662,
    'reg_lambda': 0.0010986427203281623,
}

# StratifiedKFold over the target column
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train[TARGET_COL])):

    # Train/validation split for the current fold
    train_fold = train.iloc[train_idx].copy().reset_index(drop=True)
    val_fold = train.iloc[val_idx].copy().reset_index(drop=True)

    # A second copy of the training fold is preprocessed together with the
    # test set, so the test frame goes through the same preprocessing call.
    # NOTE(review): presumably lgbm_process derives its statistics from the
    # first argument — confirm in lgbm_process.
    train2_fold = train_fold.copy()
    test_fold = test.copy()

    # Preprocessing
    train_fold, val_fold = lgbm_process(train_fold, val_fold, seed=seed)
    train2_fold, test_fold = lgbm_process(train2_fold, test_fold, seed=seed)

    # Prepare data, train the tabular model and extract embeddings via TabularPipeline
    pipeline = TabularPipeline(train_fold, val_fold, test_fold, seed, numeric_cols, cat_cols)
    fold_train_trans, fold_valid_trans, fold_test_trans = pipeline.run_pipeline()

    # Separate features and target for LightGBM
    X_train = fold_train_trans.drop(columns=[TARGET_COL])
    y_train = fold_train_trans[TARGET_COL]
    X_valid = fold_valid_trans.drop(columns=[TARGET_COL])
    y_valid = fold_valid_trans[TARGET_COL]

    # Train and evaluate the LightGBM model
    model_lgb = LGBMClassifier(
        **lgbm_params,
        verbosity=-1,
        n_jobs=-1,
        random_state=seed,
    )

    model_lgb.fit(X_train, y_train)

    valid_preds_proba = model_lgb.predict_proba(X_valid)[:, 1]
    valid_preds_class = model_lgb.predict(X_valid)

    auc_ = roc_auc_score(y_valid, valid_preds_proba)
    acc_ = accuracy_score(y_valid, valid_preds_class)
    f1_  = f1_score(y_valid, valid_preds_class)

    print(f"Seed[{seed:<3}] Fold {fold + 1} | AUC: {auc_:.7f} | Acc: {acc_:.7f} | F1: {f1_:.7f}")

    auc_scores.append(auc_)
    acc_scores.append(acc_)
    f1_scores.append(f1_)

    test_pred = model_lgb.predict_proba(fold_test_trans)[:, 1]
    test_preds_lgbm.append(test_pred)

    # No explicit weight reset is done here: a new TabularPipeline is built at
    # the top of every iteration. The earlier
    # `tabular_model.model.reset_weights()` call referenced a name that is not
    # defined in this notebook and raised NameError on a fresh kernel.


# Summarise the cross-validation metrics collected above
print(
    f"Seed[{seed:<3}] Mean   | AUC: {np.mean(auc_scores):.7f} "
    f"| Acc: {np.mean(acc_scores):.7f} | F1: {np.mean(f1_scores):.7f}"
)

# After k-fold finishes, average the test predictions of all folds
final_test_preds = np.mean(test_preds_lgbm, axis=0)
print("Final test predictions shape:", final_test_preds.shape)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 95 steps due to diverging loss.
Learning rate set to 0.01
Restoring states from the checkpoint path at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_7d6a817d-04c0-4339-bceb-abad4112258c.ckpt
Restored all states from the checkpoint at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_7d6a817d-04c0-4339-bceb-abad4112258c.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Output()

Output()

Seed[333] Fold 1 | AUC: 0.7422031 | Acc: 0.7462454 | F1: 0.2003688


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 98 steps due to diverging loss.
Learning rate set to 0.0005248074602497723
Restoring states from the checkpoint path at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_2c251662-ff64-44d4-82df-889b62fa2266.ckpt
Restored all states from the checkpoint at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_2c251662-ff64-44d4-82df-889b62fa2266.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Output()

Output()

Seed[333] Fold 2 | AUC: 0.7377796 | Acc: 0.7460991 | F1: 0.1991695


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 96 steps due to diverging loss.
Learning rate set to 0.0022908676527677745
Restoring states from the checkpoint path at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_a9253184-5e7c-41a1-a41b-35232d07135b.ckpt
Restored all states from the checkpoint at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_a9253184-5e7c-41a1-a41b-35232d07135b.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Output()

Output()

Seed[333] Fold 3 | AUC: 0.7399399 | Acc: 0.7463673 | F1: 0.2031406


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 97 steps due to diverging loss.
Learning rate set to 0.0009120108393559097
Restoring states from the checkpoint path at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_04dec1cb-4cfa-48a2-9710-859eda24599b.ckpt
Restored all states from the checkpoint at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_04dec1cb-4cfa-48a2-9710-859eda24599b.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Output()

Output()

Seed[333] Fold 4 | AUC: 0.7383046 | Acc: 0.7469768 | F1: 0.2009547


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.00012022644346174131
Restoring states from the checkpoint path at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_36e16f8f-e997-4121-968a-ed570eaa6e73.ckpt
Restored all states from the checkpoint at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_36e16f8f-e997-4121-968a-ed570eaa6e73.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Output()

Output()

Seed[333] Fold 5 | AUC: 0.7387398 | Acc: 0.7451482 | F1: 0.1943738
Final test predictions shape: (51271,)


In [13]:
# Frame that is exported to CSV at the end of the notebook.
# It is built directly from `final_test_preds` so that this cell runs on a
# fresh kernel: the previous version read `tmp_submission`, which is only
# created in a later cell, and aliased it instead of copying it.
tmp_submission_csv = pd.DataFrame({'embed_lgbm_nsplit_5': final_test_preds})

In [14]:
# Single-column frame holding the fold-averaged test predictions; the column
# name carries the data seed so results from different data splits stay distinguishable.
tmp_submission = pd.DataFrame({f'embed_lgbm_nsplit_5_{data_seed}': final_test_preds})
# Bare last expression: display the frame as the cell output.
tmp_submission

Unnamed: 0,embed_lgbm_nsplit_5_1
0,0.229640
1,0.214655
2,0.001916
3,0.194601
4,0.391126
...,...
51266,0.001347
51267,0.251095
51268,0.121988
51269,0.000767


In [15]:
# NOTE: the output shown for this cell was produced with nsplit=5.
import sys
sys.path.append("../../")
from cal_auc import calculate_auc

# Score the prediction frame for the current data seed.
# NOTE(review): presumably calculate_auc compares the predictions with the
# held-out labels of the matching custom test split — confirm in cal_auc.
score = calculate_auc(tmp_submission, seed=data_seed)
print(f'[Seed: {data_seed}]: {score}')

[Seed: 1]: 0.7397517904006945


In [17]:
# Export the collected predictions without the index column, encoded as UTF-8 with a BOM.
# NOTE(review): the file name spells "embeding" with a single "d"; it is left
# unchanged here in case other tooling reads this exact path.
tmp_submission_csv.to_csv('predictions_embeding.csv', index=False, encoding='utf-8-sig')