In [39]:
!pip install numpy==1.26.4
!pip install pandas==2.2.2
!pip install scikit-learn==1.5.1
!pip install scipy==1.14.1
!pip install statsmodels==0.14.2
!pip install joblib==1.4.2
!pip install threadpoolctl==3.5.0
!pip install lightgbm==4.6.0
!pip install catboost==1.2.3



In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
CategoryEmbeddingModelConfig,
FTTransformerConfig,
TabNetModelConfig,
GANDALFConfig,
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.stacking import StackingModelConfig
# from pytorch_tabular.utils import make_mixed_dataset

from sklearn.preprocessing import LabelEncoder, FunctionTransformer, QuantileTransformer, MultiLabelBinarizer

from sklearn.impute import SimpleImputer

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score

import random

import preprocessing

from pytorch_lightning.loggers import WandbLogger

## CategoryEmbedding Model

In [None]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [None]:
from lgbm_process import lgbm_process

# Index of the custom train/test split to load, and the seed reused by the
# trainer configuration further down.
data_seed, seed = 1, 777

train_path = f'../../data/custom_train_{data_seed}.csv'
test_path = f'../../data/custom_test_{data_seed}.csv'

# Read both splits, drop the ID column and run `lgbm_process` on the pair.
train, test = lgbm_process(
    pd.read_csv(train_path).drop(columns=["ID"]),
    pd.read_csv(test_path).drop(columns=["ID"]),
)
print(train.shape, test.shape)

In [None]:
# Split the columns by dtype: pandas `category` columns are treated as
# categorical features, every other column except the target as continuous.
# `isinstance(dtype, pd.CategoricalDtype)` is the supported replacement for the
# deprecated `pd.api.types.is_categorical_dtype`.
cat_cols = [col for col in train.columns if isinstance(train[col].dtype, pd.CategoricalDtype)]
numeric_cols = [col for col in train.columns if col not in cat_cols and col != '임신 성공 여부']

print(f'수치형 변수: {len(numeric_cols)}개 \n{numeric_cols}')
print(f'범주형 변수: {len(cat_cols)}개 \n{cat_cols}')
print(train.shape, test.shape)

In [None]:
# Optimizer settings: constructed with no arguments, i.e. library defaults.
optimizer_config = OptimizerConfig()

# Column roles. Continuous features are passed through without normalization
# and no continuous feature transform is configured.
data_config = DataConfig(
    target=["임신 성공 여부"],
    categorical_cols=cat_cols,
    continuous_cols=numeric_cols,
    normalize_continuous_features=False,
)

# Training loop: LR finder on, early stopping and checkpointing both keyed on
# the validation loss, best checkpoint reloaded after training.
trainer_config = TrainerConfig(
    seed=seed,
    devices=-1,  # -1 means use all available
    batch_size=4096,
    max_epochs=50,
    auto_lr_find=True,
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=3,
    checkpoints="valid_loss",
    load_best=True,
)

# Head: no hidden layers, just the mapping to output_dim. The config object is
# turned into a plain dict because OmegaConf doesn't accept objects.
head_config = vars(
    LinearHeadConfig(
        layers="",
        dropout=0.1,
        initialization="kaiming",
    )
)

# Backbone: three fully connected layers (512-256-16) with LeakyReLU between
# them. No `metrics` argument is passed.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="512-256-16",
    activation="LeakyReLU",
    dropout=0.1,
    initialization="kaiming",
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)

In [None]:
def _roc_auc_scoer(y_true, y_pred):
    """Return the ROC AUC of the positive-class probability column.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Prediction frame returned by ``tabular_model.predict``; only its
            ``'임신 성공 여부_1_probability'`` column is read.

    Returns:
        The score computed by ``sklearn.metrics.roc_auc_score``.
    """
    return roc_auc_score(y_true, y_pred['임신 성공 여부_1_probability'])


# Seed for the fold split, the preprocessing and the dataloader preparation.
seed = 777
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Reload the raw train / test data so each fold is preprocessed from scratch.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# Per-fold ROC AUC values.
roc_metrics = []

for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['임신 성공 여부'])):

    # Train / validation rows of the current fold.
    train_fold = train.iloc[train_idx].copy().reset_index(drop=True)
    val_fold = train.iloc[val_idx].copy().reset_index(drop=True)

    # Preprocess the train / validation pair. The test set is not scored in
    # this loop, so it is not preprocessed here.
    train_fold, val_fold = lgbm_process(train_fold, val_fold, seed=seed)

    # First fold: build the datamodule and the model. Later folds: reuse the
    # datamodule via copy() with the new fold's data.
    if fold == 0:
        datamodule = tabular_model.prepare_dataloader(train=train_fold, validation=val_fold, seed=seed)
        model = tabular_model.prepare_model(datamodule)
    else:
        datamodule = datamodule.copy(train=train_fold, validation=val_fold)

    # Train on this fold.
    tabular_model.train(model, datamodule)

    # Predict on the validation rows and score them.
    pred_df = tabular_model.predict(val_fold)
    fold_roc = _roc_auc_scoer(val_fold["임신 성공 여부"], pred_df)
    roc_metrics.append(fold_roc)

    print(f"Fold {fold+1} ROC AUC: {fold_roc:.8f}")

    # Re-initialise the weights between folds only. After the final fold the
    # trained weights are kept, so code that reads `tabular_model.model` later
    # (e.g. an embedding transformer) sees a fitted model, not a reset one.
    if fold < skf.n_splits - 1:
        tabular_model.model.reset_weights()

# Mean ROC AUC over all folds.
average_roc_auc = np.mean(roc_metrics)
print(f"Average ROC AUC over {skf.n_splits} folds: {average_roc_auc:.8f}")

## Categorical Embedding Transformer
- .fit_transform() 후 결과 .columns 찍어보기

In [None]:
# Build the embedding transformer from `tabular_model` and encode `train` with it.
# NOTE(review): this reads the embedding weights held by `tabular_model.model`;
# make sure the model has not been re-initialised with `reset_weights()` since
# its last training run.
# NOTE(review): `train` was last assigned from a raw CSV reload (no
# `lgbm_process` applied) — confirm it has the columns/dtypes the model was fit on.
transformer = CategoricalEmbeddingTransformer(tabular_model)
train_transform = transformer.fit_transform(train)

## LGBM

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight

# Paths of the training data, the test data and the sample-submission file.
train_path, test_path, sample_path = (
    './data/train.csv',
    './data/test.csv',
    './data/sample_submission.csv',
)

In [41]:
from lgbm_process import lgbm_process

# Read both splits from the configured paths, drop the ID column and run
# `lgbm_process` on the pair, then report the resulting shapes.
train, test = lgbm_process(
    pd.read_csv(train_path).drop(columns=["ID"]),
    pd.read_csv(test_path).drop(columns=["ID"]),
)
print(train.shape, test.shape)

(256351, 67) (90067, 66)


In [14]:
# Seeds to run cross-validation with, and the fold counts to evaluate per seed.
seed_list, folds = [777], [10]  # 333, 777

In [15]:
# Reload the raw data so every fold is preprocessed from the unprocessed frames.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# Fixed LightGBM hyperparameters (labelled "Optuna" in this notebook). They are
# identical for every seed and fold, so they are defined once outside the loops.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0 and
# `subsample_freq` is not set here — confirm whether row subsampling is intended.
lgbm_params = {
    'n_estimators': 1134,
    'learning_rate': 0.009183378614268902,
    'max_depth': 15,
    'num_leaves': 59,
    'min_child_samples': 56,
    'subsample': 0.5894604069264655,
    'colsample_bytree': 0.6305670256882752,
    'reg_alpha': 7.47936987466662,
    'reg_lambda': 0.0010986427203281623,
}

# Metrics over every (seed, fold) pair, and one test-probability vector per fold.
total_auc, total_acc, total_f1 = [], [], []
test_preds_lgbm = []

for seed in seed_list:
    for k in folds:
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        auc_scores, acc_scores, f1_scores = [], [], []

        # The splits are determined by the label vector; the frame is passed
        # as-is (same as the earlier CV loop) instead of a label-free copy.
        for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train["임신 성공 여부"])):
            # Two independent copies of the training rows: one is preprocessed
            # together with the validation rows, the other with the test set.
            fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_train2 = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
            fold_test = test.copy()

            fold_train, fold_valid = lgbm_process(fold_train, fold_valid, seed=seed)
            fold_train2, fold_test = lgbm_process(fold_train2, fold_test, seed=seed)

            X_train = fold_train.drop(columns=['임신 성공 여부'])
            X_valid = fold_valid.drop(columns=['임신 성공 여부'])
            y_train = fold_train['임신 성공 여부']
            y_valid = fold_valid['임신 성공 여부']

            model_lgb = LGBMClassifier(
                **lgbm_params,
                verbosity=-1,
                n_jobs=-1,
                random_state=seed,
            )

            model_lgb.fit(
                X_train, y_train,
            )

            valid_preds_proba = model_lgb.predict_proba(X_valid)[:, 1]
            valid_preds_class = model_lgb.predict(X_valid)  # predicted class labels (0 or 1)

            # AUC uses the probabilities; accuracy and F1 use the predicted labels.
            auc_ = roc_auc_score(y_valid, valid_preds_proba)
            acc_ = accuracy_score(y_valid, valid_preds_class)
            f1_ = f1_score(y_valid, valid_preds_class)

            print(f"Seed[{seed:<3}] Fold {fold + 1} | AUC: {auc_:.7f} | Acc: {acc_:.7f} | F1: {f1_:.7f}")

            auc_scores.append(auc_)
            acc_scores.append(acc_)
            f1_scores.append(f1_)

            total_auc.append(auc_)
            total_acc.append(acc_)
            total_f1.append(f1_)

            # Positive-class probability on the test set from this fold's model.
            test_pred = model_lgb.predict_proba(fold_test)[:, 1]
            test_preds_lgbm.append(test_pred)

        # Mean metrics over the folds of this seed / fold-count setting.
        avg_auc = np.mean(auc_scores)
        avg_acc = np.mean(acc_scores)
        avg_f1 = np.mean(f1_scores)

        print("-" * 80)
        print(f"Seed[{seed:<3}] Average Metrics | AUC: {avg_auc:.7f} | Acc: {avg_acc:.7f} | F1: {avg_f1:.7f}")
        print("-" * 80)

# Mean metrics over every fold of every seed.
val_auc = np.mean(total_auc)
val_acc = np.mean(total_acc)
val_f1 = np.mean(total_f1)

print("-" * 80)
print(f"Validation Average Metrics | AUC: {val_auc:.7f} | Acc: {val_acc:.7f} | F1: {val_f1:.7f}")

Seed[777] Fold 1 | AUC: 0.7377424 | Acc: 0.7441489 | F1: 0.1947207
Seed[777] Fold 2 | AUC: 0.7379507 | Acc: 0.7454652 | F1: 0.1927502
Seed[777] Fold 3 | AUC: 0.7417659 | Acc: 0.7466355 | F1: 0.1968592
Seed[777] Fold 4 | AUC: 0.7420383 | Acc: 0.7474156 | F1: 0.1989360
Seed[777] Fold 5 | AUC: 0.7354276 | Acc: 0.7446460 | F1: 0.2057753
Seed[777] Fold 6 | AUC: 0.7402318 | Acc: 0.7469475 | F1: 0.2010100
Seed[777] Fold 7 | AUC: 0.7432295 | Acc: 0.7486249 | F1: 0.1985075
Seed[777] Fold 8 | AUC: 0.7404047 | Acc: 0.7440999 | F1: 0.1954869
Seed[777] Fold 9 | AUC: 0.7402753 | Acc: 0.7464404 | F1: 0.2006886
Seed[777] Fold 10 | AUC: 0.7453617 | Acc: 0.7486639 | F1: 0.2003227
--------------------------------------------------------------------------------
Seed[777] Average Metrics | AUC: 0.7404428 | Acc: 0.7463088 | F1: 0.1985057
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Validation

In [16]:
# Average the per-fold LGBM test probabilities element-wise into one column,
# named after the first seed in `seed_list`.
tmp_submission = pd.DataFrame(
    {f'lgbm_{seed_list[0]}': np.asarray(test_preds_lgbm).mean(axis=0)}
)
tmp_submission

Unnamed: 0,lgbm_777
0,0.001776
1,0.002018
2,0.156306
3,0.097642
4,0.524963
...,...
90062,0.001469
90063,0.294831
90064,0.467621
90065,0.172226


## Catboost

In [17]:
from cat_process import cb_all_process
# Read both splits, drop the ID column and run `cb_all_process` on the pair.
train, test = cb_all_process(
    pd.read_csv(train_path).drop(columns=["ID"]),
    pd.read_csv(test_path).drop(columns=["ID"]),
)

print(train.shape, test.shape)

(256351, 65) (90067, 64)


In [18]:
# Reload the raw data so every fold is preprocessed from the unprocessed frames.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# Metrics over every (seed, fold) pair, and one test-probability vector per fold.
total_auc, total_acc, total_f1 = [], [], []
test_preds_cat = []

# The feature lists are printed once, on the very first fold.
is_first = True
for seed in seed_list:
    for k in folds:
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        auc_scores, acc_scores, f1_scores = [], [], []

        for fold, (train_idx, valid_idx) in enumerate(skf.split(train.drop(columns=['임신 성공 여부']), train["임신 성공 여부"])):
            # Two independent copies of the training rows: one is preprocessed
            # together with the validation rows, the other with the test set.
            fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_train2 = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
            fold_test = test.copy()

            fold_train, fold_valid = cb_all_process(fold_train, fold_valid)
            fold_train2, fold_test = cb_all_process(fold_train2, fold_test)

            X_train = fold_train.drop(columns=['임신 성공 여부'])
            X_valid = fold_valid.drop(columns=['임신 성공 여부'])
            y_train = fold_train['임신 성공 여부']
            y_valid = fold_valid['임신 성공 여부']

            # Object / category columns are handed to CatBoost as categorical features.
            cat_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

            if is_first:
                print(f'범주형 변수: {len(cat_features)}개 \n {cat_features} \n')

                # X_train no longer contains the target, so only the categorical
                # columns need to be excluded here.
                num_features = [col for col in X_train.columns if col not in cat_features]
                print(f'수치형 변수: {len(num_features)}개 \n {num_features} \n')
                is_first = False

            # Square root of the 'balanced' class weights, in the class order of np.unique.
            weights_sqrt = np.sqrt(compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train))

            # Initialise the CatBoost model.
            model_cat = CatBoostClassifier(iterations=2000, learning_rate=0.05, random_seed=seed,
                                       loss_function='Logloss', eval_metric='Logloss', class_weights=weights_sqrt,
                                       cat_features=cat_features, thread_count=-1)

            # NOTE(review): the validation fold drives early stopping and is also the
            # fold the metrics below are computed on, so the scores may be optimistic.
            model_cat.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=100, verbose=1000)

            valid_preds_proba = model_cat.predict_proba(X_valid)[:, 1]
            valid_preds_class = model_cat.predict(X_valid)  # predicted class labels (0 or 1)

            # AUC uses the probabilities; accuracy and F1 use the predicted labels.
            auc_ = roc_auc_score(y_valid, valid_preds_proba)
            acc_ = accuracy_score(y_valid, valid_preds_class)
            f1_ = f1_score(y_valid, valid_preds_class)

            print(f"Seed[{seed:<3}] Fold {fold + 1} | AUC: {auc_:.7f} | Acc: {acc_:.7f} | F1: {f1_:.7f}")

            auc_scores.append(auc_)
            acc_scores.append(acc_)
            f1_scores.append(f1_)

            total_auc.append(auc_)
            total_acc.append(acc_)
            total_f1.append(f1_)

            # Positive-class probability on the test set from this fold's model.
            test_pred = model_cat.predict_proba(fold_test)[:, 1]
            test_preds_cat.append(test_pred)

        # Mean metrics over the folds of this seed / fold-count setting.
        avg_auc = np.mean(auc_scores)
        avg_acc = np.mean(acc_scores)
        avg_f1 = np.mean(f1_scores)

        print("-" * 80)
        print(f"Seed[{seed:<3}] Average Metrics | AUC: {avg_auc:.7f} | Acc: {avg_acc:.7f} | F1: {avg_f1:.7f}")
        print("-" * 80)

# Mean metrics over every fold of every seed.
val_auc = np.mean(total_auc)
val_acc = np.mean(total_acc)
val_f1 = np.mean(total_f1)

print("-" * 80)
print(f"Validation Average Metrics | AUC: {val_auc:.7f} | Acc: {val_acc:.7f} | F1: {val_f1:.7f}")

범주형 변수: 18개 
 ['시술 시기 코드', '시술 당시 나이', '배란 유도 유형', '배아 생성 주요 이유', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '시술유형_통합'] 

수치형 변수: 46개 
 ['임신 시도 또는 마지막 임신 경과 연수', '배란 자극 여부', '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부', '남성 주 불임 원인', '남성 부 불임 원인', '여성 주 불임 원인', '여성 부 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인', '불명확 불임 원인', '불임 원인 - 난관 질환', '불임 원인 - 남성 요인', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증', '불임 원인 - 정자 농도', '불임 원인 - 정자 운동성', '불임 원인 - 정자 형태', '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수', '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수', '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '대리모 여부', 'PGD 시술 여부', 'PGS 시술 여부', '난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일', '배아 이식 경과일', '배아 해동 경과일', '시술_임신'] 

0:	learn: 0.6708799	test: 0.6709346	best: 0.6709346 (

In [19]:
# Add the element-wise mean of the per-fold CatBoost test probabilities as a
# second column, named after the first seed in `seed_list`.
tmp_submission[f'cat_{seed_list[0]}'] = np.asarray(test_preds_cat).mean(axis=0)
tmp_submission

Unnamed: 0,lgbm_777,cat_777
0,0.001776,0.002551
1,0.002018,0.013353
2,0.156306,0.233818
3,0.097642,0.166973
4,0.524963,0.630286
...,...,...
90062,0.001469,0.002552
90063,0.294831,0.461483
90064,0.467621,0.545617
90065,0.172226,0.325369


In [26]:
# Write the per-model averaged test probabilities to a CSV named after the
# first seed in `seed_list`; the row index is not written.
tmp_submission.to_csv(f'./probability_{seed_list[0]}.csv', index=False)

## Ensemble

In [42]:
# Load the per-seed probability files.
# NOTE(review): with `seed_list = [777]` the cells above only write
# probability_777.csv; probability_333.csv presumably comes from an earlier run
# with seed 333 — confirm it exists before running this cell.
proba_333, proba_777 = (
    pd.read_csv('./probability_333.csv'),
    pd.read_csv('./probability_777.csv'),
)

print(proba_333.shape, proba_777.shape)

(90067, 2) (90067, 2)


In [43]:
# Put the probability columns of both seeds side by side, then append their
# row-wise mean as `row_avg`.
tmp = pd.concat([proba_333, proba_777], axis=1)
tmp = tmp.assign(row_avg=tmp.mean(axis=1))
tmp

Unnamed: 0,lgbm_333,cat_333,lgbm_777,cat_777,row_avg
0,0.001763,0.002647,0.001776,0.002551,0.002184
1,0.002393,0.014727,0.002018,0.013353,0.008123
2,0.153479,0.231817,0.156306,0.233818,0.193855
3,0.096653,0.167206,0.097642,0.166973,0.132118
4,0.525038,0.631011,0.524963,0.630286,0.577825
...,...,...,...,...,...
90062,0.001625,0.002789,0.001469,0.002552,0.002109
90063,0.297328,0.457107,0.294831,0.461483,0.377687
90064,0.468525,0.547831,0.467621,0.545617,0.507399
90065,0.173623,0.340826,0.172226,0.325369,0.253011


In [44]:
# Start from the sample submission and fill its `probability` column with the
# row-wise ensemble average.
final_pred = tmp['row_avg']
submit = pd.read_csv(sample_path)
submit['probability'] = final_pred
submit

Unnamed: 0,ID,probability
0,TEST_00000,0.002184
1,TEST_00001,0.008123
2,TEST_00002,0.193855
3,TEST_00003,0.132118
4,TEST_00004,0.577825
...,...,...
90062,TEST_90062,0.002109
90063,TEST_90063,0.377687
90064,TEST_90064,0.507399
90065,TEST_90065,0.253011


In [45]:
# Save the submission frame (ID + averaged probability) built from the sample
# submission. `tmp` only holds the per-model probability columns and their
# average and has no ID column, so it is not a valid submission file.
submit.to_csv('0730_ensemble_3.csv', index=False, encoding='utf-8-sig')