In [1]:
import pandas as pd
import numpy as np
import os
import random
import time

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from tabm_reference import Model, make_parameter_groups
import rtdl_num_embeddings

from Process_Function import RareCategoryTransformer
import warnings
warnings.filterwarnings(action='ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## 2. Data Load

In [2]:
data_seed = 10

train_path = f'../data/custom_train_{data_seed}.csv'
test_path = f'../data/custom_test_{data_seed}.csv'

# 학습/평가 데이터 로드
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

print(train.shape, test.shape)

(205080, 68) (51271, 67)


In [3]:
from preprocess_DL import all_process

train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

train, test = all_process(train, test)

cat_cols = [col for col in train.columns if pd.api.types.is_categorical_dtype(train[col])]
numeric_cols = [col for col in train.columns if col not in cat_cols and col != '임신 성공 여부']

print(f'수치형 변수: {len(numeric_cols)}개 \n{numeric_cols}')
print(f'범주형 변수: {len(cat_cols)}개 \n{cat_cols}')
print(train.shape, test.shape)

수치형 변수: 57개 
['임신 시도 또는 마지막 임신 경과 연수', '배란 자극 여부', '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부', '남성 주 불임 원인', '남성 부 불임 원인', '여성 주 불임 원인', '여성 부 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인', '불명확 불임 원인', '불임 원인 - 난관 질환', '불임 원인 - 남성 요인', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증', '불임 원인 - 정자 농도', '불임 원인 - 정자 운동성', '불임 원인 - 정자 형태', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수', '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수', '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '대리모 여부', 'PGD 시술 여부', 'PGS 시술 여부', '난자 혼합 경과일', '배아 이식 경과일', '배아 해동 경과일', '시술_임신', '배아생성이유_기증용', '배아생성이유_난자 저장용', '배아생성이유_배아 저장용', '배아생성이유_현재 시술용']
범주형 변수: 8개 
['시술 시기 코드', '시술 당시 나이', '배란 유도 유형', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '시술유형_통합']
(205080, 66) (51271, 65)


In [4]:
def get_feature_info(train, target_col='임신 성공 여부'):
    n_num_features_ = len(numeric_cols)
    cat_cardinalities_ = [train[col].nunique() for col in cat_cols]

    return n_num_features_, cat_cardinalities_

def to_dataloader(df, batch_size=256, is_shuffle=True, is_train=True, target_col='임신 성공 여부'):
    X_num = torch.tensor(df[numeric_cols].values, dtype=torch.float32)
    X_cat = torch.tensor(df[cat_cols].values, dtype=torch.long)

    if is_train:
        y = torch.tensor(df[target_col].values, dtype=torch.float32)
        tensor_dataset = TensorDataset(X_num, X_cat, y)
        data_loader = DataLoader(tensor_dataset, batch_size=batch_size, shuffle=is_shuffle)
    else:
        tensor_dataset = TensorDataset(X_num, X_cat)
        data_loader = DataLoader(tensor_dataset, batch_size=batch_size, shuffle=is_shuffle)

    return data_loader

n_num_features, cat_cardinalities = get_feature_info(train)
print(n_num_features)
print(cat_cardinalities)

train_loader = to_dataloader(train)
test_loader = to_dataloader(test, is_shuffle=False, is_train=False)

57
[7, 7, 2, 3, 4, 5, 7, 9]


In [5]:
class TabMWrapper:
    def __init__(self, model_config, trainer_config):
        self.device = trainer_config.get("device") or torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
        self.lr = trainer_config.get("lr", 0.001)
        self.weight_decay = trainer_config.get("weight_decay", 3e-4)
        self.criterion = trainer_config.get("criterion", F.cross_entropy)
        self.patience = trainer_config.get("patience", 3)

        # 모델 생성
        self.model = Model(**model_config).to(self.device)

        optimizer_type = trainer_config.get("optimizer", "AdamW")
        params = make_parameter_groups(self.model)
        if optimizer_type == "AdamW":
            self.optimizer = torch.optim.AdamW(params, lr=self.lr, weight_decay=self.weight_decay)
        elif optimizer_type == "Adam":
            self.optimizer = torch.optim.Adam(params, lr=self.lr)
        else:
            raise ValueError(f"Unsupported optimizer: {optimizer_type}")

    def fit(self, train_loader, valid_loader, num_epochs=30, verbose=True):
        train_loss_history = []
        val_loss_history = []

        best_val_loss = float("inf")
        best_epoch = 0
        best_auc = 0

        best_model_state = None
        epochs_without_improvement = 0

        for epoch in range(num_epochs):
            self.model.train()
            epoch_loss = 0.0
            for x_num_batch, x_cat_batch, y_batch in train_loader:
                x_num_batch = x_num_batch.to(self.device)
                # x_cat_batch가 없는 경우에도 대응 (None 체크)
                if x_cat_batch is not None:
                    x_cat_batch = x_cat_batch.to(self.device)
                y_batch = y_batch.to(self.device).long()

                self.optimizer.zero_grad()
                # 모델 출력: (batch, k, ?)
                outputs = self.model(x_num=x_num_batch, x_cat=x_cat_batch)
                # 앙상블 멤버의 예측값 평균 후, 마지막 차원 제거 (예: (B, 1) -> (B,))
                ensemble_logits = outputs.mean(dim=1).squeeze(-1)
                loss = self.criterion(ensemble_logits, y_batch)
                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item() * x_num_batch.size(0)

            avg_train_loss = epoch_loss / len(train_loader.dataset)
            train_loss_history.append(avg_train_loss)

            avg_val_loss = self.evaluate(valid_loader)
            val_loss_history.append(avg_val_loss)

            if verbose:
                print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

            # Early stopping 체크 (validation loss 기준)
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                best_model_state = self.model.state_dict()
                epochs_without_improvement = 0
                best_epoch = epoch
                # print("  New best validation loss! Model saved.")
            else:
                epochs_without_improvement += 1
                # print(f"No improvement for {epochs_without_improvement} epoch(s).")

            if epochs_without_improvement >= self.patience:
                break

        # 학습 종료 후, best model의 가중치를 로드
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)
            print(f"Best model weights loaded from epoch {best_epoch+1} with validation loss {best_val_loss:.4f}.")

        return {"train_loss_history": train_loss_history, "val_loss_history": val_loss_history}

    def evaluate(self, data_loader):
        # Validation 단계
        self.model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_num_batch, x_cat_batch, y_batch in data_loader:
                x_num_batch = x_num_batch.to(self.device)
                if x_cat_batch is not None:
                    x_cat_batch = x_cat_batch.to(self.device)
                y_batch = y_batch.to(self.device).long()

                outputs = self.model(x_num=x_num_batch, x_cat=x_cat_batch)
                ensemble_logits = outputs.mean(dim=1).squeeze(-1)
                loss = self.criterion(ensemble_logits, y_batch)
                val_loss += loss.item() * x_num_batch.size(0)

        avg_val_loss = val_loss / len(data_loader.dataset)
        return avg_val_loss

    def predict(self, data_loader):
        self.model.eval()
        preds = []
        with torch.no_grad():
            for batch in data_loader:
                # 배치에 포함된 값의 개수를 확인하여 unpack
                if len(batch) == 3:
                    x_num_batch, x_cat_batch, _ = batch
                elif len(batch) == 2:
                    x_num_batch, x_cat_batch = batch
                else:
                    raise ValueError("Unexpected number of values in batch")

                x_num_batch = x_num_batch.to(self.device)
                if x_cat_batch is not None:
                    x_cat_batch = x_cat_batch.to(self.device)
                outputs = self.model(x_num=x_num_batch, x_cat=x_cat_batch)
                ensemble_logits = outputs.mean(dim=1).squeeze(-1)
                probs = torch.softmax(ensemble_logits, dim=1)

                preds_batch = probs.detach().cpu().numpy()
                preds.extend(preds_batch.tolist())
        return np.array(preds)


In [6]:
def get_model_config(n_num_features, cat_cardinalities, bins, arch_type):
    model_config = {
        'n_num_features': n_num_features,
        'cat_cardinalities': cat_cardinalities,
        'n_classes': 2,
        'backbone': {
            'type': 'MLP',
            'n_blocks': 3 if bins is None else 2,
            'd_block': 512, # 256, 512, 1024
            'dropout': 0.1,
        },
        'bins': bins,
        'num_embeddings': (
            None if bins is None else {
                'type': 'PiecewiseLinearEmbeddings',
                'd_embedding': 16,
                'activation': False,
                'version': 'B',
            }
        ),
        'arch_type': arch_type,
        'k': 32,
        'share_training_batches': True,
    }
    return model_config


In [8]:
seed = 333
all_auc = []
test_preds = []
train_history = []

torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)



data_seeds = [7, 1]
for data_seed in data_seeds:
    train_path = f'../data/custom_train_{data_seed}.csv'
    test_path = f'../data/custom_test_{data_seed}.csv'

    train = pd.read_csv(train_path).drop(columns=['ID'])
    test = pd.read_csv(test_path).drop(columns=['ID'])

    fold_test_preds = []
    auc_scores = []
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['임신 성공 여부'])):
        fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
        fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
        fold_train2 = fold_train.copy()
        fold_test = test.copy()

        fold_train, fold_valid = all_process(fold_train, fold_valid)
        _, fold_test = all_process(fold_train2, fold_test)

        cat_cols = [col for col in fold_train.columns if pd.api.types.is_categorical_dtype(fold_train[col])]
        numeric_cols = [col for col in fold_train.columns if col not in cat_cols and col != '임신 성공 여부']

        # TabM
        # arch_type = 'tabm'
        # bins = None

        # TabM-mini with the piecewise-linear embeddings.
        arch_type = 'tabm-mini'
        bins = rtdl_num_embeddings.compute_bins(torch.tensor(fold_train[numeric_cols].values))
        n_num_features, cat_cardinalities = get_feature_info(fold_train)
        model_config = get_model_config(n_num_features, cat_cardinalities, bins, arch_type)

        trainer_config = {
            'device': None,
            'optimizer': 'AdamW',
            'lr': 2e-3,
            'weight_decay': 3e-4,
            'criterion': F.cross_entropy,
            # 'criterion': F.binary_cross_entropy_with_logits,
            'patience': 3,
        }

        batch_size = 4096
        train_loader = to_dataloader(fold_train, batch_size=batch_size)
        valid_loader = to_dataloader(fold_valid, batch_size=batch_size, is_shuffle=False)
        test_loader = to_dataloader(fold_test, batch_size=batch_size, is_shuffle=False, is_train=False)

        model = TabMWrapper(model_config, trainer_config)
        history = model.fit(train_loader, valid_loader, verbose=False)
        train_history.append(history)

        valid_preds = model.predict(valid_loader)[:, 1]
        fold_auc = roc_auc_score(fold_valid['임신 성공 여부'], valid_preds)
        print(f'Fold {fold + 1} AUC: {fold_auc}')

        auc_scores.append(fold_auc)
        test_pred = model.predict(test_loader)[:, 1]
        fold_test_preds.append(test_pred)

    # test_preds.append(fold_test_preds) # 나중에 sample 데이터 사용할때 사용

    valid_score = np.mean(auc_scores, axis=0)
    print(f'[Data Seed {data_seed}] Valid AUC: {valid_score}')

    from LG_Aimers_6th.cal_auc import calculate_auc
    # mean_test_preds_for_this_seed = np.mean(fold_test_preds, axis=0)
    test_score = calculate_auc(np.mean(fold_test_preds, axis=0), seed=data_seed)
    print(f'[Data Seed {data_seed}] Test AUC: {test_score}')


# 전체 결과에 대한 평균 및 표준편차 출력
total_auc_mean = np.mean(auc_scores)
total_auc_std = np.std(auc_scores)
print('-' * 60)
print(f'Total Average AUC: {total_auc_mean:.6f} (STD: {total_auc_std:.6f})')

Best model weights loaded from epoch 5 with validation loss 0.4896.
Fold 1 AUC: 0.7379780856482877
Best model weights loaded from epoch 7 with validation loss 0.4867.
Fold 2 AUC: 0.7394981986181133
Best model weights loaded from epoch 6 with validation loss 0.4899.
Fold 3 AUC: 0.7365417950753128
Best model weights loaded from epoch 9 with validation loss 0.4893.
Fold 4 AUC: 0.7367039139176061
Best model weights loaded from epoch 8 with validation loss 0.4869.
Fold 5 AUC: 0.7409627223276896
[Data Seed 7] Valid AUC: 0.738336943117402
[Data Seed 7] Test AUC: 0.7423944131894281
Best model weights loaded from epoch 7 with validation loss 0.4875.
Fold 1 AUC: 0.7407844660719476
Best model weights loaded from epoch 6 with validation loss 0.4892.
Fold 2 AUC: 0.737130458282671
Best model weights loaded from epoch 8 with validation loss 0.4887.
Fold 3 AUC: 0.7394164346686344
Best model weights loaded from epoch 7 with validation loss 0.4886.
Fold 4 AUC: 0.7375383807414659
Best model weights loade

In [9]:
# seed = 333
# torch.manual_seed(seed)
# np.random.seed(seed)
# random.seed(seed)
#
# # data_seed 값 목록
# data_seeds = [1, 7]
#
# # 각 data_seed별 AUC를 저장할 딕셔너리
# auc_data = {}
#
# # model_config와 trainer_config는 첫 번째 data_seed에서 저장 (모든 실험에서 동일한 구성이면 이를 사용)
# model_config_saved = None
# trainer_config_saved = None
#
# # data_seed 별 실험 수행
# for data_seed in data_seeds:
#     print(f"\n*** Data seed: {data_seed} ***")
#     train_path = f'../data/custom_train_{data_seed}.csv'
#     test_path = f'../data/custom_test_{data_seed}.csv'
#
#     train = pd.read_csv(train_path).drop(columns=['ID'])
#     test = pd.read_csv(test_path).drop(columns=['ID'])
#
#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
#     auc_scores = []
#     fold_test_preds = []
#
#     for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['임신 성공 여부'])):
#         # 데이터 분할 및 복사
#         fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
#         fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
#         fold_train2 = fold_train.copy()  # 후처리에 사용할 별도 복사본
#         fold_test = test.copy()
#
#         # 데이터 전처리 (all_process 함수 사용)
#         fold_train, fold_valid = all_process(fold_train, fold_valid)
#         fold_train2, fold_test = all_process(fold_train2, fold_test)
#
#         # 카테고리형, 수치형 변수 구분
#         cat_cols = [col for col in fold_train.columns if pd.api.types.is_categorical_dtype(fold_train[col])]
#         numeric_cols = [col for col in fold_train.columns if col not in cat_cols and col != '임신 성공 여부']
#
#         # TabM-mini 모델 설정 (piecewise-linear embeddings 사용)
#         arch_type = 'tabm-mini'
#         bins = rtdl_num_embeddings.compute_bins(torch.tensor(fold_train[numeric_cols].values))
#         n_num_features, cat_cardinalities = get_feature_info(fold_train)
#         model_config = get_model_config(n_num_features, cat_cardinalities, bins, arch_type)
#
#         # trainer 설정
#         trainer_config = {
#             'device': None,
#             'optimizer': 'AdamW',
#             'lr': 2e-3,
#             'weight_decay': 3e-4,
#             'criterion': F.cross_entropy,
#             'patience': 3,
#         }
#
#         # 데이터 로더 생성
#         batch_size = 4096
#         train_loader = to_dataloader(fold_train, batch_size=batch_size)
#         valid_loader = to_dataloader(fold_valid, batch_size=batch_size, is_shuffle=False)
#         test_loader = to_dataloader(fold_test, batch_size=batch_size, is_shuffle=False, is_train=False)
#
#         # 모델 생성 및 학습
#         model = TabMWrapper(model_config, trainer_config)
#         history = model.fit(train_loader, valid_loader, verbose=False)
#
#         # 검증 데이터 예측 및 AUC 계산
#         valid_preds = model.predict(valid_loader)[:, 1]
#         fold_auc = roc_auc_score(fold_valid['임신 성공 여부'], valid_preds)
#         print(f'Data seed {data_seed}, Fold {fold + 1} AUC: {fold_auc:.5f}')
#         auc_scores.append(fold_auc)
#
#         # 테스트 예측 (추후 ensemble 등 활용 가능)
#         test_pred = model.predict(test_loader)[:, 1]
#         fold_test_preds.append(test_pred)
#
#     from LG_Aimers_6th.cal_auc import calculate_auc
#     score = calculate_auc(np.mean(fold_test_preds, axis=0), seed=data_seed)
#
#     # 현재 data_seed의 평균 AUC 계산
#     seed_auc_mean = np.mean(auc_scores)
#     auc_data[data_seed] = seed_auc_mean
#     print(f"Data seed {data_seed} - Average AUC: {seed_auc_mean:.5f}\n")
#
#
# # 최종 평균 AUC (전체 data_seed의 평균)
# avg_auc = np.mean(list(auc_data.values()))
#
# # 결과 DataFrame 구성: model_config, trainer_config와 각 data_seed의 평균 AUC, 그리고 전체 평균
# final_results = pd.DataFrame({
#     'model_config': [model_config_saved],
#     'trainer_config': [trainer_config_saved],
#     'data1': [auc_data.get(1)],
#     'data7': [auc_data.get(7)],
#     'avg': [avg_auc]
# })
#
# # 결과를 CSV 파일로 저장
# final_results.to_csv('final_results.csv', index=False)
# print("최종 결과가 'final_results.csv' 파일로 저장되었습니다.")


*** Data seed: 1 ***


KeyboardInterrupt: 

In [8]:
old_auc = 0.744533 * 100
old_std = 0.001171 * 100

new_auc = total_auc_mean * 100
new_std = total_auc_std * 100

def calculate_change(old_value, new_value):
    change = new_value - old_value
    percentage_change = (change / old_value) * 100 if old_value != 0 else float('inf')
    return change, percentage_change

def format_change(change):
    return f"{change:+.6f}"

# 각 지표의 변화량 계산
auc_change, auc_pct = calculate_change(old_auc, new_auc)
std_change, std_pct = calculate_change(old_std, new_std)

# 결과 출력
print("\n========== 모델 성능 변화 ==========")
print(f"{'Metric':<8}  {'AUC':>12}  {'Acc':>12}")
print("-" * 36)
print(f"{'Old':<8}  {old_auc:>12.6f}  {old_std:>12.6f}")
print(f"{'New':<8}  {new_auc:>12.6f}  {new_std:>12.6f}")
print(f"{'Change':<8}  {format_change(auc_change):>12}  {format_change(std_change):>12}")
print(f"{'% Change':<8}  {auc_pct:>11.4f}%  {std_pct:>11.4f}%")
print("=" * 36)


Metric             AUC           Acc
------------------------------------
Old          74.453300      0.117100
New          73.845036      0.000000
Change       -0.608264     -0.117100
% Change      -0.8170%    -100.0000%


In [9]:
tmp_submission = pd.DataFrame({f'tabm_{data_seed}': np.mean(test_preds, axis=0)})
tmp_submission

Unnamed: 0,tabm_10
0,0.341478
1,0.105108
2,0.000232
3,0.114351
4,0.446093
...,...
51266,0.231313
51267,0.255745
51268,0.141795
51269,0.029850


In [10]:
from LG_Aimers_6th.cal_auc import calculate_auc

score = calculate_auc(tmp_submission, seed=data_seed)
print(f'[seed {data_seed}]: {score}')

[seed 10]: 0.74150757679765
