In [None]:
# conda install -c conda-forge faiss-gpu

# Install inside the conda virtual environment (only needed when running locally)

In [None]:
import sys
import os
from pathlib import Path

# Add the tabular_dl_tabr directory under the current working directory (Eunhak) to the import path
project_path = os.path.join(os.getcwd(), "tabular_dl_tabr")
if project_path not in sys.path:
    sys.path.insert(0, project_path)


# NOTE(review): hard-coded absolute Windows path; it only matches `project_path` above
# when the notebook is started from C:\workspace\LG_Aimers_6th\Eunhak — confirm before
# running on another machine.
project_dir = Path(r"C:\workspace\LG_Aimers_6th\Eunhak\tabular_dl_tabr")
# Exported as an environment variable; presumably read by the tabular_dl_tabr package — TODO confirm.
os.environ['PROJECT_DIR'] = str(project_dir)

# Create the directory if it does not exist yet
if not project_dir.exists():
    project_dir.mkdir(parents=True, exist_ok=True)

# Restrict CUDA to device 0; this is set before torch is imported further down in this cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
import numpy as np
import random
import math
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import delu
import torch
import torch.nn as nn
import torch.optim as optim
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from bin.tabr import Model
from LG_Aimers_6th.cal_auc import calculate_auc

In [None]:
# Which of the pre-generated custom train/test splits to load (the seed is part of the file name).
data_seed = 1

train_path = f'../data/custom_train_{data_seed}.csv'
test_path = f'../data/custom_test_{data_seed}.csv'

# Load the training / evaluation data (the 'ID' column is dropped from both)
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID']) # original note: test has no target column

print(train.shape, test.shape)

In [None]:
from preprocess_DL import all_process

# Re-read the raw CSVs so this cell always starts from unprocessed frames,
# even if `train` / `test` were already overwritten by a previous run of this cell.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# Project preprocessing from preprocess_DL; returns the processed (train, test) pair.
train, test = all_process(train, test)

print(train.shape, test.shape)

In [None]:
def get_cols(df_train, target_col='임신 성공 여부'):
    """Split the feature columns of ``df_train`` into numeric, binary and categorical groups.

    Args:
        df_train: DataFrame holding the features and (optionally) the target column.
        target_col: Name of the target column; it is never returned as a feature.

    Returns:
        A ``(num_cols, bin_cols, cat_cols)`` tuple of column-name lists, each in
        the original column order:
        - ``cat_cols``: columns with pandas ``category`` dtype,
        - ``bin_cols``: non-categorical columns with exactly two distinct non-null values,
        - ``num_cols``: every remaining non-categorical column.
    """
    # The target must be excluded through the parameter rather than a hard-coded
    # name, otherwise a caller passing a different `target_col` leaks the label
    # into the feature lists.
    feature_cols = [col for col in df_train.columns if col != target_col]

    # `pd.api.types.is_categorical_dtype` is deprecated since pandas 2.1;
    # the isinstance check is the documented replacement.
    cat_cols = [
        col for col in feature_cols
        if isinstance(df_train[col].dtype, pd.CategoricalDtype)
    ]
    cat_set = set(cat_cols)

    num_cols = []
    bin_cols = []
    for col in feature_cols:
        if col in cat_set:
            continue
        # nunique() ignores NaN, so a column with two values plus missing entries is still binary.
        if df_train[col].nunique() == 2:
            bin_cols.append(col)
        else:
            num_cols.append(col)

    return num_cols, bin_cols, cat_cols

# Column groups of the preprocessed training frame.
num_cols, bin_cols, cat_cols = get_cols(train)
# Number of distinct values observed in the training frame for each categorical column.
cat_cardinalities = [train[col].nunique() for col in cat_cols]

# Report the numeric / binary / categorical column groups (labels are printed in Korean).
print(f'수치형 변수: {len(num_cols)}개 \n{num_cols}')
print(f'이진형 변수: {len(bin_cols)}개 \n{bin_cols}')
print(f'범주형 변수: {len(cat_cols)}개 \n{cat_cols}')

In [None]:
def build_dataset_from_dfs(train_df, valid_df, test_df, num_cols, bin_cols, cat_cols, target_col='임신 성공 여부'):
    """Convert the train / validation / test frames into a nested dict of tensors.

    Returns a dict with the keys ``'X_num'``, ``'X_bin'``, ``'X_cat'`` and ``'Y'``.
    Each feature entry maps ``'train'`` / ``'val'`` / ``'test'`` to a tensor built
    from the corresponding columns (float32 for numeric and binary columns, long
    for categorical ones). ``'X_cat'`` is ``None`` when ``cat_cols`` is empty.
    ``'Y'`` only has ``'train'`` and ``'val'`` entries (long), because the test
    frame may not carry the target column.
    """
    frames = {'train': train_df, 'val': valid_df, 'test': test_df}

    def _to_tensors(columns, dtype):
        # One tensor per data part, taken from the same column subset.
        return {
            part: torch.tensor(frame[columns].values, dtype=dtype)
            for part, frame in frames.items()
        }

    data = {
        'X_num': _to_tensors(num_cols, torch.float32),
        'X_bin': _to_tensors(bin_cols, torch.float32),
        'X_cat': _to_tensors(cat_cols, torch.long) if cat_cols else None,
        # No 'test' entry: the test data may not contain the target.
        'Y': {
            part: torch.tensor(frames[part][target_col].values, dtype=torch.long)
            for part in ('train', 'val')
        },
    }
    return data

def move_data_to_device(data, device):
    """Move every tensor stored in ``data`` to ``device`` and return ``data``.

    ``data`` is a dict such as ``{'X_num': {'train': tensor, 'val': tensor, ...}, ...}``.
    Values that are ``None`` are left untouched, dict values are updated in place
    part by part, and any other value is replaced by ``value.to(device)``.
    """
    for name, entry in list(data.items()):
        if entry is None:
            continue
        if not isinstance(entry, dict):
            data[name] = entry.to(device)
            continue
        # Nested dict: rewrite each part inside the same dict object.
        for part, tensor in list(entry.items()):
            entry[part] = tensor.to(device)
    return data


class MyDataset:
    """Light container pairing the tensor dict with its feature-count metadata.

    ``n_num_features``, ``n_bin_features`` and ``Y`` are read-only properties;
    ``cat_cardinalities`` is a regular method and must be called.
    """

    def __init__(self, data, n_num_features, n_bin_features, cat_cardinalities, is_regression=False, is_multiclass=True):
        self.data = data
        self.is_regression = is_regression
        self.is_multiclass = is_multiclass
        # Metadata is kept in private attributes and exposed through the accessors below.
        self._n_num_features = n_num_features
        self._n_bin_features = n_bin_features
        self._cat_cardinalities = cat_cardinalities

    @property
    def n_num_features(self):
        """Number of numeric features given at construction."""
        return self._n_num_features

    @property
    def n_bin_features(self):
        """Number of binary features given at construction."""
        return self._n_bin_features

    def cat_cardinalities(self):
        """Return the categorical cardinalities given at construction."""
        return self._cat_cardinalities

    @property
    def Y(self):
        """The ``'Y'`` entry of the underlying data dict."""
        return self.data['Y']

    def size(self, part: str) -> int:
        """Return the number of rows in ``part``.

        The target tensor is used when ``part`` has one; otherwise the size is
        taken from the numeric feature tensor.
        """
        targets = self.data['Y']
        source = targets if part in targets else self.data['X_num']
        return source[part].shape[0]

In [None]:
# Stand-alone TabR model with fixed hyperparameters, built from the column groups
# computed on the full preprocessed training frame.
# NOTE(review): `objective` below creates its own model / optimizer / criterion per fold,
# so these globals do not appear to be used by the search in this notebook — confirm.
model = Model(
    n_num_features=len(num_cols),
    n_bin_features=len(bin_cols),
    cat_cardinalities=cat_cardinalities,
    n_classes=2,
    num_embeddings=None,      # None when no embeddings are used
    d_main=64,
    d_multiplier=2.0,
    encoder_n_blocks=2,
    predictor_n_blocks=2,
    mixer_normalization=True,
    context_dropout=0.1,
    dropout0=0.1,
    dropout1='dropout0',      # original note: the string 'dropout0' makes the model reuse the dropout0 value
    normalization="BatchNorm1d",
    activation="ReLU",
    memory_efficient=False,
    candidate_encoding_batch_size=None,
).to(device)

# Adam over all model parameters; binary cross-entropy computed on raw logits.
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

In [None]:
seed = 333

def objective(trial):
    """Optuna objective: cross-validate one TabR configuration and return its mean test AUC.

    For each data seed in ``[1, 7]`` the matching custom train / test CSVs are
    loaded and a 5-fold stratified CV is run. Every fold preprocesses its own
    split with ``all_process``, trains a fresh ``Model`` with Adam and
    ``BCEWithLogitsLoss`` using early stopping on the validation loss, restores
    the best checkpoint from disk and predicts the validation and test parts.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The test AUC (``calculate_auc`` on the fold-averaged test predictions),
        averaged over the data seeds.
    """
    ### Hyperparameter search space ###

    d_main = trial.suggest_int("d_main", 32, 128, step=32)
    d_multiplier = trial.suggest_float("d_multiplier", 1.0, 3.0, step=0.5)
    encoder_n_blocks = trial.suggest_int("encoder_n_blocks", 1, 3)
    predictor_n_blocks = trial.suggest_int("predictor_n_blocks", 1, 3)
    dropout0 = trial.suggest_float("dropout0", 0.0, 0.5, step=0.05)
    context_size = trial.suggest_int("context_size", 2, 64)
    context_dropout = trial.suggest_float("context_dropout", 0.0, 0.5, step=0.1)
    # `suggest_loguniform` is deprecated; `suggest_float(..., log=True)` samples the same range.
    learning_rate = trial.suggest_float("lr", 1e-4, 1e-2, log=True)

    #####################################

    # The helpers below read `dataset`, `model` and `train_indices`, which are
    # (re)bound inside the fold loop further down before the helpers are called.

    def get_Xy(part: str, idx: torch.Tensor = None) -> tuple[dict, torch.Tensor]:
        """Return ``(features, target)`` for ``part``, optionally restricted to rows ``idx``.

        ``features`` maps the feature-group name without the ``X_`` prefix
        (``'num'``, ``'bin'``, ``'cat'``) to its tensor; ``target`` is ``None``
        when the part has no labels.
        """
        batch = (
            {
                key[2:]: dataset.data[key][part]
                for key in dataset.data
                # `build_dataset_from_dfs` stores X_cat=None when there are no
                # categorical columns; skip such groups instead of indexing None.
                # Assumes Model accepts inputs without that key — TODO confirm.
                if key.startswith('X_') and dataset.data[key] is not None
            },
            dataset.data['Y'][part] if 'Y' in dataset.data and part in dataset.data['Y'] else None
        )
        if idx is None:
            return batch
        else:
            return (
                {k: v[idx] for k, v in batch[0].items()},
                batch[1][idx] if batch[1] is not None else None
            )

    def apply_model(part: str, idx: torch.Tensor, is_train: bool) -> torch.Tensor:
        """Run the model on rows ``idx`` of ``part`` using training rows as retrieval candidates."""
        x, y = get_Xy(part, idx)
        candidate_indices = train_indices
        if is_train:
            # Training: remove the current batch from the candidates
            candidate_indices = candidate_indices[~torch.isin(candidate_indices, idx)]
        # Candidates: the whole training part, or only the selected indices when some were removed
        candidate_x, candidate_y = get_Xy('train', None if candidate_indices.equal(train_indices) else candidate_indices)
        return model(
            x_=x,
            y=y if is_train else None,
            candidate_x_=candidate_x,
            candidate_y=candidate_y,
            context_size=context_size,
            is_train=is_train,
        ).squeeze(-1)


    valid_aucs = []
    test_aucs = []

    data_seeds = [1, 7]
    for data_seed in data_seeds:
        valid_scores = []
        test_preds = []

        # Reset every RNG from the single module-level `seed` so each data seed
        # starts from the same random state.
        torch.manual_seed(seed)
        delu.random.seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        train_path = f'../data/custom_train_{data_seed}.csv'
        test_path = f'../data/custom_test_{data_seed}.csv'

        train = pd.read_csv(train_path).drop(columns=['ID'])
        test = pd.read_csv(test_path).drop(columns=['ID'])

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['임신 성공 여부'])):
            fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
            fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
            # Untouched copy of the fold's training rows, used to preprocess the test frame.
            fold_train2 = fold_train.copy()
            fold_test = test.copy()

            # Preprocess (train, valid) and (train, test) separately, each against the fold's training rows.
            fold_train, fold_valid = all_process(fold_train, fold_valid)
            _, fold_test = all_process(fold_train2, fold_test)

            num_cols, bin_cols, cat_cols = get_cols(fold_train)
            cat_cardinalities = [fold_train[col].nunique() for col in cat_cols]

            data_dict = build_dataset_from_dfs(
                fold_train, fold_valid, fold_test,
                num_cols, bin_cols, cat_cols, target_col='임신 성공 여부'
            )
            data_dict = move_data_to_device(data_dict, device)
            dataset = MyDataset(data_dict, n_num_features=len(num_cols), n_bin_features=len(bin_cols), cat_cardinalities=cat_cardinalities)

            train_size = dataset.size('train')
            train_indices = torch.arange(train_size, device=device)

            model = Model(
                n_num_features=len(num_cols),
                n_bin_features=len(bin_cols),
                cat_cardinalities=cat_cardinalities,
                n_classes=2,
                num_embeddings=None,      # None when no embeddings are used
                d_main=d_main,
                d_multiplier=d_multiplier,
                encoder_n_blocks=encoder_n_blocks,
                predictor_n_blocks=predictor_n_blocks,
                mixer_normalization=True,
                context_dropout=context_dropout,
                dropout0=dropout0,
                dropout1='dropout0',      # original note: the string 'dropout0' makes the model reuse the dropout0 value
                normalization="BatchNorm1d",
                activation="ReLU",
                memory_efficient=False,
                candidate_encoding_batch_size=None,
            ).to(device)

            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
            criterion = nn.BCEWithLogitsLoss()

            # Effectively "train until early stopping triggers".
            num_epochs = 100000
            batch_size = 2048

            patience = 10
            best_val_loss = float('inf')
            early_stop_counter = 0

            # The same file is overwritten by every fold and trial.
            checkpoint_path = "best_model_TabR.pth"

            # Loop-invariant values; `val_indices` is also needed after the epoch loop.
            num_batches = math.ceil(train_size / batch_size)
            val_indices = torch.arange(dataset.size('val'), device=device)

            for epoch in range(num_epochs):
                model.train()
                # Shuffle the training indices
                shuffled_indices = train_indices[torch.randperm(train_size)]
                epoch_loss = 0.0
                for i in range(num_batches):
                    idx = shuffled_indices[i * batch_size : (i + 1) * batch_size]
                    outputs = apply_model('train', idx, is_train=True)

                    # Targets of the current batch
                    _, y_batch = get_Xy('train', idx)

                    y_batch = y_batch.float()
                    loss = criterion(outputs.squeeze(), y_batch.squeeze()) # squeeze so both shapes match, e.g. (batch_size,)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    # Weight by batch size so the epoch average is per sample.
                    epoch_loss += loss.item() * idx.numel()

                avg_loss = epoch_loss / train_size

                model.eval()
                with torch.no_grad():
                    outputs_val = apply_model('val', val_indices, is_train=False)
                    _, y_val = get_Xy('val', val_indices)

                    val_loss = criterion(outputs_val.squeeze(), y_val.float().squeeze()).item() # validation loss

                    outputs_val = torch.sigmoid(outputs_val)
                    outputs_val_np = outputs_val.detach().cpu().numpy().squeeze()
                    y_val_np = y_val.detach().cpu().numpy().squeeze()

                    val_auc = roc_auc_score(y_val_np, outputs_val_np)

                # print(f"[Epoch {epoch+1}] Train Loss: {avg_loss:.4f}, Valid Loss: {val_loss:.4f}, Valid AUC: {val_auc:.6f}")

                # Early stopping on validation loss; the best weights are checkpointed to disk.
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    early_stop_counter = 0
                    torch.save(model.state_dict(), checkpoint_path)
                else:
                    early_stop_counter += 1
                    if early_stop_counter >= patience:
                        break

            # Restore the weights of the epoch with the lowest validation loss.
            model.load_state_dict(torch.load(checkpoint_path))
            # print(f'\n[Fold{fold+1} Result]')


            model.eval()
            with torch.no_grad():
                test_indices = torch.arange(dataset.size('test'), device=device)

                outputs_val = torch.sigmoid(apply_model('val', val_indices, is_train=False))
                outputs_val_np = outputs_val.detach().cpu().numpy().squeeze()
                _, y_val = get_Xy('val', val_indices)
                y_val_np = y_val.detach().cpu().numpy().squeeze()

                valid_score = roc_auc_score(y_val_np, outputs_val_np)
                valid_scores.append(valid_score)

                y_pred_test = torch.sigmoid(apply_model('test', test_indices, is_train=False))
                y_pred_test_np = y_pred_test.detach().cpu().numpy().squeeze()
                test_preds.append(y_pred_test_np)
                # Per-fold test AUC; only consumed by the optional debug print below.
                test_auc_fold = calculate_auc(y_pred_test_np, seed=data_seed)
                # print(f'[Data Seed {data_seed} Fold {fold+1}] Valid AUC: {valid_score:.5f}, Test AUC: {test_auc_fold:.5f}')

            torch.cuda.empty_cache()

        valid_auc = np.mean(valid_scores, axis=0)
        valid_aucs.append(valid_auc)
        # Ensemble the folds by averaging their test predictions before scoring.
        test_auc = calculate_auc(np.mean(test_preds, axis=0), data_seed)
        test_aucs.append(test_auc)

        print('-' * 60)
        print(f'[Data Seed {data_seed}] AVG Valid AUC: {valid_auc:.5f}, Test AUC: {test_auc}')

    avg_valid_auc = np.mean(valid_aucs)
    avg_test_auc = np.mean(test_aucs)

    print('-' * 60)
    print(f'[Data Seed 1,7] AVG Valid AUC: {avg_valid_auc:.5f}, Test AUC: {avg_test_auc}')
    print('-' * 60)

    # NOTE(review): the study maximizes the *test* AUC, i.e. hyperparameters are
    # selected on the test split. Returning `avg_valid_auc` would avoid that
    # leakage — confirm the intended selection metric before changing it.
    return avg_test_auc

# Optuna 스터디 생성 (최대화 방향)
# Create the Optuna study (direction: maximize) and run 20 trials of `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Report the objective value and sampled hyperparameters of the best trial.
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")