In [4]:
import os

# These variables are exported BEFORE torch is imported: the OpenMP and cuBLAS
# runtimes read them when they initialise, so setting them after the import
# (as this cell used to do) can be too late to have any effect.
os.environ["RDKIT_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"]   = "1"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
## or CUBLAS_WORKSPACE_CONFIG=:16:8

import torch

# Ask PyTorch for deterministic kernels; warn_only=True downgrades the error
# for ops without a deterministic implementation to a warning.
torch.use_deterministic_algorithms(True, warn_only=True)

In [2]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
import copy
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.distributions.beta import Beta
import math
from torch_geometric.nn import PNA, global_mean_pool
from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import degree
from torch_ema import ExponentialMovingAverage
from rdkit import Chem

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

from graph_dataset_ogb import OnTheFlyOGBCompatibleSmilesDataset, mask_edges, mask_nodes

In [3]:
def normalized_rmse(y_true, y_pred):
    """Return the RMSE of ``y_pred`` divided by the value range of ``y_true``.

    The range is ``max(y_true) - min(y_true)``; no guard is applied when the
    targets are constant (range of zero).
    """
    mse = mean_squared_error(y_true, y_pred)
    value_range = np.max(y_true) - np.min(y_true)
    return np.sqrt(mse) / value_range

def pearson_correlation(y_true, y_pred):
    """Return the Pearson correlation of the two vectors, clipped to [0, 1].

    Negative correlations are reported as 0. When the correlation is undefined
    (e.g. one of the inputs is constant, which makes ``np.corrcoef`` yield
    NaN) the function returns 0.0 instead of propagating NaN into the score.
    """
    # A constant input makes corrcoef divide by a zero standard deviation;
    # silence that warning here because the NaN is handled explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.corrcoef(y_true, y_pred)[0, 1]

    if np.isnan(corr):
        return np.float64(0.0)
    return np.clip(corr, 0, 1)

def competition_score(y_true, y_pred):
    """Combine error and correlation into a single score.

    The score is the equally weighted sum of ``1 - min(NRMSE, 1)`` and the
    clipped Pearson correlation.
    """
    error_term = 1 - min(normalized_rmse(y_true, y_pred), 1)
    correlation_term = pearson_correlation(y_true, y_pred)
    return 0.5 * error_term + 0.5 * correlation_term

In [7]:
# Run configuration shared by the cells in this notebook.
CFG = dict(
    EPOCHS=200,           # upper bound on training epochs per fold
    PATIENCE=25,          # early stopping once the non-improving epoch count exceeds this
    LEARNING_RATE=1e-4,   # AdamW learning rate
    WEIGHT_DECAY=1e-4,    # AdamW weight decay
    BATCH_SIZE=512,       # batch size of the train / valid / test loaders
    AUG_PROB=0.5,         # not referenced by the cells visible in this notebook
    SEED=2025,            # 6174
    NUM_WORKERS=10,       # worker processes of the training DataLoader
    USE_AMP=False,
)

In [8]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic,
    non-benchmarking mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def seed_worker(worker_id):
    """DataLoader ``worker_init_fn``: seed NumPy and ``random`` from torch's seed.

    ``worker_id`` is accepted for the callback signature but not used; the
    seed is taken from ``torch.initial_seed()`` reduced modulo 2**32.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# Seed every RNG once, up front, with the configured seed.
seed_everything(CFG['SEED'])

In [9]:
import torch
import torch_geometric.transforms as T
from torch_geometric.data import Data

# [Existing code] MASK_NODE_FEATURE_VECTOR is used as-is.
# NOTE(review): presumably each entry is the vocabulary size of one atom
# feature, so the vector acts as an out-of-vocabulary "[MASK]" index per
# feature — confirm against the featurizer in graph_dataset_ogb.
ATOM_FEATURE_VOCAB_SIZES = [119, 4, 12, 12, 10, 6, 6, 2, 2]
MASK_NODE_FEATURE_VECTOR = torch.tensor(ATOM_FEATURE_VOCAB_SIZES, dtype=torch.long)

class MaskNodes(object):
    """
    Transform that overwrites the features of a random subset of nodes in a
    single graph (``Data`` object) with ``MASK_NODE_FEATURE_VECTOR``.
    """
    def __init__(self, mask_ratio: float):
        self.mask_ratio = mask_ratio

    def __call__(self, data: Data) -> Data:
        """Return ``data`` unchanged, or a masked clone of it."""
        # Masking disabled: hand the input back untouched.
        if self.mask_ratio == 0.0:
            return data

        node_count = data.num_nodes
        mask_count = int(node_count * self.mask_ratio)

        # Graph too small for the ratio to select even one node.
        if mask_count == 0:
            return data

        # Work on a clone so the caller's object is not modified.
        masked = data.clone()

        # The first `mask_count` entries of a random permutation are the victims.
        chosen = torch.randperm(node_count)[:mask_count]

        # Replace the selected rows with the [MASK] token.
        masked.x[chosen] = MASK_NODE_FEATURE_VECTOR
        return masked

class MaskEdges(object):
    """
    Transform that removes a random subset of edge pairs from a single graph
    (``Data`` object).

    NOTE(review): the index arithmetic assumes the edge list stores one
    direction of every pair in its first half and the reverse direction at the
    same offset in its second half. Confirm that the dataset produces this
    layout (an interleaved (i,j),(j,i),... layout would not be handled).
    """
    def __init__(self, mask_ratio: float):
        self.mask_ratio = mask_ratio

    def __call__(self, data: Data) -> Data:
        """Return ``data`` unchanged, or a clone with some edge pairs removed."""
        if self.mask_ratio == 0.0:
            return data

        edge_count = data.num_edges
        if edge_count == 0:
            return data

        # Assumption: every edge exists as an (i, j) / (j, i) pair.
        pair_count = edge_count // 2
        drop_count = int(pair_count * self.mask_ratio)
        if drop_count == 0:
            return data

        # Work on a clone so the caller's object is not modified.
        pruned = data.clone()

        # Pick the pairs to drop, then mark both directions of each pair.
        forward_idx = torch.randperm(pair_count)[:drop_count]
        keep = torch.ones(edge_count, dtype=torch.bool)
        keep[forward_idx] = False
        keep[forward_idx + pair_count] = False

        pruned.edge_index = pruned.edge_index[:, keep]
        if pruned.edge_attr is not None:
            pruned.edge_attr = pruned.edge_attr[keep]

        return pruned

class RandomApply(object):
    """
    Apply the wrapped transform with probability ``p``; otherwise pass the
    input through unchanged.
    """
    def __init__(self, transform, p: float):
        self.transform = transform
        self.p = p

    def __call__(self, data: Data) -> Data:
        """Draw one number from Python's ``random`` module and decide."""
        apply_now = random.random() < self.p
        return self.transform(data) if apply_now else data

In [10]:
def cal_degree(dataset_):
    """Build the node-degree histogram of ``dataset_`` (used as PNA's ``deg``).

    Degrees are counted over ``edge_index[1]`` of every batch. The returned
    1-D long tensor holds, at position ``k``, the number of nodes with degree
    ``k``. It has at least 10 bins and grows automatically when a node with a
    larger degree is encountered (the previous fixed-size accumulator raised a
    shape-mismatch error in that case).
    """
    loader_ = DataLoader(
        dataset_, batch_size=128, shuffle=False,
        num_workers=3, pin_memory=True, persistent_workers=True, prefetch_factor=1,
        worker_init_fn=seed_worker
    )

    deg = torch.zeros(10, dtype=torch.long)
    for batch in loader_:
        d = degree(batch.edge_index[1], num_nodes=batch.num_nodes, dtype=torch.long)
        counts = torch.bincount(d, minlength=deg.numel())

        # bincount returns more bins than `deg` when this batch contains a
        # degree >= deg.numel(); pad the accumulator instead of failing.
        extra_bins = counts.numel() - deg.numel()
        if extra_bins > 0:
            deg = torch.cat([deg, deg.new_zeros(extra_bins)])

        deg += counts

    # Drop the loader explicitly so its persistent workers can be released.
    del loader_

    print(f'차수 계산 완료, degree: {deg}')
    return deg

class ScaledSigmoid(nn.Module):
    """Sigmoid applied to the input multiplied by a fixed ``scale``."""

    def __init__(self, scale=2.5):
        super().__init__()
        self.scale = scale

    def forward(self, x):
        """Return ``sigmoid(x * scale)``."""
        scaled = x * self.scale
        return scaled.sigmoid()

class ClampedGeneralizedSigmoid(nn.Module):
    """Sigmoid stretched to ``intermediate_range`` and clamped to ``final_range``."""

    def __init__(self, intermediate_range=(-50.0, 150.0), final_range=(0.0, 100.0)):
        super().__init__()
        self.inter_min, self.inter_max = intermediate_range
        self.final_min, self.final_max = final_range
        self.inter_range = self.inter_max - self.inter_min

    def forward(self, x):
        """Map ``x`` through the stretched sigmoid, then clamp the result."""
        # Generalised sigmoid: rescale (0, 1) onto the intermediate range.
        stretched = torch.sigmoid(x) * self.inter_range + self.inter_min

        # Force the prediction into the final range.
        return stretched.clamp(self.final_min, self.final_max)


class pna4finetuning(torch.nn.Module):
    """PNA encoder + mean pooling + small MLP head for graph-level regression.

    Predictions are ``sigmoid(head(pooled)) * 100``, i.e. bounded to (0, 100).

    Args:
        input_dim: number of node feature channels fed to the encoder.
        hidden_dim: hidden width of the encoder and input width of the head.
        num_layers: number of PNA layers.
        deg: degree histogram passed to PNA.
        edge_dim: number of edge feature channels.
        model_path: optional checkpoint; every entry whose key starts with
            ``"encoder."`` is loaded into the encoder.
        freeze_encoder: when True, encoder parameters get ``requires_grad=False``.
        scale_factor: scale of the ``ScaledSigmoid`` stored in ``self.activation``.
    """
    def __init__(
            self,
            input_dim: int, hidden_dim: int, num_layers: int, deg: torch.Tensor, edge_dim: int,
            model_path=None, freeze_encoder: bool = False, scale_factor = 2.0,
    ):
        super().__init__()
        # Both activations are registered as sub-modules, but none of the
        # forward paths below calls them (a plain sigmoid * 100 is used).
        self.activation = ScaledSigmoid(scale=scale_factor)
        self.final_activation = ClampedGeneralizedSigmoid(
            intermediate_range=(-50.0, 150.0), # wide target range for the sigmoid
            final_range=(0.0, 100.0)           # final output range
        )

        aggregators = ['mean', 'min', 'max', 'std']
        scalers = ['identity', 'amplification', 'attenuation']

        self.encoder = PNA(
            in_channels=input_dim,
            hidden_channels=hidden_dim,
            num_layers=num_layers,
            out_channels=None,
            dropout=0.1,
            act='relu',
            norm=None,
            jk='cat',
            # arguments required by PNA
            aggregators=aggregators,
            scalers=scalers,
            deg=deg,
            edge_dim=edge_dim,
        )

        if model_path is not None:
            # NOTE(review): torch.load unpickles the file. Only point
            # model_path at trusted checkpoints (or pass weights_only=True if
            # the installed torch version supports it).
            pretrained_state_dict = torch.load(model_path, map_location='cpu')

            # Keep only the encoder weights and strip the leading prefix once:
            # 'encoder.convs.0.weight' -> 'convs.0.weight'
            prefix = "encoder."
            encoder_state_dict = {
                k[len(prefix):]: v
                for k, v in pretrained_state_dict.items()
                if k.startswith(prefix)
            }

            # Load the filtered / renamed state dict into self.encoder.
            self.encoder.load_state_dict(encoder_state_dict)
            print("가중치 로드 완료.")

        if freeze_encoder:
            print("사전 학습된 인코더를 동결합니다 (학습되지 않음).")
            for param in self.encoder.parameters():
                param.requires_grad = False
        else:
            print("사전 학습된 인코더를 동결하지 않습니다 (전체 모델 학습).")

        self.pool = global_mean_pool

        self.finetune_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1),
        )

    def forward(self, batch):
        """Predict one value in (0, 100) per graph of ``batch``."""
        # Single code path: encode, then apply the head. This is exactly what
        # the previously duplicated forward body computed.
        return self.predict_from_repr(self.encode_graph(batch))

    def encode_graph(self, batch):
        """Return the mean-pooled encoder output for every graph in ``batch``."""
        node_features = batch.x.float()
        edge_features = batch.edge_attr.float() if batch.edge_attr is not None else None

        node_repr = self.encoder(
            x=node_features,
            edge_index=batch.edge_index,
            edge_attr=edge_features,
            batch=batch.batch
        )
        return self.pool(node_repr, batch.batch)  # [B, hidden_dim]

    def predict_from_repr(self, graph_repr):
        """Apply the fine-tuning head and map its output to (0, 100)."""
        out = self.finetune_head(graph_repr).squeeze(1)
        return torch.sigmoid(out) * 100

    def forward_mixup(self, batch, lam, perm):
        """
        Mix graph representations before the head is applied.

        batch: PyG Batch
        lam: mixing weight in [0,1]; a 0-dim tensor or one value per graph
        perm: permutation indices over graphs in the batch (len == batch_size)
        """
        repr_ = self.encode_graph(batch)                 # [B, H]

        # Per-graph weights need a trailing axis to broadcast over features;
        # a 0-dim (scalar) lam broadcasts as-is.
        if lam.dim() > 0:
            lam = lam.view(-1, 1)                        # [B, 1]

        mixed = lam * repr_ + (1. - lam) * repr_[perm]
        return self.predict_from_repr(mixed)             # [B]

In [11]:
def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    """Return the RDKit canonical SMILES of ``smile``, or ``np.nan`` on failure.

    Failure covers both a SMILES RDKit cannot parse (``MolFromSmiles`` gives
    ``None``) and any exception raised for an invalid input value.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            return np.nan
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit still stop a long-running `.map(...)`.
        return np.nan

In [12]:
data_name_list = ['AID_1851.csv', 'AID_884.csv', 'AID_885.csv']

# Read every external file once and concatenate in a single call, instead of
# growing a frame inside the loop starting from an empty DataFrame.
external_data_full = pd.concat(
    [pd.read_csv(f'../external_data/{data_name}') for data_name in data_name_list],
    ignore_index=True,
)
print(external_data_full.shape)

# Canonicalise BEFORE de-duplicating: two different spellings of the same
# molecule only compare equal once both are in canonical form, so dropping
# duplicates first (as this cell used to do) could leave duplicates behind.
external_data_full["Canonical_Smiles"] = external_data_full["Canonical_Smiles"].map(make_smile_canonical)

external_data_full = (
    external_data_full
    .dropna(subset=["Canonical_Smiles"])   # rows whose SMILES could not be canonicalised
    .drop_duplicates(subset=["Canonical_Smiles"], keep="first")
    .reset_index(drop=True)
)
print(external_data_full.shape)

(29179, 5)
(14421, 5)


In [13]:
# Use the 'Inhibition_max' column, clipped to [0, 100], as the regression
# target of the external data.
external_data_full['Inhibition'] = external_data_full['Inhibition_max'].clip(lower=0.0, upper=100.0)
external_data_full

Unnamed: 0,Canonical_Smiles,Inhibition_hill,Inhibition_fit,Inhibition_max,Inhibition_eff,Inhibition
0,CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1.Cl,0.000000,,2.2935,,2.2935
1,O=c1[nH]c2cc3c(cc2cc1CN(CCCO)Cc1nnnn1Cc1ccc(F)...,75.434847,105.4660,97.4718,,97.4718
2,CC(=O)N(c1ccc2oc(=O)sc2c1)S(=O)(=O)c1cccs1,47.039728,113.7150,94.7622,,94.7622
3,COc1ccccc1C(c1nnnn1C(C)(C)C)N1CCN(Cc2ccncc2)CC1,85.388103,96.2811,97.0468,,97.0468
4,CC(=O)Nc1cccc(NC(=O)C2CCCN2C(=O)Nc2ccccc2C)c1,0.000000,,-0.0054,,0.0000
...,...,...,...,...,...,...
14416,N#CC(C#N)=Cc1ccc([N+](=O)[O-])c(O)c1,44.867806,44.9374,51.3134,36.8698,51.3134
14417,COC(=O)[C@@]1(Cc2ccc(F)cc2)[C@H]2c3cc(C(=O)N4C...,55.181273,57.8365,58.4264,51.2518,58.4264
14418,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@H]([C@H](OC(=O)c3...,40.557578,40.5823,52.7098,42.5540,52.7098
14419,CC(C)(C)c1ccc(NS(=O)(=O)c2ccc(N)cc2)cc1,49.942458,50.2667,50.2546,55.2667,50.2546


In [14]:
# Labelled training data, with SMILES canonicalised by make_smile_canonical.
data = pd.read_csv('../data/train.csv')
data["Canonical_Smiles"] = data["Canonical_Smiles"].map(make_smile_canonical)

# Helper columns for stratified splitting: a > 0.1 indicator and a decile bin.
inhibition = data['Inhibition']
data['Inhibition_binary'] = inhibition.gt(0.1).astype(int)
data['Inhibition_bin10'] = pd.qcut(inhibition, q=10, labels=False, duplicates='drop').astype(int)

print(data.shape)

(1681, 5)


In [15]:
# Display the labelled training frame.
data

Unnamed: 0,ID,Canonical_Smiles,Inhibition,Inhibition_binary,Inhibition_bin10
0,TRAIN_0000,Cl.OC1(Cc2cccc(Br)c2)CCNCC1,12.500000,1,2
1,TRAIN_0001,Brc1ccc2c3c(ccnc13)CCO2,4.450000,1,1
2,TRAIN_0002,CC1(CO)CC(c2cc([N+](=O)[O-])c(F)cc2Cl)=NO1,4.920000,1,1
3,TRAIN_0003,O=C(c1cccc(OCc2cccc(Nc3nc4ccc(F)cc4[nH]3)c2)c1...,71.500000,1,8
4,TRAIN_0004,CC(C)CC(=O)c1c(S(C)=O)[nH]c2c(Cl)ccc(Cl)c2c1=O,18.300000,1,3
...,...,...,...,...,...
1676,TRAIN_1676,Cc1cc2ncn(CC3CCN(S(=O)(=O)CCN4C(=O)CCCC4=O)CC3...,0.500000,1,0
1677,TRAIN_1677,O=C(Cn1ncccc1=O)N1Cc2cnc(N3CCOCC3)nc2C1,0.500000,1,0
1678,TRAIN_1678,COc1coc(C(=O)Nc2cccc3c2ccn3C)cc1=O,0.500000,1,0
1679,TRAIN_1679,Cc1cc(=O)n(CCNC(=O)c2nc3nc(C)cc(C)n3n2)cn1,0.500000,1,0


In [16]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {DEVICE}')

Using device: cuda


In [17]:
def train_fn(
        model, loader, optimizer, criterion, ema,
        mixup_alpha=0.4, mixup_prob=1.0, mixup_off_epoch=None, epoch_idx=0):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    When mixup is active (``mixup_prob > 0`` and, if ``mixup_off_epoch`` is
    set, ``epoch_idx < mixup_off_epoch``), every batch with more than one
    graph is trained through ``model.forward_mixup`` with per-graph mixing
    weights; otherwise the plain forward pass is used. ``ema.update()`` is
    called after every optimizer step.
    """
    model.train()

    mixup_still_on = mixup_off_epoch is None or epoch_idx < mixup_off_epoch
    mixup_enabled = (mixup_prob > 0.0) and mixup_still_on

    loss_sum = 0.0
    for batch in loader:
        optimizer.zero_grad()
        batch = batch.to(DEVICE)
        targets = batch.y.view(-1).to(torch.float32)
        n_graphs = targets.shape[0]

        if not (mixup_enabled and n_graphs > 1):
            preds = model(batch).view(-1)
            loss = criterion(preds, targets)
        else:
            # Per-graph switch: 1 -> mix this graph, 0 -> keep it as is.
            selected = (torch.rand(n_graphs, device=DEVICE) < mixup_prob).to(torch.float32)  # [B]

            # Per-graph lambda ~ Beta(alpha, alpha), folded onto [0.5, 1] so
            # the original graph always carries the larger weight.
            lam = Beta(mixup_alpha, mixup_alpha).sample((n_graphs,)).to(DEVICE)              # [B]
            lam = torch.maximum(lam, 1.0 - lam)

            # Unselected graphs get lambda = 1, i.e. they stay unmixed.
            lam = selected * lam + (1.0 - selected) * 1.0                                    # [B]

            perm = torch.randperm(n_graphs, device=DEVICE)
            preds = model.forward_mixup(batch, lam, perm).view(-1)
            mixed_targets = lam * targets + (1.0 - lam) * targets[perm]
            loss = criterion(preds, mixed_targets)

        loss.backward()
        clip_grad_norm_(model.parameters(), 1000)
        optimizer.step()
        ema.update()

        loss_sum += loss.item()

    return loss_sum / len(loader)

def _predict_batch(model, data, is_tta):
    """Run ``model`` on one batch; return ``(batch_on_device, predictions)``.

    With ``is_tta`` the prediction is the mean over three views: the original
    batch, a node-masked copy and an edge-masked copy (both with ratio 0.1).
    """
    if not is_tta:
        data = data.to(DEVICE)
        return data, model(data).view(-1)

    # Build both augmented views from deep copies before the batch is moved
    # to DEVICE (same order of operations as the original inline code).
    node_masked = mask_nodes(copy.deepcopy(data), 0.1)
    edge_masked = mask_edges(copy.deepcopy(data), 0.1)

    data = data.to(DEVICE)
    node_masked = node_masked.to(DEVICE)
    edge_masked = edge_masked.to(DEVICE)

    # Original + augmented predictions, averaged (TTA).
    per_view = torch.stack(
        [model(data).view(-1), model(node_masked).view(-1), model(edge_masked).view(-1)],
        dim=0,
    )
    return data, per_view.mean(dim=0)


def _run_inference(model, loader, is_tta, collect_labels):
    """Predict over ``loader`` in eval mode without gradients.

    Returns ``(predictions, labels)`` as 1-D NumPy arrays; ``labels`` is
    ``None`` when ``collect_labels`` is False (``batch.y`` is then never read).
    """
    model.eval()

    pred_chunks = []
    label_chunks = []
    with torch.no_grad():
        for data in loader:
            data, outputs = _predict_batch(model, data, is_tta)
            pred_chunks.append(outputs.detach().cpu().float().numpy())
            if collect_labels:
                label_chunks.append(data.y.view(-1).detach().cpu().float().numpy())

    final_preds = np.concatenate(pred_chunks)
    final_labels = np.concatenate(label_chunks) if collect_labels else None
    return final_preds, final_labels


def valid_fn(model, loader, is_tta):
    """Evaluate ``model`` on ``loader``; return ``(nrmse, pearson, score)``."""
    final_preds, final_labels = _run_inference(model, loader, is_tta, collect_labels=True)

    nrmse = normalized_rmse(final_labels, final_preds)
    pearson = pearson_correlation(final_labels, final_preds)
    score = competition_score(final_labels, final_preds)

    return nrmse, pearson, score

def evaluate_with_bootstrap(model, loader, is_tta, n_repeats=30, sample_ratio=0.8):
    """Estimate the spread of the competition score by repeated subsampling.

    Predictions are computed once; then ``n_repeats`` random subsets of size
    ``int(n_total * sample_ratio)`` are drawn WITHOUT replacement (so, despite
    the name, this is subsampling rather than a classic bootstrap) and scored.

    Returns ``(mean_score, std_score)`` over the repeats.
    """
    # 1. Predict the whole validation set once.
    final_preds, final_labels = _run_inference(model, loader, is_tta, collect_labels=True)

    # 2. Subsample and score.
    bootstrap_scores = []
    n_total = len(final_labels)
    sample_size = int(n_total * sample_ratio)

    for _ in range(n_repeats):
        # Random indices, drawn without replacement.
        indices = np.random.choice(n_total, size=sample_size, replace=False)
        score = competition_score(final_labels[indices], final_preds[indices])
        bootstrap_scores.append(score)

    # 3. Mean and standard deviation of the collected scores.
    mean_score = np.mean(bootstrap_scores)
    std_score = np.std(bootstrap_scores)

    return mean_score, std_score



def predict(model, loader, is_tta):
    """Return the model's predictions over ``loader`` as a 1-D NumPy array."""
    final_preds, _ = _run_inference(model, loader, is_tta, collect_labels=False)
    return final_preds

In [20]:
# Number of cross-validation folds shared by the training and evaluation cells.
N_SPLITS = 3
# Shuffled K-fold, seeded from CFG; used for the labelled and the external data.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CFG['SEED'])
# Stratified variant; in this notebook it is only referenced from commented-out split lines.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CFG['SEED'])

In [21]:
# One external-data subset per fold: the K-fold "train" part of the external
# frame, with its exact-zero targets down-sampled to a fixed share.
external_train_list = []
for fold, (tr_idx, val_idx) in enumerate(kf.split(external_data_full)):
    external_train = external_data_full.iloc[tr_idx].reset_index(drop=True)

    is_zero = external_train['Inhibition'].eq(0)
    zeros_df, nonzeros_df = external_train[is_zero], external_train[~is_zero]

    # Number of zero-target rows to keep (the original data had roughly 5%
    # zeros, so a similar share is kept here).
    zero_ratio = 0.058299
    target_zero_count = int(round(zero_ratio * len(nonzeros_df) / (1 - zero_ratio)))

    # Random draw, capped by the number of zero rows actually available.
    keep_zero_n = min(target_zero_count, len(zeros_df))
    sampled_zeros = zeros_df.sample(n=keep_zero_n, random_state=CFG["SEED"])

    external_train_list.append(
        pd.concat([nonzeros_df, sampled_zeros], ignore_index=True)
    )

# Stratified alternatives that were left disabled:
# for fold, (train_idx, valid_idx) in enumerate(skf.split(data, data['Inhibition_binary'])):
# for fold, (train_idx, valid_idx) in enumerate(skf.split(data, data['Inhibition_bin10'])):
for fold, (train_idx, valid_idx) in enumerate(kf.split(data)):
    print(f"\n\n========== Fold {fold+1}/{N_SPLITS} ==========")
    # NOTE(review): a per-fold seed (CFG['SEED'] + (fold + 1) * 999) used to be
    # computed here but was never used; all folds keep sharing the RNG state
    # seeded once at notebook start. Re-seed per fold here if that was intended.

    fold_train = data.iloc[train_idx].copy().reset_index(drop=True)
    fold_valid = data.iloc[valid_idx].copy().reset_index(drop=True)

    # Leakage guard: never train on an external molecule that is also in this
    # fold's validation set, otherwise the validation score is inflated.
    # This is a no-op when the two sets do not overlap.
    external_fold = external_train_list[fold]
    held_out_smiles = set(fold_valid['Canonical_Smiles'])
    external_fold = external_fold[~external_fold['Canonical_Smiles'].isin(held_out_smiles)]

    fold_train = pd.concat([fold_train, external_fold], ignore_index=True)

    if fold == 0:
        print(f'Train Size: {fold_train.shape[0]}, Valid Size: {fold_valid.shape[0]}')

    # Built for experimentation; the datasets below are created with
    # transform=None, so these augmentations are currently not applied.
    train_transforms = T.Compose([
        RandomApply(MaskNodes(mask_ratio=0.1), p=0.5),
        RandomApply(MaskEdges(mask_ratio=0.1), p=0.5),
    ])

    # Release the previous fold's objects (and their persistent workers).
    # On the first fold the names do not exist yet, which is the only error
    # that is expected and ignored here.
    try:
        del train_dataset, train_loader
        del valid_dataset, valid_loader
        del model
    except NameError:
        pass

    train_dataset = OnTheFlyOGBCompatibleSmilesDataset(
        root="../train_tmp/MySmilesDataset",
        smiles_list=fold_train['Canonical_Smiles'].to_list(),
        labels_list=fold_train['Inhibition'].to_list(),
        # transform=train_transforms,
        transform=None,
    )
    train_loader = DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=CFG['BATCH_SIZE'],
        num_workers=CFG['NUM_WORKERS'],
        pin_memory=True,
        persistent_workers=True,
        prefetch_factor=2,
        worker_init_fn=seed_worker,
        generator=torch.Generator().manual_seed(CFG['SEED']),
    )

    valid_dataset = OnTheFlyOGBCompatibleSmilesDataset(
        root="../valid_tmp/MySmilesDataset",
        smiles_list=fold_valid['Canonical_Smiles'].to_list(),
        labels_list=fold_valid['Inhibition'].to_list(),
        transform=None,
    )
    valid_loader = DataLoader(
        valid_dataset,
        shuffle=False,
        batch_size=CFG['BATCH_SIZE'],
        num_workers=2,
        pin_memory=True,
        persistent_workers=True,
    )

    INPUT_DIM = train_dataset.num_node_features # 9
    EDGE_DIM = train_dataset.num_edge_features # 3
    HIDDEN_DIM = 128
    NUM_LAYERS = 5
    deg = cal_degree(train_dataset)

    model = pna4finetuning(
        input_dim=INPUT_DIM,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        deg=deg.to(DEVICE),
        edge_dim=EDGE_DIM,
        model_path='model_weights/pna_128_5_epoch30.pt',
        freeze_encoder=False,
    ).to(DEVICE)

    if fold == 0:
        print(model)

    criterion = nn.MSELoss()

    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['LEARNING_RATE'], weight_decay=CFG['WEIGHT_DECAY'])
    # NOTE(review): the EMA is updated inside train_fn, but validation and
    # checkpointing below use the raw model weights, not the averaged ones —
    # confirm whether that is intended.
    ema = ExponentialMovingAverage(model.parameters(), decay=0.995)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1)

    patience = 0
    best_nrmse = float('inf')
    best_pearson = -float('inf')
    best_score = -float('inf')
    best_epoch = 0
    best_model_weights = None
    for epoch in range(CFG['EPOCHS']):
        train_loss = train_fn(model, train_loader, optimizer, criterion, ema, mixup_alpha=0.4, mixup_prob=0.5,)
        nrmse, pearson, score = valid_fn(model, valid_loader, is_tta=False)
        # Read the LR that was used for this epoch before the scheduler advances it.
        current_lr = optimizer.param_groups[0]["lr"]
        scheduler.step()

        if score > best_score:
            best_nrmse = nrmse
            best_pearson = pearson
            best_score = score
            best_epoch = epoch + 1
            patience = 0
            # Snapshot the weights; state_dict() alone would keep tracking the live model.
            best_model_weights = copy.deepcopy(model.state_dict())
        else:
            patience += 1
            if patience > CFG['PATIENCE']:
                break

        print(
            f"[Fold {fold+1} Epoch {epoch+1:02d} LR {current_lr:.2e}] "
            f"Train loss {train_loss:.3f} | "
            f"Valid NRMSE {nrmse:.4f}, Pearson {pearson:.3f}, Score {score:.4f} | "
            f"Patience: {patience}"
        )

    if best_model_weights is not None:
        seed = CFG['SEED']
        save_path = f'./model_weights/model_{seed}_{fold+1}.pt'
        torch.save(best_model_weights, save_path)
        print(f'model saved in {save_path}')

    print(f'[Fold {fold+1}] Best epoch {best_epoch} | '
          f'NRMSE: {best_nrmse:.4f} Pearson: {best_pearson:.4f} Score: {best_score:.4f}')




Train Size: 9748, Valid Size: 561


Processing...
Done!
Processing...
Done!


차수 계산 완료, degree: tensor([   578,  42843, 116284,  71680,   4666,      1,      0,      0,      0,
             0])
가중치 로드 완료.
사전 학습된 인코더를 동결하지 않습니다 (전체 모델 학습).
pna4finetuning(
  (activation): ScaledSigmoid()
  (final_activation): ClampedGeneralizedSigmoid()
  (encoder): PNA(9, 128, num_layers=5)
  (finetune_head): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.1, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)
[Fold 1 Epoch 01 LR 1.00e-04] Train loss 1386.388 | Valid NRMSE 0.3451, Pearson 0.252, Score 0.4535 | Patience: 0
[Fold 1 Epoch 02 LR 9.76e-05] Train loss 1132.749 | Valid NRMSE 0.3441, Pearson 0.235, Score 0.4453 | Patience: 1
[Fold 1 Epoch 03 LR 9.05e-05] Train loss 1036.701 | Valid NRMSE 0.3284, Pearson 0.279, Score 0.4751 | Patience: 0
[Fold 1 Epoch 04 LR 7.94e-05] Train loss 971.891 | Valid NRM

Processing...
Done!
Processing...
Done!


차수 계산 완료, degree: tensor([   524,  42900, 115643,  71667,   4566,      1,      1,      0,      0,
             0])
가중치 로드 완료.
사전 학습된 인코더를 동결하지 않습니다 (전체 모델 학습).
[Fold 2 Epoch 01 LR 1.00e-04] Train loss 1401.397 | Valid NRMSE 0.3939, Pearson 0.209, Score 0.4076 | Patience: 0
[Fold 2 Epoch 02 LR 9.76e-05] Train loss 1218.970 | Valid NRMSE 0.4083, Pearson 0.265, Score 0.4286 | Patience: 0
[Fold 2 Epoch 03 LR 9.05e-05] Train loss 1064.577 | Valid NRMSE 0.3541, Pearson 0.319, Score 0.4822 | Patience: 0
[Fold 2 Epoch 04 LR 7.94e-05] Train loss 985.344 | Valid NRMSE 0.3556, Pearson 0.353, Score 0.4986 | Patience: 0
[Fold 2 Epoch 05 LR 6.55e-05] Train loss 925.280 | Valid NRMSE 0.3140, Pearson 0.368, Score 0.5270 | Patience: 0
[Fold 2 Epoch 06 LR 5.00e-05] Train loss 894.295 | Valid NRMSE 0.3055, Pearson 0.377, Score 0.5360 | Patience: 0
[Fold 2 Epoch 07 LR 3.45e-05] Train loss 867.300 | Valid NRMSE 0.3064, Pearson 0.394, Score 0.5438 | Patience: 0
[Fold 2 Epoch 08 LR 2.06e-05] Train loss 846.7

Processing...
Done!
Processing...
Done!


차수 계산 완료, degree: tensor([   547,  42890, 116484,  71708,   4559,      0,      1,      0,      0,
             0])
가중치 로드 완료.
사전 학습된 인코더를 동결하지 않습니다 (전체 모델 학습).
[Fold 3 Epoch 01 LR 1.00e-04] Train loss 1098.548 | Valid NRMSE 0.3029, Pearson 0.266, Score 0.4813 | Patience: 0
[Fold 3 Epoch 02 LR 9.76e-05] Train loss 975.604 | Valid NRMSE 0.2919, Pearson 0.314, Score 0.5111 | Patience: 0
[Fold 3 Epoch 03 LR 9.05e-05] Train loss 901.437 | Valid NRMSE 0.2962, Pearson 0.328, Score 0.5160 | Patience: 0
[Fold 3 Epoch 04 LR 7.94e-05] Train loss 839.399 | Valid NRMSE 0.2827, Pearson 0.354, Score 0.5356 | Patience: 0
[Fold 3 Epoch 05 LR 6.55e-05] Train loss 804.229 | Valid NRMSE 0.2570, Pearson 0.374, Score 0.5583 | Patience: 0
[Fold 3 Epoch 06 LR 5.00e-05] Train loss 784.657 | Valid NRMSE 0.2678, Pearson 0.377, Score 0.5547 | Patience: 1
[Fold 3 Epoch 07 LR 3.45e-05] Train loss 768.173 | Valid NRMSE 0.2564, Pearson 0.372, Score 0.5577 | Patience: 2
[Fold 3 Epoch 08 LR 2.06e-05] Train loss 748.945

In [22]:
# Load the test set and canonicalise its SMILES the same way as the training data.
test_data = pd.read_csv('../data/test.csv')
test_data["Canonical_Smiles"] = test_data["Canonical_Smiles"].map(make_smile_canonical)

In [23]:
# Unlabelled test set (labels_list=None), iterated in file order so that the
# predictions line up with the submission rows.
test_dataset = OnTheFlyOGBCompatibleSmilesDataset(
    root="../test_tmp/MySmilesDataset",
    smiles_list=test_data['Canonical_Smiles'].to_list(),
    labels_list=None,
)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=CFG['BATCH_SIZE'],
)

# Per-fold validation metrics.
val_nrmse_list, val_pearson_list, val_score_list = [], [], []
# Out-of-fold predictions / targets pooled over all folds.
valid_preds, valid_targets = [], []
# One array of test predictions per fold model.
test_preds_list = []

# Stratified alternative that was left disabled:
# for fold, (train_idx, valid_idx) in enumerate(skf.split(data, data['Inhibition_binary'])):
for fold, (train_idx, valid_idx) in enumerate(kf.split(data)):
    print(f"\n\n===== Fold {fold+1}/{N_SPLITS} =====")

    fold_train = data.iloc[train_idx].copy().reset_index(drop=True)
    fold_valid = data.iloc[valid_idx].copy().reset_index(drop=True)

    # The training split is only rebuilt to obtain the feature dimensions and
    # a degree histogram for constructing the model.
    # NOTE(review): this histogram comes from the labelled fold only, whereas
    # the training cell computed it on labelled + external data. Whether that
    # matters depends on whether PNA's degree statistics are restored from the
    # checkpoint by load_state_dict — confirm for the installed PyG version.
    train_dataset = OnTheFlyOGBCompatibleSmilesDataset(
        root="../train_tmp/MySmilesDataset",
        smiles_list=fold_train['Canonical_Smiles'].to_list(),
        labels_list=fold_train['Inhibition'].to_list(),
    )
    valid_dataset = OnTheFlyOGBCompatibleSmilesDataset(
        root="../valid_tmp/MySmilesDataset",
        smiles_list=fold_valid['Canonical_Smiles'].to_list(),
        labels_list=fold_valid['Inhibition'].to_list(),
    )
    valid_loader = DataLoader(
        valid_dataset,
        shuffle=False,
        batch_size=CFG['BATCH_SIZE'],
    )

    INPUT_DIM = train_dataset.num_node_features # 9
    EDGE_DIM = train_dataset.num_edge_features # 3
    HIDDEN_DIM = 128
    NUM_LAYERS = 5
    deg = cal_degree(train_dataset)

    model = pna4finetuning(
        input_dim=INPUT_DIM,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        deg=deg.to(DEVICE),
        edge_dim=EDGE_DIM,
        model_path='model_weights/pna_128_5_epoch30.pt',
        freeze_encoder=False,
    ).to(DEVICE)

    # Overwrite the freshly built model with this fold's best checkpoint.
    seed = CFG["SEED"]
    model_path = f'./model_weights/model_{seed}_{fold+1}.pt'
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    nrmse, pearson, score = valid_fn(model, valid_loader, is_tta=False)
    print(f"[Fold {fold+1}] No TTA | NRMSE: {nrmse:.4f}, Pearson: {pearson:.4f}, Score: {score:.4f}")

    val_nrmse_list.append(nrmse)
    val_pearson_list.append(pearson)
    val_score_list.append(score)

    # Collect out-of-fold predictions for the pooled "Total Score" below.
    with torch.no_grad():
        for batch in valid_loader:
            batch = batch.to(DEVICE)

            # view(-1) keeps a 1-D tensor even for a batch holding a single
            # graph; squeeze(-1) would collapse it to a scalar, whose
            # .tolist() is a float that cannot be appended with `+=`.
            pred = model(batch).view(-1)         # (batch,)
            target = batch.y.view(-1)            # (batch,)

            valid_preds.extend(pred.cpu().tolist())
            valid_targets.extend(target.cpu().tolist())

    ## Predict the test data with this fold's model
    predictions = predict(model, test_loader, is_tta=False)
    test_preds_list.append(predictions)

avg_nrmse = np.mean(val_nrmse_list, axis=0)
avg_pearson = np.mean(val_pearson_list, axis=0)
avg_score = np.mean(val_score_list, axis=0)

seed = CFG['SEED']
total_score = competition_score(valid_targets, valid_preds)
print(f'[Seed {seed}] Average NRMSE: {avg_nrmse:.4f} Pearson: {avg_pearson:.4f} Score: {avg_score:.4f} | '
      f'Total Score: {total_score:.4f}')



===== Fold 1/3 =====


Processing...
Done!
Processing...
Done!
Processing...
Done!


차수 계산 완료, degree: tensor([   37,  4814, 13145,  8644,   523,     0,     0,     0,     0,     0])
가중치 로드 완료.
사전 학습된 인코더를 동결하지 않습니다 (전체 모델 학습).
[Fold 1] No TTA | NRMSE: 0.2466, Pearson: 0.4287, Score: 0.5911


===== Fold 2/3 =====


Processing...
Done!
Processing...
Done!


차수 계산 완료, degree: tensor([   35,  4855, 13067,  8811,   485,     0,     0,     0,     0,     0])
가중치 로드 완료.
사전 학습된 인코더를 동결하지 않습니다 (전체 모델 학습).
[Fold 2] No TTA | NRMSE: 0.2520, Pearson: 0.4800, Score: 0.6140


===== Fold 3/3 =====


Processing...
Done!
Processing...
Done!


차수 계산 완료, degree: tensor([   36,  4763, 13120,  8597,   500,     0,     0,     0,     0,     0])
가중치 로드 완료.
사전 학습된 인코더를 동결하지 않습니다 (전체 모델 학습).
[Fold 3] No TTA | NRMSE: 0.2460, Pearson: 0.4724, Score: 0.6132
[Seed 2025] Average NRMSE: 0.2482 Pearson: 0.4604 Score: 0.6061 | Total Score: 0.6084


- layernorm + batchnorm: Score: 0.6161 | Total Score: 0.6197
- batchnorm: Score: 0.6223 | Total Score: 0.6228

In [24]:
# np.mean over an empty list yields a NaN scalar, which pandas would broadcast
# into an all-NaN column; fail loudly instead of writing such a file.
if len(test_preds_list) == 0:
    raise RuntimeError(
        "test_preds_list is empty: run the fold evaluation cell before building the submission."
    )

submission = pd.read_csv('../data/sample_submission.csv')
# Average the per-fold test predictions element-wise.
submission['Inhibition'] = np.mean(test_preds_list, axis=0)

seed = CFG['SEED']
save_path_submit = f'../Submission/pna_sigmoid_seed{seed}.csv'
submission.to_csv(save_path_submit, index=False)
print(f'save submission in {save_path_submit}')

In [25]:
# Display the single-seed submission frame.
submission

Unnamed: 0,ID,Inhibition
0,TEST_000,54.837250
1,TEST_001,66.812386
2,TEST_002,34.733204
3,TEST_003,25.802750
4,TEST_004,19.682375
...,...,...
95,TEST_095,25.447227
96,TEST_096,62.866436
97,TEST_097,59.519512
98,TEST_098,95.576195


In [28]:
# Average the per-seed submissions written by the submission cell above.
# Each file only exists after the notebook has been run with CFG['SEED'] set
# to that seed.
seed_list = [42, 2025]

# A comprehension keeps its loop variable local, so the notebook-level `seed`
# used by other cells is no longer overwritten here.
submission_list = [
    pd.read_csv(f'../Submission/pna_sigmoid_seed{ensemble_seed}.csv')['Inhibition']
    for ensemble_seed in seed_list
]

submission = pd.read_csv('../data/sample_submission.csv')
submission['Inhibition'] = np.mean(submission_list, axis=0)

save_path_submit = '../Submission/pna_sigmoid_seed_ensemble.csv'
submission.to_csv(save_path_submit, index=False)
print(f'save submission in {save_path_submit}')

save submission in ../Submission/pna_sigmoid_seed_ensemble.csv


In [29]:
# Display the seed-ensemble submission frame.
submission

Unnamed: 0,ID,Inhibition
0,TEST_000,54.925894
1,TEST_001,62.727604
2,TEST_002,38.337173
3,TEST_003,16.572712
4,TEST_004,17.799762
...,...,...
95,TEST_095,26.743040
96,TEST_096,54.501399
97,TEST_097,60.977345
98,TEST_098,93.625410
