In [1]:
!pip install numpy pathlib pandas gdown lxml hf_transfer scikit-learn peft==0.13.2 transformers==4.46.3

Collecting pathlib
  Downloading pathlib-1.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting lxml
  Downloading lxml-6.0.2-cp311-cp311-win_amd64.whl.metadata (3.7 kB)
Collecting hf_transfer
  Downloading hf_transfer-0.1.9-cp38-abi3-win_amd64.whl.metadata (1.8 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting peft==0.13.2
  Using cached peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting transformers==4.46.3
  Using cached transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting pyyaml (from peft==0.13.2)
  Downloading pyyaml-6.0.3-cp311-cp311-win_amd64.whl.metadata (2.4 kB)
Collecting tqdm (from peft==0.13.2)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting accelerate>=0.21.0 (from peft==0.13.2)
  Downloading accelerate-1.

In [None]:
# Download the shared Google Drive folder with the gdown CLI.
# NOTE(review): later cells read from ./genomic_language_model/ — presumably this
# download creates that folder in the working directory; confirm.
!gdown --folder "https://drive.google.com/drive/folders/1wAS0umYohuR53r4sqroxxiG2ab5p5msn"

In [2]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from peft import LoraConfig, PeftModel, get_peft_model
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers.models.esm.tokenization_esm import EsmTokenizer

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\PNC\anaconda3\envs\GLM_311\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\PNC\anaconda3\envs\GLM_311\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\PNC\anaconda3\envs\GLM_311\Lib\site-packages\ipykernel\kernelapp.py", line 75

In [3]:
def set_seed(seed: int=7) -> None:
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN reproducibility flags.

    Args:
        seed: Integer handed to every random number generator.
    """
    # Each seeding entry point takes the same single integer argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Turn off cuDNN autotuning and request deterministic algorithms.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(7)

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Folder holding the CSV files used by this notebook.
DATA_PATH = "./genomic_language_model/"
df = pd.read_csv(DATA_PATH + "fine_tuning.csv")

# Column views reused by later cells.
ref_seq = df["reference_seq"]
var_seq = df["variant_seq"]
label = df["label"]

# Stratified 90/10 split. random_state is pinned (same value as set_seed(7)) so the
# partition no longer depends on how much of the global NumPy RNG stream was consumed
# before this cell ran; re-running the cell or executing out of order gives the same split.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    shuffle=True,
    stratify=df["label"],
    random_state=7,
)

In [7]:
# Longest sequence, measured in characters, across the reference and variant columns.
max_seq_len = max(column.str.len().max() for column in (ref_seq, var_seq))
print(f"Rows = {len(df):,}, Max Sequence Length = {max_seq_len}")

Rows = 397,182, Max Sequence Length = 512


In [8]:
# Hugging Face Hub identifier of the pretrained checkpoint, and the mini-batch size
# shared by every DataLoader in this notebook.
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species"
BATCH_SIZE = 64

# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run locally —
# confirm the repository is trusted (pinning a revision would be safer).
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Tokenization length cap: the smaller of the tokenizer's own limit and the longest
# training sequence. NOTE(review): max_seq_len counts characters while the tokenizer
# limit presumably counts tokens, so this is only an upper bound — confirm.
MODEL_CAP = tokenizer.model_max_length
MAX_LEN = min(MODEL_CAP, max_seq_len)

backbone = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)

In [9]:
class SiameseDataset(Dataset):
    """Map-style dataset yielding (reference sequence, variant sequence, label) samples.

    Sequences are returned as raw strings; the tokenizer and max_len passed in are
    only stored on the instance and are not applied by this class.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: EsmTokenizer, max_len: int) -> None:
        """Copy the three required columns of `df` into plain Python lists.

        Args:
            df: Frame with "reference_seq", "variant_seq" and "label" columns.
            tokenizer: Stored as `self.tok`.
            max_len: Stored as `self.max_len`.
        """
        self.ref_seq, self.var_seq, self.label = (
            df[column].tolist() for column in ("reference_seq", "variant_seq", "label")
        )
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        """Number of sequence pairs."""
        return len(self.ref_seq)

    def __getitem__(self, idx: int) -> dict:
        """Return the pair at `idx` with its label converted to a float tensor."""
        sample = {"ref_seq": self.ref_seq[idx], "var_seq": self.var_seq[idx]}
        sample["label"] = torch.tensor(self.label[idx], dtype=torch.float)
        return sample
    
def collate_fn(batch: list, tok: EsmTokenizer=tokenizer, max_len: int=MAX_LEN) -> dict:
    """Tokenize a list of SiameseDataset samples into padded tensors.

    Args:
        batch: List of sample dicts with "ref_seq", "var_seq" and "label" keys
            (the DataLoader passes a list, not a tensor).
        tok: Tokenizer used for both sequences; bound to the notebook's `tokenizer`
            at definition time.
        max_len: Truncation length passed to the tokenizer; bound to `MAX_LEN` at
            definition time.

    Returns:
        Dict with "ref_input_ids", "ref_attention_mask", "var_input_ids",
        "var_attention_mask" and "label" (the stacked per-sample label tensors).
    """
    ref_seq = [b["ref_seq"] for b in batch]
    var_seq = [b["var_seq"] for b in batch]
    label = torch.stack([b["label"] for b in batch])

    # Both sides are encoded with identical settings. Each side is padded to the
    # longest sequence of its own call, so the two sides may differ in width.
    encode_kwargs = dict(
        return_tensors="pt",
        padding="longest",
        truncation=True,
        max_length=max_len,
    )

    # Calling the tokenizer directly replaces the deprecated batch_encode_plus().
    ref_enc = tok(ref_seq, **encode_kwargs)
    var_enc = tok(var_seq, **encode_kwargs)

    return {
        "ref_input_ids": ref_enc["input_ids"],
        "ref_attention_mask": ref_enc["attention_mask"],
        "var_input_ids": var_enc["input_ids"],
        "var_attention_mask": var_enc["attention_mask"],
        "label": label
    }

In [10]:
# Wrap both splits in datasets first, then build their loaders.
train_dataset = SiameseDataset(train_df, tokenizer, MAX_LEN)
val_dataset = SiameseDataset(val_df, tokenizer, MAX_LEN)

# Training batches are reshuffled each epoch; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [2]:
class BackboneModel(nn.Module):
    """Sequence encoder: backbone -> masked mean pooling -> linear projection.

    On construction every backbone parameter whose name does not contain "lora_"
    is frozen; the projection layer added here stays trainable.
    """

    def __init__(self, backbone, reconstruction_dim: int=2048) -> None:
        """
        Args:
            backbone: Model exposing `named_parameters()`, `config.hidden_size` and
                a forward call that returns an object with `hidden_states`.
            reconstruction_dim: Output size of the projection layer.
        """
        super().__init__()
        self.backbone = backbone

        # Freeze everything except parameters named like LoRA weights.
        non_lora = (weight for pname, weight in self.backbone.named_parameters()
                    if "lora_" not in pname)
        for weight in non_lora:
            weight.requires_grad = False

        self.reconstruction_layer = nn.Linear(backbone.config.hidden_size, reconstruction_dim)

    def forward(self, input_ids: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
        """Return one projected embedding per input sequence."""
        encoded = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        token_states = encoded.hidden_states[-1]

        # Average over sequence positions, counting only positions where the mask is
        # non-zero; the clamp avoids dividing by zero for a fully masked row.
        keep = attention_mask.unsqueeze(-1)
        token_total = (token_states * keep).sum(dim=1)
        token_count = keep.sum(dim=1).clamp(min=1)

        return self.reconstruction_layer(token_total / token_count)

NameError: name 'nn' is not defined

In [82]:
class SiameseModel(nn.Module):
    """Two-branch wrapper that embeds both inputs with one shared BackboneModel."""

    def __init__(self, backbone, reconstruction_dim: int=2048) -> None:
        """
        Args:
            backbone: Passed straight to BackboneModel.
            reconstruction_dim: Embedding size produced by the shared encoder.
        """
        super().__init__()
        self.encoder = BackboneModel(backbone, reconstruction_dim)

    def forward(self, ref_input_ids, ref_attention_mask,
                      var_input_ids, var_attention_mask):
        """Encode the reference inputs, then the variant inputs, with the same weights.

        Returns:
            Tuple `(ref_emb, var_emb)`.
        """
        branches = (
            (ref_input_ids, ref_attention_mask),
            (var_input_ids, var_attention_mask),
        )
        ref_emb, var_emb = (self.encoder(ids, mask) for ids, mask in branches)
        return ref_emb, var_emb

In [None]:
class ContrastiveLoss(nn.Module):
    """Margin-based contrastive loss on cosine distance.

    With ``dist = 1 - cosine_similarity(emb1, emb2)``:
      * pairs with label 1 are penalised by ``0.5 * dist``;
      * pairs with label 0 are penalised by ``0.5 * relu(margin - dist)``, i.e. only
        while their distance is still below the margin.
    The batch mean is multiplied by ``scale``.
    """

    def __init__(self, margin: float=0.5, scale: float=30.0):
        """
        Args:
            margin: Distance below which label-0 pairs are still penalised.
            scale: Multiplier applied to the mean loss. Defaults to 30, the value
                that used to be hard-coded in `forward`.
        """
        super().__init__()
        self.margin = margin
        self.scale = scale

    def forward(self, emb1, emb2, label):
        """Return the scaled mean contrastive loss as a scalar tensor.

        Args:
            emb1, emb2: Embeddings compared along their last dimension.
            label: Per-pair weights; 1 selects the "pull together" term and 0 the
                "push apart" term.
        """
        dist = 1 - F.cosine_similarity(emb1, emb2, dim=-1)
        loss = 0.5 * label * dist + 0.5 * (1 - label) * F.relu(self.margin - dist)

        return self.scale * loss.mean()

In [None]:
def validating(model, valDL, data_size, loss_fn, device):
    """Compute the average loss of `model` over a validation loader.

    Args:
        model: Callable as `model(ref_ids, ref_mask, var_ids, var_mask)` returning
            a pair of embeddings; it is switched to eval mode.
        valDL: Iterable of batch dicts with "ref_input_ids", "ref_attention_mask",
            "var_input_ids", "var_attention_mask" and "label" tensors.
        data_size: Number of samples used as the divisor for the average.
        loss_fn: Callable `(ref_emb, var_emb, label) -> scalar tensor`.
        device: Device string or torch.device the batches are moved to.

    Returns:
        Sum of per-batch losses weighted by batch size, divided by `data_size`.
    """
    model.eval()

    # Mixed precision follows the `device` argument, not the notebook-level DEVICE
    # global, so the function behaves the same for whichever device the caller passes.
    use_amp = torch.device(device).type == "cuda"

    loss_total = 0.0

    with torch.no_grad():
        for batch in valDL:
            ref_ids = batch["ref_input_ids"].to(device)
            ref_attn_mask = batch["ref_attention_mask"].to(device)
            var_ids = batch["var_input_ids"].to(device)
            var_attn_mask = batch["var_attention_mask"].to(device)
            label = batch["label"].to(device)

            batch_size = len(ref_ids)

            # autocast is only ever enabled when the target device is CUDA.
            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_amp):
                ref_emb, var_emb = model(ref_ids, ref_attn_mask, var_ids, var_attn_mask)
                loss = loss_fn(ref_emb, var_emb, label)

            # Weight by batch size so a smaller final batch does not skew the mean.
            loss_total += loss.item() * batch_size

    return loss_total / data_size


In [None]:
def training(model, trainDL, valDL, optimizer, epoch,
            data_size, val_data_size, loss_fn,
            scheduler, device, lora):
    """Fine-tune `model`, keeping the checkpoint with the best validation loss.

    After every epoch the validation loss is computed with `validating`, passed to
    `scheduler.step`, and used both for checkpoint selection and for early stopping.
    Checkpoints are written under ./saved_models: the LoRA adapter through
    `lora.save_pretrained` and the projection layer through `torch.save`.

    Args:
        model: Siamese model exposing `encoder.reconstruction_layer`.
        trainDL: Training loader yielding batch dicts (see `validating` for the keys).
        valDL: Validation loader.
        optimizer: Optimizer over the trainable parameters.
        epoch: Maximum number of epochs.
        data_size: Number of training samples (divisor of the epoch loss).
        val_data_size: Number of validation samples.
        loss_fn: Callable `(ref_emb, var_emb, label) -> scalar tensor`.
        scheduler: Scheduler whose `step` accepts the validation loss.
        device: Device string or torch.device the batches are moved to.
        lora: PEFT-wrapped backbone; only its `save_pretrained` is used here.

    Returns:
        `[train_losses, val_losses]`, one entry per completed epoch.
    """
    SAVE_PATH = "./saved_models"
    os.makedirs(SAVE_PATH, exist_ok=True)

    # The checkpoint targets never change, so build them once instead of every epoch.
    SAVE_WEIGHT = os.path.join(SAVE_PATH, 'model_weights.pth')
    SAVE_LORA_WEIGHT = os.path.join(SAVE_PATH, "best_lora_weights")

    BREAK_CNT_LOSS = 0          # epochs since the validation loss last improved
    LIMIT_VALUE = 5             # stop once the counter exceeds this
    best_val_loss = float("inf")

    LOSS_HISTORY = [[], []]

    # Mixed precision follows the `device` argument rather than the DEVICE global.
    use_amp = torch.device(device).type == "cuda"

    # float16 autocast is paired with a gradient scaler so small gradients do not
    # underflow; with enabled=False the scaler is a transparent pass-through.
    if hasattr(torch, "amp") and hasattr(torch.amp, "GradScaler"):
        scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    else:
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for count in range(1, epoch + 1):
        model.train()

        loss_total = 0.0

        for batch in trainDL:
            ref_ids = batch["ref_input_ids"].to(device)
            ref_attn_mask = batch["ref_attention_mask"].to(device)
            var_ids = batch["var_input_ids"].to(device)
            var_attn_mask = batch["var_attention_mask"].to(device)
            label = batch["label"].to(device)

            batch_size = len(ref_ids)

            optimizer.zero_grad()

            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_amp):
                ref_emb, var_emb = model(ref_ids, ref_attn_mask, var_ids, var_attn_mask)
                loss = loss_fn(ref_emb, var_emb, label)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Weight by batch size so a smaller final batch does not skew the mean.
            loss_total += loss.item() * batch_size

        val_loss = validating(model, valDL, val_data_size, loss_fn, device)

        LOSS_HISTORY[0].append(loss_total / data_size)
        LOSS_HISTORY[1].append(val_loss)

        print(f"[{count} / {epoch}]\n - TRAIN LOSS : {LOSS_HISTORY[0][-1]}")
        print(f"VAL LOSS : {LOSS_HISTORY[1][-1]}")

        scheduler.step(val_loss)

        # Select the checkpoint on held-out loss. Training loss keeps falling while a
        # model overfits, so it can neither pick a best epoch nor trigger early stopping.
        # The first epoch is always saved so a checkpoint exists even if the loss is NaN.
        if count == 1 or val_loss < best_val_loss:
            best_val_loss = val_loss
            BREAK_CNT_LOSS = 0
            lora.save_pretrained(SAVE_LORA_WEIGHT)
            torch.save(model.encoder.reconstruction_layer.state_dict(), SAVE_WEIGHT)
        else:
            BREAK_CNT_LOSS += 1

        if BREAK_CNT_LOSS > LIMIT_VALUE:
            print(f"성능 및 손실 개선이 없어서 {count} EPOCH에 학습 중단")
            break

    return LOSS_HISTORY
    

In [None]:
# Rank-4 LoRA adapters (alpha 8, no dropout, no bias training) on the modules named
# "query", "key" and "value".
# NOTE(review): task_type is "CAUSAL_LM" although the backbone is loaded with
# AutoModelForMaskedLM and used as an encoder — confirm this is intended.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query", "key", "value"],
    r=4,
    lora_alpha=8,
    lora_dropout=0,
    bias="none",
)

# Wrap the pretrained backbone with the adapter configuration above.
lora = get_peft_model(backbone, lora_config)

In [None]:
# Optimisation hyper-parameters and target device.
LR = 1e-4
EPOCH = 100
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = SiameseModel(lora).to(DEVICE)

# Only tensors that still require gradients are handed to the optimizer.
params = [weight for weight in model.parameters() if weight.requires_grad]
optimizer = optim.AdamW(params, lr=LR)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=3)

loss_fn = ContrastiveLoss()

# Sample counts used to turn summed batch losses into per-sample averages.
data_size, val_data_size = len(train_dataset), len(val_dataset)

In [None]:
# Sanity check on the LoRA-wrapped backbone held by the encoder: report its trainable
# parameters (PEFT helper) before starting the training run.
model.encoder.backbone.print_trainable_parameters()

In [None]:
# Launch fine-tuning; `loss` receives the [train_losses, val_losses] history.
loss = training(
    model=model,
    trainDL=train_loader,
    valDL=val_loader,
    optimizer=optimizer,
    epoch=EPOCH,
    data_size=data_size,
    val_data_size=val_data_size,
    loss_fn=loss_fn,
    scheduler=scheduler,
    device=DEVICE,
    lora=lora,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 14.40 GiB is allocated by PyTorch, and 203.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
# Inference input. Note that `df` and `max_seq_len` are rebound here and now describe
# the test set rather than the fine-tuning set.
data_path = './genomic_language_model/'
df = pd.read_csv(f"{data_path}test.csv")

# Truncation length for inference: the tokenizer limit or the longest test sequence
# (in characters), whichever is smaller.
max_seq_len = df["seq"].str.len().max()
EFFECTIVE_MAX_LEN = min(MODEL_CAP, max_seq_len)

EFFECTIVE_MAX_LEN

NameError: name 'pd' is not defined

In [None]:
class SeqDataset(Dataset):
    """Map-style dataset of (ID, sequence) records for inference.

    Sequences are returned as raw strings; the tokenizer and max_len passed in are
    only stored on the instance and are not applied by this class.
    """

    def __init__(self, df, tokenizer, max_len):
        """Copy the "ID" and "seq" columns of `df` into plain Python lists."""
        self.ids, self.seqs = (df[column].tolist() for column in ("ID", "seq"))
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of records."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return the record at `idx` as a dict with "ID" and "seq" keys."""
        return dict(ID=self.ids[idx], seq=self.seqs[idx])

def collate_fn(batch, tok=tokenizer, max_len=EFFECTIVE_MAX_LEN):
    """Tokenize a list of SeqDataset records into one padded batch.

    This definition reuses the name of the training-time collate function defined
    earlier in the notebook; loaders created before this cell keep the function
    object they were given.

    Args:
        batch: List of dicts with "ID" and "seq" keys.
        tok: Tokenizer; bound to the notebook's `tokenizer` at definition time.
        max_len: Truncation length; bound to `EFFECTIVE_MAX_LEN` at definition time.

    Returns:
        Dict with "ids" (list of record IDs in batch order), "input_ids" and
        "attention_mask" tensors.
    """
    ids  = [b["ID"] for b in batch]
    seqs = [b["seq"] for b in batch]

    # Calling the tokenizer directly replaces the deprecated batch_encode_plus().
    enc  = tok(
        seqs,
        return_tensors="pt",
        padding="longest",
        truncation=True,
        max_length=max_len
    )
    # attention_mask is 0 at pad tokens
    return {
        "ids": ids,
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"]
    }

In [None]:
# Order is preserved (shuffle=False) so embeddings line up with the collected IDs.
dataset = SeqDataset(df, tokenizer, EFFECTIVE_MAX_LEN)
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)
print("✅ Dataloader ready.")

In [None]:
# Load the saved adapter onto a freshly loaded base model. The notebook-level
# `backbone` was already wrapped by get_peft_model() for training, so reusing it here
# would load the adapter on top of whatever adapter state the kernel still holds.
inference_backbone = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
lora = PeftModel.from_pretrained(
    inference_backbone,
    "./saved_models/best_lora_weights"
)

lora_model = BackboneModel(lora).to(DEVICE)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only machine.
lora_model.reconstruction_layer.load_state_dict(
    torch.load("./saved_models/model_weights.pth", weights_only=True, map_location=DEVICE)
)

all_ids = []
all_embs = []
use_amp = (DEVICE == "cuda")

lora_model.eval()
with torch.no_grad():
    for batch in loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attn_mask = batch["attention_mask"].to(DEVICE)

        # autocast is only enabled when running on CUDA.
        with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_amp):
            outs = lora_model(
                input_ids,
                attention_mask=attn_mask
            )

        # Keep IDs and embeddings in the same order so they can be joined later.
        all_ids.extend(batch["ids"])
        all_embs.append(outs.detach().cpu())

emb = torch.cat(all_embs, dim=0).float()        # (N, H)
N, H = emb.shape
assert len(all_ids) == N, "every embedding row must have a matching ID"
print(f"✅ Embedding shape = {N} x {H}")

In [None]:
sample_submission = pd.read_csv(data_path + 'sample_submission.csv')

emb_np = emb.numpy()
emb_cols = [f"emb_{i:04d}" for i in range(emb_np.shape[1])]
emb_df = pd.DataFrame(emb_np, columns=emb_cols)

# Key every embedding by the ID it was computed for (collected during inference), then
# lay the rows out in sample_submission's order. Pasting sample_submission['ID'] next to
# the embeddings by position would silently mislabel rows if the two files were ordered
# differently; .loc raises a KeyError if a requested ID has no embedding.
emb_by_id = emb_df.set_index(pd.Index(all_ids, name="ID"))
submission = emb_by_id.loc[sample_submission['ID']].rename_axis("ID").reset_index()
assert len(submission) == len(sample_submission), "duplicate IDs would change the row count"

submission.to_csv('lora_submission.csv', index=False)