# Prepare finetuning data

In [None]:
# `unirna_tf` is not referenced by name in the visible code; presumably it is
# imported so that transformers can resolve the UniRNA classes -- confirm
# before removing it.
import unirna_tf
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

# The original setup pins CUDA device 1. Only do so when that device actually
# exists: an unconditional set_device(1) raises on CPU-only or single-GPU
# machines, while the training cell further down explicitly falls back to CPU.
PREFERRED_GPU_INDEX = 1
if torch.cuda.is_available() and torch.cuda.device_count() > PREFERRED_GPU_INDEX:
    torch.cuda.set_device(PREFERRED_GPU_INDEX)

# One dict per CSV row, keyed by column name.
csv_dataset = pd.read_csv("train.csv").to_dict(orient="records")

# NOTE(review): the tokenizer comes from the L16_E1024 checkpoint while the
# backbone loaded later in this file uses the L8_E512 checkpoint. Presumably
# both share one vocabulary -- confirm, or load both from the same directory.
tokenizer = AutoTokenizer.from_pretrained("../weights/unirna_L16_E1024_DPRNA500M_STEP400K")


class finetune_dataset(Dataset):
    """Map-style dataset that tokenizes one record per item.

    Each record is expected to be a mapping with a ``"utr"`` entry (the text
    handed to the tokenizer) and a ``"scaled_rl"`` entry (returned unchanged
    as the regression label).

    Args:
        dataset: Indexable collection of records supporting ``len()``.
        tokenizer: Callable accepting the sequence plus the keyword arguments
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Length every sequence is padded/truncated to. Defaults to
            1024, the value that was previously hard-coded.
    """

    def __init__(self, dataset, tokenizer, max_length=1024):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return model inputs plus its label."""
        record = self.dataset[idx]
        encoded = self.tokenizer(
            record["utr"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Assumes the tokenizer returns tensors with a leading batch axis of
        # size 1 for a single sequence. Drop exactly that axis: a bare
        # squeeze() would also collapse a length-1 sequence axis.
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": record["scaled_rl"],
        }

In [None]:
# Wrap the CSV records in the torch dataset, then split them 80/20 at random.
# The split is not seeded here, so it differs between runs.
dataset = finetune_dataset(csv_dataset, tokenizer)
n_train = int(len(dataset) * 0.8)
n_test = len(dataset) - n_train  # remainder, so the two sizes always sum to len(dataset)
trainset, testset = torch.utils.data.random_split(dataset, [n_train, n_test])

# Prepare pretrained models

In [None]:
# Load the pretrained backbone from a local checkpoint directory.
# NOTE(review): this is the L8_E512 checkpoint, whereas the tokenizer earlier
# in this file is loaded from the L16_E1024 checkpoint -- confirm that both
# share one vocabulary, or load them from the same directory.
model = AutoModel.from_pretrained("../weights/unirna_L8_E512_STEP290K_DPRNA100M")

In [None]:
import torch.nn as nn


class CustomLanguageModelHead(nn.Module):
    """Backbone followed by one linear layer that emits a single value per sequence.

    Args:
        model_backbone: Callable taking ``input_ids`` and an ``attention_mask``
            keyword and returning a mapping that contains ``"pooler_output"``.
        hidden_size: Width of the backbone's pooled output, i.e. the input
            width of the linear decoder.
    """

    def __init__(self, model_backbone, hidden_size):
        super().__init__()
        self.bert_base = model_backbone
        self.decoder = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Return the decoder output for a batch; extra keyword arguments are ignored."""
        backbone_out = self.bert_base(input_ids, attention_mask=attention_mask)
        # Per the original author's note, the pooled output (CLS token) stands
        # in for the whole sequence.
        pooled = backbone_out["pooler_output"]
        return self.decoder(pooled)

In [None]:
# Size the head from the loaded backbone instead of a hand-edited constant, so
# switching checkpoints cannot leave a stale width behind.
# Reference widths: L8 -> 512, L12 -> 768, L16 -> 1024, L24 -> 1280.
# NOTE(review): assumes `model.config.hidden_size` equals the width of
# `pooler_output`; falls back to the previous hard-coded 512 when the config
# does not expose it.
hidden_size = getattr(getattr(model, "config", None), "hidden_size", 512)
model_with_lm_head = CustomLanguageModelHead(model, hidden_size)

# Model training

In [None]:
import torch
import tqdm
from torch.utils.data import DataLoader
from torch.nn import MSELoss

# Run on CUDA when it is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Batches of 8 training items, reshuffled every epoch.
train_loader = DataLoader(dataset=trainset, batch_size=8, shuffle=True)

# Adam over all parameters of `model_with_lm_head`.
optimizer = torch.optim.Adam(params=model_with_lm_head.parameters(), lr=1e-4)

In [None]:
# Fine-tune the backbone plus regression head for three epochs with MSE loss.
model_with_lm_head.train()
model_with_lm_head.to(device)

# The loss module holds no state, so build it once rather than once per epoch.
mseloss = MSELoss()

for epoch in range(3):
    tqdmer = tqdm.tqdm(train_loader, total=len(train_loader))
    for data in tqdmer:
        label = data["label"].to(device, dtype=torch.float)
        input_ids = data["input_ids"].to(device, dtype=torch.long)
        attention_mask = data["attention_mask"].to(device, dtype=torch.long)
        outputs = model_with_lm_head(input_ids=input_ids, attention_mask=attention_mask)

        # The head ends in nn.Linear(hidden_size, 1), so `outputs` is
        # (batch, 1) while `label` is (batch,). Passed as-is, MSELoss
        # broadcasts the pair to (batch, batch) and compares every prediction
        # with every label. Drop the trailing axis so they pair element-wise.
        predictions = outputs.squeeze(-1)
        loss = mseloss(predictions, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report a plain float; passing the tensor would print its repr
        # (including grad_fn) in the progress bar.
        tqdmer.set_postfix(epoch=epoch, loss=loss.item())