# General Notebook on fine-tuning branch point prediction using any of MultiMolecule models

Any RNA model can be selected from the [MultiMolecule](https://multimolecule.danling.org/models/) website. Simple change the `MULTIMOLECULE_MODEL` variable in the cell below, and the the two cells under the tokenizer "Load the desired model and tokenizer" section.

In [12]:
# GLOBAL VARIABLES
WORKING_DIRECTORY = '/content/drive/MyDrive/epfl_ml_project'
DATASET_PATH = 'data/fresh_dataset.txt'
MODEL_MAX_INPUT_SIZE = 1024
MULTIMOLECULE_MODEL = "ernierna"
SAMPLE_N_DATAPOINTS = 20000 # Sample a small subset of data for testing purposes. Set to None if training on full dataset
SEED = 32

In [13]:
%%capture
!pip install datasets evaluate multimolecule==0.0.5

In [14]:
import pandas as pd
import torch
from transformers import (
     DataCollatorForTokenClassification,
     TrainingArguments,
     Trainer
)

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [16]:
from google.colab import drive
drive.mount('/content/drive')

# Change working directory to Project folder, you may change this as needed
%cd {WORKING_DIRECTORY}

from BP_LM.scripts.data_preprocessing import *
from BP_LM.scripts.trainer_datasets_creation import *
from BP_LM.scripts.compute_metrics import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/epfl_ml_project


## Load the desired model and tokenizer

In [17]:
# Change this import depending on the model
from multimolecule import RnaTokenizer, ErnieRnaForTokenPrediction, ErnieRnaConfig

tokenizer = RnaTokenizer.from_pretrained(f'multimolecule/{MULTIMOLECULE_MODEL}')
# Change line below depending on what model we want
config = ErnieRnaConfig()
config.problem_type = "single_label_classification"
config.num_labels = 2
model = ErnieRnaForTokenPrediction.from_pretrained(f'multimolecule/{MULTIMOLECULE_MODEL}', config=config)

Some weights of ErnieRnaForTokenPrediction were not initialized from the model checkpoint at multimolecule/ernierna and are newly initialized: ['token_head.decoder.bias', 'token_head.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Set up the collator
data_collator = DataCollatorForTokenClassification(tokenizer)

## Create dataset objects for training

In [19]:
# Load dataset
df = pd.read_csv(DATASET_PATH, sep='\t')

train_dataset, val_dataset, test_dataset = create_dataset(df, tokenizer, model, MODEL_MAX_INPUT_SIZE, SEED, SAMPLE_N_DATAPOINTS)

Chromosomes in train set: {'chr1', 'chr5', 'chr13', 'chrY', 'chrX', 'chr3', 'chr4', 'chr21', 'chr2', 'chr15', 'chr22', 'chr16', 'chr14', 'chr18', 'chr6', 'chr7', 'chr17', 'chr12', 'chr19', 'chr20'}
Chromosomes in validation set: {'chr9', 'chr10'}
Chromosomes in test set: {'chr11', 'chr8'}

Total data points: 2000
Train set contains 1663 data points (83.15%)
Validation set contains 156 data points (7.80%)
Test set contains 181 data points (9.05%)


## Train model

In [20]:
# Do not save to W&B
import os
os.environ["WANDB_MODE"] = "disabled"

In [21]:
# Define model training parameters
batch_size = 16

args = TrainingArguments(
    f"multimolecule-{MULTIMOLECULE_MODEL}-finetuned-secondary-structure",
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=10000,
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    optim = "adamw_torch",
    weight_decay=0.001,
    load_best_model_at_end=True,
    metric_for_best_model="F1",
)

In [22]:
metrics = lambda x: compute_metrics(x, "test_metrics")
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss,Validation Loss


TrainOutput(global_step=312, training_loss=0.013283839592566857, metrics={'train_runtime': 308.2708, 'train_samples_per_second': 16.184, 'train_steps_per_second': 1.012, 'total_flos': 2630452641008184.0, 'train_loss': 0.013283839592566857, 'epoch': 3.0})

In [23]:
trained_model = AutoModelForTokenClassification.from_pretrained(f"multimolecule-{MULTIMOLECULE_MODEL}-finetuned-secondary-structure/checkpoint-777") #make sure you are loading the right checkpoint
tokenizer = AutoTokenizer.from_pretrained(f"multimolecule-{MULTIMOLECULE_MODEL}-finetuned-secondary-structure/checkpoint-777")
data_collator = DataCollatorForTokenClassification(tokenizer)

NameError: name 'AutoModelForTokenClassification' is not defined

In [None]:
metric = lambda x: compute_metrics_test(x, "test_metrics", 0.001895)

testing_args = TrainingArguments(
    output_dir='/results',
    per_device_eval_batch_size=batch_size,
    do_eval=True,
    no_cuda=False,
)

tester = Trainer(
    model=trained_model,
    args=testing_args,
    eval_dataset=test_dataset,
    compute_metrics=metric,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

metrics = tester.evaluate()

print("Evaluation Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value}")