In [None]:
%%capture
!pip install datasets evaluate multimolecule==0.0.5

In [11]:
import os
import pandas as pd
import torch
from transformers import (
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from google.colab import drive

In [None]:
drive.mount('/content/drive')

WORKING_DIRECTORY = '/content/drive/MyDrive/Machine Learning (CS-433)/Project 2'
DATASET_PATH = 'data/dataset.txt'

%cd {WORKING_DIRECTORY}

In [None]:
from BP_LM.scripts.data_preprocessing import *
from BP_LM.scripts.trainer_datasets_creation import create_dataset
from BP_LM.scripts.compute_metrics import compute_metrics
from BP_LM.scripts.model_choice import set_multimolecule_model

# Switch Weights & Biases off through its environment variable.
os.environ["WANDB_MODE"] = "disabled"

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [14]:
# --- Model and dataset selection ---------------------------------------------
# Available model keys: rnafm, rnamsm, ernierna, utrlm-te_el, splicebert, rnabert.
MULTIMOLECULE_MODEL = "splicebert"
# Number of datapoints to sample; None means the full dataset is used.
SAMPLE_N_DATAPOINTS = 100
SEED = 32

# --- Training hyperparameters --------------------------------------------------
TRAIN_EPOCHS = 3
BATCH_SIZE = 4
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 0.001
OPTIMIZER = "adamw_torch"

# Output directory name, derived from the selected model key.
MODEL_OUTPUT_DIRECTORY = f"multimolecule-{MULTIMOLECULE_MODEL}-finetuned-secondary-structure"

In [None]:
# Build the selected multimolecule model; the helper also returns the matching
# tokenizer and a third value kept as MODEL_MAX_INPUT_SIZE.
model, tokenizer, MODEL_MAX_INPUT_SIZE = set_multimolecule_model(MULTIMOLECULE_MODEL)

# Read the tab-separated dataset file and hand it to create_dataset, whose
# result is unpacked into the train / validation / test datasets.
df = pd.read_csv(DATASET_PATH, sep="\t")
train_dataset, val_dataset, test_dataset = create_dataset(
    df,
    tokenizer,
    model,
    MODEL_MAX_INPUT_SIZE,
    SEED,
    SAMPLE_N_DATAPOINTS,
)

# Token-classification collator, passed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Training configuration for the Trainer
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIRECTORY,
    # Optimisation settings (values come from the configuration constants)
    optim=OPTIMIZER,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluation and checkpointing share the same per-epoch schedule, and the
    # best checkpoint according to the "F1" metric is reloaded at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="F1",
    # Currently disabled: eval_accumulation_steps=10,
)

# Define metrics function
def metrics(eval_pred):
    """Metric callback handed to the Trainer.

    Forwards the evaluation output it receives to ``compute_metrics`` with the
    fixed second argument ``"test_metrics"`` and returns that result unchanged.
    """
    return compute_metrics(eval_pred, "test_metrics")

# Assemble the Trainer from the model, its configuration, the data and helpers
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=metrics,
)

# Run the fine-tuning; the returned value is shown as the cell output
trainer.train()