### Master testing notebook
Notebook for computing test-set evaluation metrics for the trained models (one model per run, selected via `MULTIMOLECULE_MODEL`).

In [13]:
%%capture
!pip install datasets evaluate multimolecule==0.0.5

In [14]:
import os
import pandas as pd
import torch
from transformers import (
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from google.colab import drive

import matplotlib.pyplot as plt

In [15]:
drive.mount('/content/drive')

WORKING_DIRECTORY = '/content/drive/MyDrive/Machine_Learning_(CS-433)/Project_2'
DATASET_PATH = 'data/dataset.txt'

%cd {WORKING_DIRECTORY}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Machine_Learning_(CS-433)/Project_2


In [16]:
from BP_LM.scripts.data_preprocessing import *
from BP_LM.scripts.trainer_datasets_creation import create_dataset
from BP_LM.scripts.compute_metrics import compute_metrics
from BP_LM.scripts.model_choice import set_multimolecule_model

# Set WANDB_MODE to "disabled" so Weights & Biases logging stays off.
os.environ["WANDB_MODE"] = "disabled"

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [17]:
# Evaluation configuration
# Model to test. Available models are:
# rnafm, rnamsm, ernierna, utrlm-te_el, splicebert, rnabert.
MULTIMOLECULE_MODEL = "rnabert"

BATCH_SIZE = 16  # Used as the per-device evaluation batch size
SEED = 32  # Forwarded to create_dataset
SAMPLE_N_DATAPOINTS = None  # Set to None to use the full dataset

In [None]:
# Initialize the selected multimolecule model in testing mode, together with its
# tokenizer, maximum input size and `ideal_threshold`.
# Requires a saved model at
# f"{MULTIMOLECULE_MODEL}-finetuned-secondary-structure/best_model".
(
    trained_model,
    tokenizer,
    MODEL_MAX_INPUT_SIZE,
    ideal_threshold,
) = set_multimolecule_model(MULTIMOLECULE_MODEL, for_testing=True)

# Read the tab-separated dataset file and build the train/validation/test datasets.
df = pd.read_csv(DATASET_PATH, sep='\t')
train_dataset, val_dataset, test_dataset = create_dataset(
    df,
    tokenizer,
    trained_model,
    MODEL_MAX_INPUT_SIZE,
    SEED,
    SAMPLE_N_DATAPOINTS,
)

# Data collator handed to the Trainer for token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

Chromosomes in train set: {'chr3', 'chr4', 'chrX', 'chr13', 'chr18', 'chr16', 'chr20', 'chr12', 'chr1', 'chr14', 'chr7', 'chr17', 'chrY', 'chr19', 'chr21', 'chr2', 'chr15', 'chr22', 'chr5', 'chr6'}
Chromosomes in validation set: {'chr10', 'chr9'}
Chromosomes in test set: {'chr11', 'chr8'}

Total data points: 177980
Train set contains 147559 data points (82.91%)
Validation set contains 14434 data points (8.11%)
Test set contains 15987 data points (8.98%)


In [None]:
def metric(eval_prediction):
    """Metric callback given to the Trainer.

    Forwards the Trainer's evaluation output to ``compute_metrics`` together
    with the selected model name and ``ideal_threshold``.
    """
    # NOTE(review): the positional `False` is an option of compute_metrics whose
    # meaning is not visible from this notebook — confirm it in
    # BP_LM/scripts/compute_metrics.py.
    return compute_metrics(eval_prediction, MULTIMOLECULE_MODEL, False, ideal_threshold)


# Evaluation-only arguments. `no_cuda` is intentionally not passed: it is a
# deprecated flag and its default (False) already allows GPU use.
testing_args = TrainingArguments(
    output_dir='/results',
    per_device_eval_batch_size=BATCH_SIZE,
    do_eval=True,
)

# The Trainer is used purely as an evaluation harness over the test split.
tester = Trainer(
    model=trained_model,
    args=testing_args,
    eval_dataset=test_dataset,
    compute_metrics=metric,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run evaluation on `eval_dataset` and collect the resulting metrics dict.
metrics = tester.evaluate()

print("Evaluation Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value}")