In [1]:
# GLOBAL VARIABLES
WORKING_DIRECTORY = '/content/drive/MyDrive/epfl_ml_project'
DATASET_PATH = 'dataset/dataset.txt'
MODEL_MAX_INPUT_SIZE = 1024
MODEL_PATH = "models/SpliceBERT.1024nt"  # set the path to the folder of pre-trained SpliceBERT
SAMPLE_N_DATAPOINTS = 5000 # Sample a small subset of data for testing purposes
SEED = 66 # 42

In [2]:
# Install datasets as it is not already installed on colab
%%capture
!pip install datasets

In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score
import torch.nn.functional as F

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Change working directory to Project folder, you may change this as needed
%cd {WORKING_DIRECTORY}

from BP_LM.data_preprocessing import *

Mounted at /content/drive
/content/drive/MyDrive/epfl_ml_project


In [6]:
# Load dataset
df = pd.read_csv(DATASET_PATH, sep='\t')

# Calculate BP_POS_WITHIN_STRAND
df['BP_POS_WITHIN_STRAND'] = df['IVS_SIZE'] + df['BP_ACC_DIST']
print(df.shape)


(177980, 13)


In [7]:
# Remove all data points where the BP is farther than
df = df[df['IVS_SIZE'] - df['BP_POS_WITHIN_STRAND'] <= MODEL_MAX_INPUT_SIZE]
print(df.shape)

if SAMPLE_N_DATAPOINTS:
    df = df.sample(n = SAMPLE_N_DATAPOINTS, random_state=SEED)


(177980, 13)


In [8]:
# Create a split based on chromosome types (Alis idea)
train_chrs = ["chr1", "chr2", "chr3", "chr4",
              "chr5","chr10",
              "chr11", "chr12", "chr13", "chr14",
              "chr15", "chr16", "chr17", "chr18",
              "chr19", "chr22",
              "chrX", "chrY"]

# Keep chr6 and chr7 in train if we want a 90/10/10 train/val/test split
test_chrs = ["chr8", "chr20", "chr6"]
val_chrs = ["chr9", "chr21", "chr7"]

train_df, test_df, val_df = split_train_test_on_chr(df, train_chrs, val_chrs, test_chrs, shuffle=True)

Chromosomes in train set: {'chr2', 'chr19', 'chr11', 'chr5', 'chr12', 'chrY', 'chr14', 'chr15', 'chr17', 'chr1', 'chr13', 'chr18', 'chr22', 'chr10', 'chr3', 'chr4', 'chrX', 'chr16'}
Chromosomes in validation set: {'chr21', 'chr9', 'chr7'}
Chromosomes in test set: {'chr20', 'chr8', 'chr6'}

Total data points: 1000
Train set contains 815 data points (81.50%)
Validation set contains 88 data points (8.80%)
Test set contains 97 data points (9.70%)


In [9]:
train_seqs, train_labels = extract_intron_seq_and_labels(train_df, max_model_input_size=MODEL_MAX_INPUT_SIZE, truncate=True)
test_seqs, test_labels = extract_intron_seq_and_labels(test_df, max_model_input_size=MODEL_MAX_INPUT_SIZE, truncate=True)
val_seqs, val_labels = extract_intron_seq_and_labels(val_df, max_model_input_size=MODEL_MAX_INPUT_SIZE, truncate=True)

In [10]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# finetuning SpliceBERT for token classification tasks
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH, num_labels = 2) # We want binary classification on tokens so num_labels = 2

Some weights of BertForTokenClassification were not initialized from the model checkpoint at models/SpliceBERT.1024nt and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now we do it for our data

In [11]:
# Tokenize the input data
train_seqs = [' '.join(list(seq.upper().replace("U", "T"))) for seq in train_seqs] # There shouldn't be any "U"s in the training data
test_seqs = [' '.join(list(seq.upper().replace("U", "T"))) for seq in test_seqs]
val_seqs = [' '.join(list(seq.upper().replace("U", "T"))) for seq in val_seqs]

In [12]:
def pad_labels(labels, max_length, pad_label=-100):
    """
    Pads labels with -100 which is apparenty standard in HuggingFace
    """
    padded_labels = []
    for label in labels:
        if len(label) < max_length:
            padded = label + [pad_label] * (max_length - len(label))
        else:
            padded = label[:max_length]
        padded_labels.append(padded)
    return padded_labels

max_length = MODEL_MAX_INPUT_SIZE # Ensure this matches the tokenizer's max_length

train_labels_padded = pad_labels(train_labels, max_length)
test_labels_padded = pad_labels(test_labels, max_length)
val_labels_padded = pad_labels(val_labels, max_length)

train_ids = tokenizer(train_seqs, padding='max_length', padding_side='left', max_length=max_length)
test_ids = tokenizer(test_seqs, padding='max_length', padding_side='left', max_length=max_length)
val_ids = tokenizer(val_seqs, padding='max_length', padding_side='left', max_length=max_length)

# Create Datasets
train_dataset = Dataset.from_dict(train_ids)
train_dataset = train_dataset.add_column("labels", train_labels_padded)

test_dataset = Dataset.from_dict(test_ids)
test_dataset = test_dataset.add_column("labels", test_labels_padded)

val_dataset = Dataset.from_dict(val_ids)
val_dataset = val_dataset.add_column("labels", val_labels_padded)

# Set up the collator (I think it does padding)
data_collator = DataCollatorForTokenClassification(tokenizer)

In [13]:
# Do not save to W&B
import os
os.environ["WANDB_MODE"] = "disabled"

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

model_name = MODEL_PATH.split("/")[-1]
batch_size = 8

def compute_metrics(pred):
    predictions, labels = pred
    predictions = predictions[0]

    predictions = np.array(predictions)
    labels = np.array(labels)

    preds = np.argmax(predictions, axis=-1)

    sequence_matches = 0
    total_sequences = 0

    # for label, prediction in zip(labels, preds):
    #     nonpadded_indices = label != -100 # Only consider non-padded tokens
    #     nonpadded_labels = label[nonpadded_indices]
    #     preds = prediction[nonpadded_indices]


    #     if np.array_equal(nonpadded_labels, preds): # If the entire label matches, count it as correct
    #         sequence_matches += 1

    #     total_sequences += 1

    # acc = sequence_matches / total_sequences if total_sequences > 0 else 0

    # return {"accuracy": acc}

    # Variables for token-level metrics
    all_labels = []
    all_preds = []

    # Iterate over each sequence
    for label, prediction in zip(labels, preds):
        # Ignore padded tokens
        nonpadded_indices = label != -100
        nonpadded_labels = label[nonpadded_indices]
        nonpadded_preds = prediction[nonpadded_indices]

        # Sequence-level accuracy
        if np.array_equal(nonpadded_labels, nonpadded_preds):  # Entire sequence matches
            sequence_matches += 1
        total_sequences += 1

        # Token-level metrics
        all_labels.extend(nonpadded_labels.tolist())
        all_preds.extend(nonpadded_preds.tolist())

    # Sequence-level accuracy
    sequence_accuracy = sequence_matches / total_sequences if total_sequences > 0 else 0

    # Token-level metrics
    precision = precision_score(all_labels, all_preds, average="binary", zero_division=0)
    recall = recall_score(all_labels, all_preds, average="binary", zero_division=0)
    f1 = f1_score(all_labels, all_preds, average="binary", zero_division=0)

    return {
        "sequence_accuracy": sequence_accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


args = TrainingArguments(
    f"{model_name}-finetuned-secondary-structure",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.001,
    metric_for_best_model="eval_sequence_accuracy",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset, # SHOULD BE VAL DATASET
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

# Evaluate the model
evaluation_results = trainer.evaluate()
print(evaluation_results)


Epoch,Training Loss,Validation Loss,Sequence Accuracy,Precision,Recall,F1
1,No log,0.009703,0.0,0.0,0.0,0.0
2,No log,0.00639,0.0,0.0,0.0,0.0
3,No log,0.009149,0.0,0.0,0.0,0.0
4,No log,0.006545,0.102273,0.298013,0.511364,0.376569
5,0.008300,0.006165,0.0,0.0,0.0,0.0


{'eval_loss': 0.006544713396579027, 'eval_sequence_accuracy': 0.10227272727272728, 'eval_precision': 0.2980132450331126, 'eval_recall': 0.5113636363636364, 'eval_f1': 0.37656903765690375, 'eval_runtime': 3.4891, 'eval_samples_per_second': 25.221, 'eval_steps_per_second': 3.153, 'epoch': 5.0}


In [None]:
trained_model = AutoModelForTokenClassification.from_pretrained(f"{model_name}-finetuned-secondary-structure/checkpoint-2991") #make sure you are loading the right checkpoint
trained_model = trained_model.to(device)

In [None]:
showcase_seq = test_seqs[20]
showcase_ids = tokenizer.encode(showcase_seq) # N -> 5, A -> 6, C -> 7, G -> 8, T(U) -> 9. NOTE: a [CLS] and a [SEP] token will be added to the start and the end of seq
print(showcase_ids)
showcase_ids = torch.as_tensor(showcase_ids)
test_id = showcase_ids.unsqueeze(0)
test_id = test_id.to(device)

test_logit = trained_model(test_id, output_hidden_states=False).logits
test_probs = torch.sigmoid(test_logit)
class1_probs = test_probs[..., 1]
max_indices = class1_probs.argmax(dim=-1)
predicted_classes = torch.zeros_like(class1_probs)
predicted_classes[torch.arange(test_logit.size(0)), max_indices] = 1

predicted_classes = predicted_classes.squeeze(0)

print(predicted_classes)

print(sum(predicted_classes))
print(predicted_classes.argmax(dim = -1))

print(torch.as_tensor(train_labels[20][predicted_classes.argmax(dim = -1)]))
