In [1]:
# Install datasets as it is not already installed on colab
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoModelForTokenClassification
from datasets import Dataset


In [3]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device

'cuda'

### Set up Colab and load scripts from GitHub

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# Change working directory to Project folder, you may change this as needed
%cd "/content/drive/MyDrive/epfl_ml_project"

#!git clone https://github.com/AliSaadatV/BP_LM.git clone repository if you haven't already

from BP_LM.data_preprocessing import *

Mounted at /content/drive
/content/drive/MyDrive/epfl_ml_project


### Load the dataset and prepare the labels and sequences

In [5]:
# The working directory is expected to contain two folders, `dataset` and `models`,
# holding the data and the pre-trained models respectively.
file_path = 'dataset/Data_GRCh38_canonical_transcript_intron_bp_database_Peng_20241113.txt'

# Fixed seed so the random subset below (and with it the split sizes and the
# number of training steps) is the same on every run
SEED = 42

# Load dataset
df = pd.read_csv(file_path, sep='\t')

# Pick just a random subset to make the processing to come run fast.
# Remove this line to train on the full dataset
df = df.sample(n=1000, random_state=SEED)

# NOTE(review): presumably BP_ACC_DIST is a (negative) distance from the acceptor
# site, making this sum the branch point's offset within the intron — TODO confirm
df['BP_POS_WITHIN_STRAND'] = df['IVS_SIZE'] + df['BP_ACC_DIST']

  df['STRAND'] = df['STRAND'].replace({'+': 1, '-': 0})


In [6]:
# Split by chromosome (Ali's idea): each chromosome is assigned to exactly one list
train_chrs = [
    "chr1", "chr2", "chr3", "chr4", "chr5",
    "chr10", "chr11", "chr12", "chr13", "chr14",
    "chr15", "chr16", "chr17", "chr18", "chr19",
    "chr22", "chrX", "chrY",
]

# Held-out chromosomes. Moving chr6 and chr7 into train_chrs gives a larger
# training share at the expense of the validation and test sets.
test_chrs = ["chr8", "chr20", "chr6"]
val_chrs = ["chr9", "chr21", "chr7"]

# NOTE(review): the chromosome lists are passed as (train, val, test) but the result
# is unpacked as (train, test, val) — confirm this matches the helper's return order.
train_df, test_df, val_df = split_train_test_on_chr(
    df, train_chrs, val_chrs, test_chrs, shuffle=True
)

Chromosomes in train set: {'chr1', 'chr19', 'chr2', 'chr11', 'chr5', 'chr10', 'chrX', 'chr18', 'chr13', 'chr3', 'chr15', 'chr16', 'chr14', 'chr4', 'chr17', 'chr12', 'chrY', 'chr22'}
Chromosomes in validation set: {'chr9', 'chr7', 'chr21'}
Chromosomes in test set: {'chr20', 'chr8', 'chr6'}

Total data points: 1000
Train set contains 788 data points (78.80%)
Validation set contains 104 data points (10.40%)
Test set contains 108 data points (10.80%)


In [7]:
# Run the same extraction (inputs capped at 1024, truncation on) over the
# train, test and validation frames, in that order
(train_seqs, train_labels), (test_seqs, test_labels), (val_seqs, val_labels) = (
    extract_intron_seq_and_labels(split_df, max_model_input_size=1024, truncate=True)
    for split_df in (train_df, test_df, val_df)
)

In [8]:
# Show which columns the (subsampled) frame carries
column_names = df.columns.values
column_names

array(['CHR', 'START', 'END', 'STRAND', 'GENE', 'TRANSCRIPT', 'IVS',
       'IVS_SIZE', 'BP_ACC_DIST', 'IVS_SEQ', 'BP_POS_WITHIN_STRAND'],
      dtype=object)

### Load the splicebert model and tokenizer

In [9]:
# Folder holding the pre-trained SpliceBERT weights; change this to your own location
SPLICEBERT_PATH = "models/SpliceBERT.1024nt"

# Tokenizer that ships with the checkpoint
tokenizer = AutoTokenizer.from_pretrained(SPLICEBERT_PATH)

# SpliceBERT with a token-classification head for fine-tuning.
# Two labels because every token gets a binary decision.
model = AutoModelForTokenClassification.from_pretrained(
    SPLICEBERT_PATH,
    num_labels=2,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at models/SpliceBERT.1024nt and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Show a case of inference

In [10]:
# Prepare a short demo input to show how inference works.
# WARNING: this is just a demo. SpliceBERT may not work on sequences shorter than
# 64nt as it was trained on sequences of 64-1024nt in length.
showcase_seq = "acacaguguaggaaggaACGuacguaC"
# U -> T and one whitespace between nucleotides
showcase_seq = ' '.join(showcase_seq.upper().replace("U", "T"))
# N -> 5, A -> 6, C -> 7, G -> 8, T(U) -> 9.
# NOTE: a [CLS] and a [SEP] token will be added to the start and the end of seq
showcase_ids = tokenizer.encode(showcase_seq)
print(showcase_ids)  # print the tokenized version
# Convert the python list to a Tensor and add the batch dimension,
# shape: (batch_size, sequence_length)
showcase_ids = torch.as_tensor(showcase_ids).unsqueeze(0)

[2, 6, 7, 6, 7, 6, 8, 9, 8, 9, 6, 8, 8, 6, 6, 8, 8, 6, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 3]


In [11]:
# Infer with the (untrained) model; a forward-only demo needs no gradients,
# so autograd bookkeeping is switched off
with torch.no_grad():
    showcase_logit = model(showcase_ids, output_hidden_states=False).logits

In [12]:
#Define a function for making the output easily interpretable
def logittobinary(logits):
  """
  Convert per-token binary logits into strings of "0"/"1" characters.

  Args:
    logits: array-like indexed as [batch, token, class] with two classes
      (e.g. the `.logits` of a token-classification model with num_labels = 2).

  Returns:
    A list with one string per batch element. Character i is "1" when the
    class-1 logit of token i is strictly greater than its class-0 logit,
    and "0" otherwise (ties give "0").
  """
  # Compare both classes for every token in one vectorised operation instead of
  # indexing the tensor element by element in nested Python loops.
  is_positive = (logits[..., 0] < logits[..., 1]).tolist()
  return ["".join("1" if flag else "0" for flag in row) for row in is_positive]

In [13]:
# Raw logits first, then the per-token 0/1 string derived from them
print(showcase_logit)
showcase_bits = logittobinary(showcase_logit)
print(showcase_bits)

tensor([[[ 0.1932, -0.0200],
         [-0.0401, -0.0893],
         [ 0.2547, -0.0215],
         [-0.1226, -0.0999],
         [ 0.1999, -0.0790],
         [-0.1321, -0.0556],
         [ 0.1965, -0.2273],
         [ 0.2206,  0.1493],
         [ 0.0061, -0.1003],
         [ 0.2092,  0.1118],
         [-0.2125, -0.0328],
         [ 0.0986, -0.1546],
         [ 0.0882, -0.1458],
         [-0.1228, -0.1165],
         [-0.2095, -0.0359],
         [ 0.0291, -0.1297],
         [ 0.0239, -0.1281],
         [-0.1263, -0.0150],
         [-0.1821, -0.0824],
         [ 0.1641,  0.0965],
         [-0.0289, -0.0119],
         [ 0.0853, -0.0641],
         [-0.2176, -0.0215],
         [ 0.1937,  0.1328],
         [-0.0294, -0.0585],
         [ 0.2223, -0.1484],
         [-0.2186, -0.0195],
         [ 0.1952,  0.0893],
         [ 0.0634, -0.0005]]], grad_fn=<ViewBackward0>)
['00010100001001100110101000100']


The model is not yet fine-tuned, so this output is just gibberish at the moment. Notice that for each token the likelihood of a branchpoint is about the same as that of no branchpoint (this depends a bit on the seed), which should definitely not be the case once the model has been trained.

### Tokenize the training data and put it into the dataset format

In [14]:
# Tokenize the input data
def _space_separate(seq):
    """Upper-case `seq`, map U to T and put one space between consecutive characters."""
    # Existing whitespace is dropped first, so applying this to an already
    # space-separated sequence (e.g. when the cell is re-run) changes nothing.
    return ' '.join(''.join(seq.split()).upper().replace("U", "T"))

# There shouldn't be any "U"s in the training data, but the replacement is kept as a safeguard
train_seqs = [_space_separate(seq) for seq in train_seqs]
test_seqs = [_space_separate(seq) for seq in test_seqs]
val_seqs = [_space_separate(seq) for seq in val_seqs]

In [15]:
# Peek at the first two space-separated training sequences
preview = train_seqs[:2]
print(preview)

['T T G T C A T T T A C C T G T A T T C C G T C A A A A T A A T T G T G T C A G G G T T T A C G A A A C A A C G A A T A T T A G C G G A A T T A A A A C G A A A T C C A A T A G T A T A T T G A G G T A C C A A G T C T T T C T G A C C G A C A C A C A G T T A C A G T T A G T G A C G G A T C C T G A T G A C T T C G T A C A C T C T T C G A C C G T T T A C T T T A T C G A T A T T G G G A C A G A T T C C C C T C C T T T C C T T C T C T A G G T A A A T G T G T A C A T C G T C T T G A G T C T A A T A T T T A A G A A C C T T C A G T C C T C G A C A C A G A T A T G A C C A A C G T G T T T C C T G A C A C C T T T A T G C G A A T A G T C A T T C G G T C C C A C A A C A A A G A T A A C A A T A C G T T T G G G G T G G A G A G G T C G T C A G A T A T T T A T C A T T A A C G A G A G G G A T T A T C C T C T G T G T A A T A C A A C C T T C T T G T T G T C T A T T A G A G A T A C G G T A C A C A T A G A T T C T T T T A A T T A C T T T T A A A T C T A A A C A A A A C T T T T A C A G T T G T T T T A T C T A 

In [16]:
# Tokenize every split with identical settings: left-pad each example to 1024 tokens.
# NOTE(review): the tokenizer adds [CLS]/[SEP] and pads on the left; the labels
# attached later presumably already follow the same layout — TODO confirm.
padding_options = {"padding": 'max_length', "padding_side": 'left', "max_length": 1024}

train_ids = tokenizer(train_seqs, **padding_options)
test_ids = tokenizer(test_seqs, **padding_options)
val_ids = tokenizer(val_seqs, **padding_options)

In [17]:
# Wrap each tokenized split in a Dataset and attach its labels under the "labels" column
train_dataset = Dataset.from_dict(train_ids).add_column("labels", train_labels)
test_dataset = Dataset.from_dict(test_ids).add_column("labels", test_labels)
val_dataset = Dataset.from_dict(val_ids).add_column("labels", val_labels)

In [18]:
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer

# Collator the Trainer uses to assemble token-classification batches
# (presumably it also pads inputs and labels when needed — TODO confirm)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [19]:
# Training configuration; checkpoints go to a folder named after the base model
model_name = SPLICEBERT_PATH.rsplit("/", 1)[-1]
batch_size = 4

# NOTE(review): with eval_strategy = "no" and load_best_model_at_end commented out,
# metric_for_best_model presumably has no effect — confirm before relying on it.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-secondary-structure",
    num_train_epochs=1,
    learning_rate=3e-4,
    weight_decay=0.001,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="no",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    #load_best_model_at_end=True,
    #push_to_hub=True,
)

In [20]:
import os

# Switch Weights & Biases logging off for this run
wandb_mode = "disabled"
os.environ["WANDB_MODE"] = wandb_mode

In [21]:
# Fine-tune on the training split only; no evaluation dataset is passed.
# NOTE(review): newer transformers releases appear to prefer `processing_class`
# over `tokenizer` here — check against the installed version before switching.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=197, training_loss=0.017273976718108666, metrics={'train_runtime': 66.1643, 'train_samples_per_second': 11.91, 'train_steps_per_second': 2.977, 'total_flos': 91758239336544.0, 'train_loss': 0.017273976718108666, 'epoch': 1.0})

In [25]:
# Load the checkpoint written by the training run above. The step count in the
# folder name is taken from the trainer instead of being hard-coded, because it
# changes with the size of the training set and the batch size.
checkpoint_path = f"{model_name}-finetuned-secondary-structure/checkpoint-{trainer.state.global_step}"
trained_model = AutoModelForTokenClassification.from_pretrained(checkpoint_path)
trained_model = trained_model.to(device)

In [26]:
showcase_ids = showcase_ids.to(device)
# `model` has just been used for training: switch it to eval mode so dropout is
# off and the prediction is deterministic, and skip gradient tracking
model.eval()
with torch.no_grad():
    showcase_logit = model(showcase_ids, output_hidden_states=False).logits
print(logittobinary(showcase_logit))

['00000000000000000000000000000']


In [27]:
# Inference only: skip autograd bookkeeping while running the reloaded checkpoint
with torch.no_grad():
    logit = trained_model(showcase_ids, output_hidden_states=False).logits

In [28]:
# Per-token 0/1 string predicted by the reloaded checkpoint
checkpoint_bits = logittobinary(logit)
print(checkpoint_bits)

['00000000000000000000000000000']


This short test will likely be all zeros after training, as the model will at least have fitted to the mean.

### Unused code I keep around for reference

In [None]:
#Create training data
#input_sequences = df["IVS_SEQ"].to_list()
#input_labels = []

#Sequences is a list of strings, input labels is a list of lists of token labels
#for index, sample in df.iterrows():
#    label_sequence = (sample["BP_POS"]-sample["START"])*[0] + [1] + (sample["END"]-sample["BP_POS"])*[0]
#    input_labels.append(label_sequence)

In [None]:
# use Hugging Face's official API to use SpliceBERT
# get nucleotide embeddings (hidden states)
#model = AutoModel.from_pretrained(SPLICEBERT_PATH) # load model
#last_hidden_state = model(input_ids).last_hidden_state # get hidden states from last layer
#hiddens_states = model(input_ids, output_hidden_states=True).hidden_states # hidden states from the embedding layer (nn.Embedding) and the 6 transformer encoder layers

# get nucleotide type logits in masked language modeling
#model = AutoModelForMaskedLM.from_pretrained(SPLICEBERT_PATH) # load model
#logits = model(input_ids).logits # shape: (batch_size, sequence_length, vocab_size)