In [1]:
# Install datasets as it is not already installed on colab
!pip install datasets



### Set up Colab and load scripts from GitHub

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder can be accessed.
drive.mount('/content/drive')

# Change working directory to the project folder; adjust this path as needed.
%cd "/content/drive/MyDrive/Machine_Learning_(CS-433)/Project_2"

# Run the following line once to clone the repository if you haven't already:
#!git clone https://github.com/AliSaadatV/BP_LM.git

# Import the preprocessing helpers from the cloned repository.
from BP_LM.data_preprocessing import preprocess_data, split_train_test_on_chr,  extract_intron_seq_and_labels, truncate_strands

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Machine_Learning_(CS-433)/Project_2


### Load the dataset and prepare the labels and sequences

In [3]:
import pandas as pd

# Assumes the working directory contains two folders, `data` and `models`,
# holding the dataset and the pre-trained models respectively.
file_path = 'data/dataset.txt'

# Seed for the random subsampling below, so that re-running the notebook
# selects the same rows.
RANDOM_SEED = 42
# Number of rows kept for quick experiments.
SUBSET_SIZE = 1000

# Load dataset (tab-separated file)
df = pd.read_csv(file_path, sep='\t')
# NOTE(review): the return value is discarded, so preprocess_data presumably
# modifies `df` in place — confirm against BP_LM.data_preprocessing.
preprocess_data(df)

# Drop sequences that don't natively fit in SpliceBERT (IVS_SIZE above 1024).
df = df.drop(df[df["IVS_SIZE"] > 1024].index)
# This should be improved at a later time

# Pick just a random subset to make the processing to come run fast.
# If you want to train for real then remove this line.
# `n` is clamped to the number of available rows so the cell does not fail
# when fewer than SUBSET_SIZE rows survive the length filter.
df = df.sample(n=min(SUBSET_SIZE, len(df)), random_state=RANDOM_SEED)

# Create a split based on chromosomes (Ali's idea)
train_chrs = ["chr1", "chr2", "chr3", "chr4",
              "chr5", "chr6", "chr7", "chr10",
              "chr11", "chr12", "chr13", "chr14",
              "chr15", "chr16", "chr17", "chr18",
              "chr19", "chr20", "chr21", "chr22",
              "chrX", "chrY"]

test_chrs = ["chr8"]
val_chrs = ["chr9"]

# NOTE(review): the arguments are passed in the order (train, val, test) while
# the result is unpacked as (train, test, val) — confirm the return order of
# split_train_test_on_chr so that test_df/val_df are not swapped.
# NOTE(review): shuffle=True may add randomness that RANDOM_SEED does not
# control — check whether the helper accepts a seed.
train_df, test_df, val_df = split_train_test_on_chr(df, train_chrs, val_chrs, test_chrs, shuffle=True)



  df['STRAND'] = df['STRAND'].replace({'+': 1, '-': 0})


Chromosomes in train set: {'chr22', 'chr2', 'chr17', 'chr5', 'chr10', 'chr21', 'chr6', 'chr4', 'chr15', 'chr7', 'chr3', 'chr13', 'chr16', 'chr11', 'chr19', 'chr1', 'chrX', 'chr12', 'chrY', 'chr14', 'chr20', 'chr18'}
Chromosomes in validation set: {'chr9'}
Chromosomes in test set: {'chr8'}

Total data points: 1000
Train set contains 940 data points (94.00%)
Validation set contains 37 data points (3.70%)
Test set contains 23 data points (2.30%)


In [4]:
# Extract (sequences, labels) for the train, test and validation frames, in that order.
# truncate=False because the truncation function is not trusted yet.
(train_seqs, train_labels), (test_seqs, test_labels), (val_seqs, val_labels) = (
    extract_intron_seq_and_labels(split_df, max_model_input_size=1024, truncate=False)
    for split_df in (train_df, test_df, val_df)
)

In [5]:
# Display the column names of the working DataFrame.
df.columns.values

array(['CHR', 'START', 'END', 'STRAND', 'GENE', 'TRANSCRIPT', 'IVS',
       'IVS_SIZE', 'BP_POS', 'BP_ACC_DIST', 'BP_ACC_SEQ', 'IVS_SEQ',
       'BP_POS_WITHIN_STRAND'], dtype=object)

### Load the splicebert model and tokenizer

In [6]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoModelForTokenClassification

# Path to the folder of pre-trained SpliceBERT (relative to the working directory)
SPLICEBERT_PATH = "models/SpliceBERT.1024nt"

# Tokenizer shipped with the pre-trained checkpoint
tokenizer = AutoTokenizer.from_pretrained(SPLICEBERT_PATH)

# SpliceBERT with a token classification head for fine-tuning.
# We want binary classification on tokens, hence num_labels=2.
model = AutoModelForTokenClassification.from_pretrained(
    SPLICEBERT_PATH,
    num_labels=2,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at models/SpliceBERT.1024nt and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Show a case of inference

In [7]:
# Build a small demo input to show how to do inference.
# WARNING: this is just a demo. SpliceBERT may not work on sequences shorter than 64nt,
# as it was trained on sequences of 64-1024nt in length.
showcase_seq = "ACGUACGuacguaCGuGCAUGUCAUGC"

# Upper-case, map U -> T and put a whitespace between nucleotides.
showcase_seq = " ".join(showcase_seq.upper().replace("U", "T"))

# Token ids: N -> 5, A -> 6, C -> 7, G -> 8, T(U) -> 9.
# NOTE: a [CLS] and a [SEP] token are added to the start and the end of the sequence.
showcase_ids = tokenizer.encode(showcase_seq)
print(showcase_ids)  # print the tokenized version

# Python list -> tensor with a leading batch dimension, shape: (batch_size, sequence_length)
showcase_ids = torch.as_tensor(showcase_ids).unsqueeze(0)

[2, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 8, 7, 6, 9, 8, 9, 7, 6, 9, 8, 7, 3]


In [8]:
# Infer with the (untrained) model.
# Gradients are not needed for this demo forward pass, so autograd is disabled.
with torch.no_grad():
    showcase_logit = model(showcase_ids, output_hidden_states=False).logits

In [9]:
#Define a function for making the output easily interpretable
def logittobinary(logits):
  """
  Convert per-token two-class logits into binary strings.

  For every sequence in the batch, each token becomes "1" when its class-1
  logit is strictly greater than its class-0 logit and "0" otherwise
  (so ties give "0").

  Args:
    logits: array-like indexed as [sequence, token, class] with at least two
      entries along the last axis (e.g. the `.logits` of the token
      classification model). Only classes 0 and 1 are compared.

  Returns:
    A list with one string per sequence; each string has one character
    per token.
  """
  # One vectorised comparison instead of a Python-level loop that indexes
  # every (sequence, token) entry individually.
  is_positive = (logits[..., 1] > logits[..., 0]).tolist()
  return ["".join("1" if flag else "0" for flag in row) for row in is_positive]

In [10]:
# Print the 0/1 string derived from the showcase logits.
print(logittobinary(showcase_logit))

['11011101110111011101111111111']


The model is not yet fine-tuned, so this output is just gibberish at the moment. Notice that for each token the likelihood of a branchpoint is about the same as that of not a branchpoint (it depends a bit on the seed), which should definitely not be the case once the model has been trained.

### Tokenize the training data and put it into the dataset format

In [11]:
# Format the sequences for the tokenizer: upper-case, U -> T, one space between nucleotides.
# There shouldn't be any "U"s in the training data, but the replacement is kept.
def _space_separate(seq):
    """Return `seq` upper-cased, with U replaced by T and characters separated by single spaces.

    Existing whitespace is removed first, so applying this to an already
    formatted sequence (e.g. when the cell is re-run) leaves it unchanged
    instead of inserting additional spaces.
    """
    return ' '.join(''.join(seq.split()).upper().replace("U", "T"))

train_seqs = [_space_separate(seq) for seq in train_seqs]
test_seqs = [_space_separate(seq) for seq in test_seqs]
val_seqs = [_space_separate(seq) for seq in val_seqs]

In [12]:
# Tokenize each split for use in the dataset class; every sequence is
# left-padded up to max_length=1024.
# NOTE(review): no `truncation` argument is passed — confirm that no sequence
# (including any special tokens) ends up longer than max_length.
train_ids, test_ids, val_ids = (
    tokenizer(split_seqs, padding='max_length', padding_side='left', max_length=1024)
    for split_seqs in (train_seqs, test_seqs, val_seqs)
)

In [14]:
# Wrap the tokenizer outputs in `Dataset` objects and attach the labels as a
# "labels" column; these are the structures passed on for training.
from datasets import Dataset

train_dataset = Dataset.from_dict(train_ids).add_column("labels", train_labels)
test_dataset = Dataset.from_dict(test_ids).add_column("labels", test_labels)
val_dataset = Dataset.from_dict(val_ids).add_column("labels", val_labels)

In [15]:
# Set up the data collator that is handed to the Trainer together with the tokenizer.
# Presumably it pads inputs and labels within each batch — TODO confirm.
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)

In [17]:
# Define model training parameters
# model_name is the last path component of SPLICEBERT_PATH.
model_name = SPLICEBERT_PATH.split("/")[-1]
batch_size = 4

args = TrainingArguments(
    # Output directory name. The cell that reloads the trained checkpoint
    # rebuilds this exact string, so keep the two in sync.
    f"{model_name}-finetuned-secondary-structure",
    eval_strategy = "no",
    save_strategy = "epoch",
    learning_rate=3e-4,
    # The same batch size is used for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.001,
    #load_best_model_at_end=True,
    # NOTE(review): eval_strategy is "no" and load_best_model_at_end is commented
    # out, so this setting presumably has no effect at the moment — confirm.
    metric_for_best_model="accuracy",
    #push_to_hub=True,
)

In [None]:
# Fine-tune the model on the training split using the arguments defined above.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    # `processing_class` replaces the deprecated `tokenizer=` keyword, which
    # triggers a warning when the Trainer is constructed.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33moliversmedt[0m ([33moliversmedt-epfl[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


In [46]:
# Reload the fine-tuned model from a checkpoint inside the training output directory.
# Make sure you are loading the right checkpoint: the step number in the folder
# name is hard-coded and presumably has to match a checkpoint that training
# actually wrote — TODO confirm before running.
trained_model = AutoModelForTokenClassification.from_pretrained(f"{model_name}-finetuned-secondary-structure/checkpoint-25")

In [51]:
# Run the fine-tuned model on the showcase input built earlier in the notebook.
# (`input_ids` is not defined by any cell of this notebook; `showcase_ids` is
# the tokenized demo sequence.) Gradients are not needed for inference.
with torch.no_grad():
    logit = trained_model(showcase_ids, output_hidden_states=False).logits

In [52]:
# Print the 0/1 string derived from the fine-tuned model's logits.
print(logittobinary(logit))

['00000000000000000000000000000']


The output of this short test will likely be all zeros after training, as the model will at least have fitted to the mean.

### Unused code I keep around for reference

In [11]:
#Create training data
#input_sequences = df["IVS_SEQ"].to_list()
#input_labels = []

#Sequences is a list of strings, input labels is a list of lists of token labels
#for index, sample in df.iterrows():
#    label_sequence = (sample["BP_POS"]-sample["START"])*[0] + [1] + (sample["END"]-sample["BP_POS"])*[0]
#    input_labels.append(label_sequence)

In [None]:
# use Hugging Face's official API to use SpliceBERT
# get nucleotide embeddings (hidden states)
#model = AutoModel.from_pretrained(SPLICEBERT_PATH) # load model
#last_hidden_state = model(input_ids).last_hidden_state # get hidden states from last layer
#hiddens_states = model(input_ids, output_hidden_states=True).hidden_states # hidden states from the embedding layer (nn.Embedding) and the 6 transformer encoder layers

# get nucleotide type logits in masked language modeling
#model = AutoModelForMaskedLM.from_pretrained(SPLICEBERT_PATH) # load model
#logits = model(input_ids).logits # shape: (batch_size, sequence_length, vocab_size)