In [1]:
# Install datasets as it is not already installed on colab
!pip install datasets evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5

### Setup colab and load scripts from github

In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Change working directory to Project folder, you may change this as needed
%cd "/content/drive/MyDrive/Machine_Learning_(CS-433)/Project_2"

#!git clone https://github.com/AliSaadatV/BP_LM.git #clone repository if you haven't already

#Import useful functions from repository
from BP_LM.data_preprocessing import split_train_test_on_chr,  extract_intron_seq_and_labels, truncate_strands

Mounted at /content/drive
/content/drive/MyDrive/Machine_Learning_(CS-433)/Project_2


### Load the dataset and prepare the labels and sequences

In [21]:
import pandas as pd
# I assume this directory contains two folders: data and models, containing the data and models respectively.
file_path = 'data/dataset.txt'

# Load dataset
df = pd.read_csv(file_path, sep='\t')
df['BP_POS_WITHIN_STRAND'] = df['IVS_SIZE'] + df['BP_ACC_DIST']

# Drop sequences that don't natively fit in splicebert
#df = df.drop(df[df["IVS_SIZE"] > 100].index)
#df = df.reset_index(drop = True)
# This should be improved at a later time

#Pick just a random subset to make the processing to come run fast.
#If you want to train for real then remove this line.
df = df.sample(n = 5000)

#Create a split based on chromosome types (Alis idea)
train_chrs = ["chr1", "chr2", "chr3", "chr4",
              "chr5", "chr6", "chr7",
              "chr12", "chr13", "chr14",
              "chr15", "chr16", "chr17", "chr18",
              "chr19", "chr20", "chr21", "chr22",
              "chrX", "chrY"]

test_chrs = ["chr8", "chr11"]
val_chrs = ["chr9", "chr10"]

train_df, test_df, val_df = split_train_test_on_chr(df, train_chrs, val_chrs, test_chrs, shuffle=True)

#Grab some random entries to use
#train_df = df.iloc[0:2]
#test_df = df.iloc[2:4]
#val_df = df.iloc[4:6]

Chromosomes in train set: {'chr17', 'chr22', 'chr14', 'chr1', 'chr15', 'chr3', 'chr5', 'chr2', 'chr12', 'chrX', 'chrY', 'chr19', 'chr21', 'chr20', 'chr18', 'chr6', 'chr4', 'chr7', 'chr13', 'chr16'}
Chromosomes in validation set: {'chr10', 'chr9'}
Chromosomes in test set: {'chr8', 'chr11'}

Total data points: 5000
Train set contains 4175 data points (83.50%)
Validation set contains 377 data points (7.54%)
Test set contains 448 data points (8.96%)


In [22]:
df

Unnamed: 0,CHR,START,END,STRAND,GENE,TRANSCRIPT,IVS,IVS_SIZE,BP_POS,BP_ACC_DIST,BP_ACC_SEQ,IVS_SEQ,BP_POS_WITHIN_STRAND
72591,chr7,128849205,128849330,+,FLNC,ENST00000325888,IVS29,126,128849309,-22,AGCAGGATCTCCCGCATGGCAG,GTGAGTGCCCTTTCTCTCCTCTTCTTGGTGTGGGCCAGGGTGGTTG...,104
174191,chrX,70284044,70284265,+,RAB41,ENST00000374473,IVS6,222,70284211,-55,ACCTTTTTTTTTTCCCCTTTTTTTTTTTTTGGTCCCCATTCACACA...,GTAATACTTGTTTCTTTCTATGATACTTTAATTGTGCTCTGTCTGT...,167
73835,chr7,148760497,148766560,+,CUL1,ENST00000325222,IVS7,6064,148766540,-21,ACTTGAATTAATTTTCTCCAG,GTAAGCTTAAATATAGTACTTTAAGTAGACTTAAGTTAAAGTCATT...,6043
126196,chr15,48485497,48487074,-,FBN1,ENST00000316623,IVS29,1578,48485518,-22,ATATTATTTTCATTTCTTTTAG,GTAAGTTCTTTTTTATTTTATTTTATTTTATTTTATTTTACTTTAT...,1556
28053,chr2,191148205,191150946,-,STAT4,ENST00000392320,IVS1,2742,191148225,-21,ATTCTACACCTTCTTTTATAG,GTAAGTGGCCAGACCACACCTTCTGGCTGCCTCTTGTAAGGAGGAC...,2721
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173377,chrX,49227128,49228035,-,CACNA1F,ENST00000323022,IVS8,908,49227150,-23,AGCCCTCTATCCTCCTCCCTCAG,GTGAGAGACCTAGACACTCCCTGCTTCCCACCCCTCAGCCACTGCC...,885
176349,chrX,132069664,132071068,+,STK26,ENST00000394334,IVS7,1405,132071053,-16,ATCTTTTTATCCTTAG,GTGAGTATATATTGCTATTATTACTATTTGTTTTCTATTATTAGAT...,1389
76271,chr8,38106445,38107020,+,ASH2L,ENST00000343823,IVS2,576,38106998,-23,AGTCTCGAACTGCTCTGACACAG,GTAAGTATTTTTAGTTGTTTGCAAGACAAAATAGGGTTTGTTTTAG...,553
62327,chr6,75124372,75125126,-,COL12A1,ENST00000322507,IVS40,755,75124397,-26,AATCTCTTTCTTTCTTTATTTTGTAG,GTGAGTCATGGATATTTTCATGGTTGTGAGAAAAACTGTAGTAGAA...,729


In [23]:
train_df

Unnamed: 0,CHR,START,END,STRAND,GENE,TRANSCRIPT,IVS,IVS_SIZE,BP_POS,BP_ACC_DIST,BP_ACC_SEQ,IVS_SEQ,BP_POS_WITHIN_STRAND
0,chr21,29067706,29069422,-,CCT8,ENST00000286788,IVS3,1717,29067729,-24,ACTTGATATTACCTTTTCTACTAG,GTAAGTTGTTTCTTTAAAAAGTCAAGAAATATTTTGGTGACATCAA...,1693
1,chr17,63713528,63714005,-,STRADA,ENST00000336174,IVS5,478,63713545,-18,ATTCCCTTTTTCTTTTAG,GTATGTGCAGGGGCCGTCCTTACTCTCCTGCTTTCTGCTCCAGGGG...,460
2,chr1,181711070,181715337,+,CACNA1E,ENST00000367573,IVS8,4268,181715314,-24,ACCAAACCATTTGTTTCCATATAG,GTAGGCCTGGGGGGCTGCAGGAGGCTGGTGAGTGGGCTGCAGAGAC...,4244
3,chrX,48987878,48988120,-,GRIPAP1,ENST00000376423,IVS12,243,48987900,-23,ACCTGCTGTGGACGTGGGGCCAG,GTATATTGTACTGCGTGGGCTCAGGGAGCTGGGGTTCAGGATGGGG...,220
4,chr17,36486967,36492812,+,ZNHIT3,ENST00000617429,IVS2,5846,36492783,-30,ATGTGGGCCTGGGATTTGTGTCTTTTTCAG,GTGAGCCCCGTCCCCGCCAGCCCTCGTACCACTGCGCACGGGGCAG...,5816
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170,chr16,58676055,58676288,-,SLC38A7,ENST00000219320,IVS7,234,58676076,-22,ACGGTGCCCTTCACCTGGACAG,GTGCCAGTGCCAGTTGCAAGGTCCCTGCTGTCCCCTTAGATCACCC...,212
4171,chr14,80914409,80916400,-,CEP128,ENST00000555265,IVS3,1992,80914428,-20,AGTCCATTTTCTTTCTCAAG,GTTAGTTTAACAATGGTTGCATTTAATAGAATTTAAATAGTTTGAG...,1972
4172,chr12,123879802,123881624,+,DNAH10,ENST00000673944,IVS50,1823,123881599,-26,AATTCTTTTTTTTTTTTTAAATGCAG,GTGGGGATGAGCCCCACCCTGTCCATGGGCTCACTTTCTCCTGAGC...,1797
4173,chr20,15499848,15862744,+,MACROD2,ENST00000684519,IVS8,362897,15862722,-23,AGTGTTTTATCTTTTGCCTCTAG,GTAGGAGGAACGACATAATCAGTGAACATCCAAGATGATGTAATTT...,362874


In [24]:
train_seqs, train_labels = extract_intron_seq_and_labels(train_df, max_model_input_size=1024, truncate=True)
test_seqs, test_labels = extract_intron_seq_and_labels(test_df, max_model_input_size=1024, truncate=True)
val_seqs, val_labels = extract_intron_seq_and_labels(val_df, max_model_input_size=1024, truncate=True)

In [25]:
print(train_seqs[0])
print(train_labels[0])

ACTTTGGGAGGTCAAGGCAGGCGGATCACCTAAGGTCAGGAGTTTGAGACTAGCCTGGCCAACATGGTGAAACCCCATCTCTACTAAAAATACAGAAACAAAATCAGCCGGGCGTGGTGGCACACACCTGTAGTTCCAGCTACTCAGTAGGCTAGGGCAGGAGAATTGCTTGAACCTGGGAGGCGGAGGTTGCAATGAGCCAAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCAAGACTCCATCTCAAAAAAGAAAAAAAAATACATATTATATATATAAATTCTTAAAGAACTTAAGTTAAAAGTTAAATTTTAACTCCACAAAACACTTGTACAAGGGTTTGTCACTGTGAATGTTAATGTTGCACACTGATGATCTCAGGAGTTATTAAAGTACAGTGGTGATACCGTCAAAATAATAAAATCACAGGGCTGACTTTTTATAGTTTTCTTGTTGCTCTCAGTTCTTACTCCTTGGCTTGTTTCTGTAAAGGTATCTAAAATTAAATCCTAAGAATAAAATCCACATAAAATTATGTTATATTTGAAGAAAAGTTTTATACTCCCCCTTTTCTTTAAATCACAAATGATAACCCTTACAACAAGAATATATAGTAAAATAGATATTGAGAGTCATGACCAAGTTAAGTGGTATCTATTTTAGTTCTCCTGAAGGCTTTTAAAAAATATCTAGGTTTGGTGGTTGTAATTACTTTGTTTTTTAATTTAGTGAAATATGATATGGTGAAACATATGTGTGGCACTGTATGTGCAGGCCACCAAAAGAGGATGCACTTTGAAATTCTTATTTATAGTGATACATTTGTGCTGATCGGCATAAAGGGACTTGGATTTTTTCTTGTGACCACACTATAAGTAGCGTATAGCAGTGTAGCAGTGTGATCAGATGGAGTTACCTAATCTACAAGAAAATTGAGTCTATTTAAGAAATGGAAATGAACAATTTTATGTTGCTTTAAGGATCAGATGTTTA

### Load the splicebert model and tokenizer

In [26]:
SPLICEBERT_PATH = "models/SpliceBERT.1024nt"  # set the path to the folder of pre-trained SpliceBERT
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoModelForTokenClassification

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(SPLICEBERT_PATH)
#tokenizer = AutoTokenizer.from_pretrained("SpliceBERT.1024nt-finetuned-secondary-structure/checkpoint-2084") #should you wish to load a checkpoint instead

# finetuning SpliceBERT for token classification tasks
model = AutoModelForTokenClassification.from_pretrained(SPLICEBERT_PATH, num_labels = 2) # We want binary classification on tokens so num_labels = 2
#model = AutoModelForTokenClassification.from_pretrained("SpliceBERT.1024nt-finetuned-secondary-structure/checkpoint-2084", num_labels = 2) #should you wish to load a checkpoint instead


### Show a case of inference

In [27]:
# prepare a test input to show how to do inference
showcase_seq = "ACGUACGuacguaCGuGCAUGUCAUGC"  ## WARNING: this is just a demo. SpliceBERT may not work on sequences shorter than 64nt as it was trained on sequences of 64-1024nt in length
showcase_seq = ' '.join(list(showcase_seq.upper().replace("U", "T"))) # U -> T and add whitespace, the white space is necessary for the tokenizer for some reason.
showcase_ids = tokenizer.encode(showcase_seq) # N -> 5, A -> 6, C -> 7, G -> 8, T(U) -> 9. NOTE: a [CLS] and a [SEP] token will be added to the start and the end of seq
print(showcase_ids) #Print the tokenized version
showcase_ids = torch.as_tensor(showcase_ids) # convert python list to Tensor
showcase_ids = showcase_ids.unsqueeze(0) # add batch dimension, shape: (batch_size, sequence_length)

[2, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 8, 7, 6, 9, 8, 9, 7, 6, 9, 8, 7, 3]


In [28]:
#infer with the (untrained) model
showcase_logit = model(showcase_ids, output_hidden_states=False).logits

In [29]:
#Define a function for making the output easily interpretable
def logittobinary(logits):
  """
  This is just a ghetto function for converting binary logits into binary strings
  There might be an existing method for this in transformers, but I coulnd't find any
  so i wrote this real quick
  """
  output_list = []
  for j in range(len(logits)):
    binary = ""
    for i in range(len(logits[j,:,1])):
      if logits[j,i,0] < logits[j,i,1]:
        binary += "1"
      else:
        binary += "0"
    output_list.append(binary)
  return output_list

In [30]:
print(logittobinary(showcase_logit))

['00000000000000000000000000000']


The model is not yet finetuned so this output is just gibberish at the moment. Notice that for each token the likelyhood of a branchpoint is about the same as not a branchpoint (depends a bit on seed), which should definately not be the case once the model has been trained.

### Tokenize the training data and put it into the dataset format

In [31]:
#Tokenize the input data
train_seqs = [' '.join(list(seq.upper().replace("U", "T"))) for seq in train_seqs] #There shouldn't be any "U"s in the training data, but I have kept the replacer line
test_seqs = [' '.join(list(seq.upper().replace("U", "T"))) for seq in test_seqs]
val_seqs = [' '.join(list(seq.upper().replace("U", "T"))) for seq in val_seqs]

In [32]:
#package for use in dataset class
train_ids = tokenizer(train_seqs)#, padding = 'max_length', padding_side = 'left', max_length = 1024)
test_ids = tokenizer(test_seqs)#, padding = 'max_length', padding_side = 'left', max_length = 1024)
val_ids = tokenizer(val_seqs)#, padding = 'max_length', padding_side = 'left', max_length = 1024)

In [33]:
# Build the dataset structure we will pass for training
from datasets import Dataset

train_dataset = Dataset.from_dict(train_ids)
train_dataset = train_dataset.add_column("labels", train_labels)

test_dataset = Dataset.from_dict(test_ids)
test_dataset = test_dataset.add_column("labels", test_labels)

val_dataset = Dataset.from_dict(val_ids)
val_dataset = val_dataset.add_column("labels", val_labels)

In [34]:
#Set up the collator (I think it does padding)
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
data_collator = DataCollatorForTokenClassification(tokenizer) #Unsure about how data_collator pads exactly so i specified padding in the tokenizer itself


In [35]:
#Need a metric for the evaluation strategy
from evaluate import load
from sklearn.metrics import average_precision_score
from scipy.special import softmax
import numpy as np

metric1 = load("f1")
metric2 = load("accuracy")
metric3 = average_precision_score #syntax for this is a bit different as it comes from a different library

def compute_metrics(eval_pred):
    """
    Function to simultaneously evaluate accuracy, F1 and average precision (AP)

    The function does the evaluation per token and not per label.

    average precision is the most interesting as it accounts for the fact that the
    ideal decision boundary may be something non trivial.
    """
    raw_predictions, labels = eval_pred

    logits = raw_predictions[0] #discard hidden states and keep logits

    #Find predictions from logits
    predictions = softmax(logits, axis=2)[:,:,1] #probability of positive label
    categorical_predictions = np.argmax(logits, axis=2) #=1 if prob. pos. > 0.5

    #Reshape predictions and labels into long strings to compute metrics per token
    categorical_predictions = categorical_predictions.reshape((-1,))
    predictions = predictions.reshape((-1,))
    labels = labels.reshape((-1,))

    #Remove all the padded ones
    categorical_predictions = categorical_predictions[labels!=-100]
    predictions = predictions[labels!=-100]
    labels = labels[labels!=-100]

    #compute metrics
    f1 = metric1.compute(predictions=categorical_predictions, references=labels)
    accuracy = metric2.compute(predictions=categorical_predictions, references=labels)
    AP = metric3(labels, predictions)

    #Return joint dictionary
    return f1 | accuracy | {"AP" : AP}

In [39]:
#Define model training parameters
model_name = SPLICEBERT_PATH.split("/")[-1]
batch_size = 4

args = TrainingArguments(
    f"{model_name}-trash",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    optim = "adamw_torch",
    weight_decay=0.001,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    #push_to_hub=True,
)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


In [20]:
val_df

Unnamed: 0,CHR,START,END,STRAND,GENE,TRANSCRIPT,IVS,IVS_SIZE,BP_POS,BP_ACC_DIST,BP_ACC_SEQ,IVS_SEQ,BP_POS_WITHIN_STRAND
0,chr11,46898678,46898903,-,LRP4,ENST00000378623,IVS6,226,46898691,-14,TTGTGTTTCCCTAG,GTGAGTACTCTGGCCAGCTGGGAGGTGGGGAGGCCAGGCTGGGAAG...,212
1,chr11,18096855,18103232,-,SAAL1,ENST00000524803,IVS2,6378,18096871,-17,ATCATTTTTCTTTTCAG,GTATGAGGCTGGAAACAAAACTCTGAAGACTGTTGGGTGAGGAGAT...,6361
2,chr11,34086220,34086304,+,CAPRIN1,ENST00000341394,IVS10,85,34086281,-24,ATTCTATCCTAACTTAACCTGTAG,GTATGTTCATTTTAGTCAGACTCTGTAACAGAAAGTTTAAGTGTTT...,61
3,chr11,90210595,90211214,-,CHORDC1,ENST00000320585,IVS5,620,90210615,-21,ATTTTTTTTTCTTTGTTACAG,GTAAGATTTTATTTTGTTTTATTGTTAATTATTTTCTGGTACAGTT...,599
4,chr11,61325709,61326778,-,DDB1,ENST00000301764,IVS5,1070,61325729,-21,ATTTTGCTGTCTTTCCTGCAG,GTTGGTAGGGGGTTTAGGAAATGTGCCTCCACCTGGGTCTAGGCTA...,1049
...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,chr11,61014015,61015712,+,CD6,ENST00000313421,IVS8,1698,61015686,-27,ATGCCCTCGACTCTGTTCTCTCCCCAG,GTAGGATGTCCCCCATCCTGGGTGTGGGAGGGCTGGGGAGGACAAG...,1671
417,chr11,89969546,89970243,+,TRIM64,ENST00000533122,IVS3,698,89970218,-26,ATCCTTTTAACCAAACATCTCTGCAG,GTAAGAATGAAAATGTTTTCTTTGTTTTTATGCAAATAAACACAAT...,672
418,chr11,66975639,66976176,+,C11orf86,ENST00000683896,IVS1,538,66976157,-20,CCCCCACCTCTGTCTTCCAG,GTATGGCTGTGGGCTGAAGGATGGAGGGTACCACAGCAGGTGGGCA...,518
419,chr8,73673243,73688653,-,STAU2,ENST00000524300,IVS5,15411,73673261,-19,ATGTTTTTTTTTTAAACAG,GTATGGCAGTGACTCCTTCTGTTATGTTGTCTGCTATGTGTGTTCT...,15392


### Manually check if model ever even predict a point

In [None]:
trained_model = AutoModelForTokenClassification.from_pretrained(f"{model_name}-finetuned-secondary-structure/checkpoint-2328") #make sure you are loading the right checkpoint

In [None]:

fixed_train_seqs = [' '.join(list(train_seq.upper().replace("U", "T"))) for train_seq in train_seqs]
train_id = tokenizer.encode(fixed_train_seqs) # N -> 5, A -> 6, C -> 7, G -> 8, T(U) -> 9. NOTE: a [CLS] and a [SEP] token will be added to the start and the end of seq
print(train_id) #Print the tokenized version
train_id = torch.as_tensor(train_id) # convert python list to Tensor
train_id = train_id.unsqueeze(0) # add batch dimension, shape: (batch_size, sequence_length)

[2, 8, 6, 9, 6, 9, 9, 9, 9, 9, 7, 8, 9, 9, 7, 8, 9, 6, 7, 6, 7, 9, 9, 6, 9, 8, 8, 8, 9, 9, 7, 7, 7, 6, 9, 9, 8, 9, 7, 7, 7, 9, 9, 9, 7, 9, 7, 9, 6, 7, 9, 7, 6, 9, 9, 9, 9, 6, 9, 9, 9, 6, 8, 9, 7, 9, 7, 7, 6, 8, 6, 8, 6, 7, 7, 9, 9, 9, 7, 9, 7, 7, 7, 6, 9, 8, 8, 6, 8, 9, 9, 6, 6, 8, 9, 6, 8, 8, 9, 9, 9, 9, 9, 6, 7, 9, 8, 9, 8, 6, 6, 9, 8, 9, 9, 9, 6, 9, 6, 9, 7, 6, 9, 9, 7, 6, 7, 9, 9, 8, 9, 7, 9, 9, 7, 7, 7, 9, 6, 7, 7, 7, 9, 7, 9, 9, 6, 8, 6, 6, 6, 8, 8, 6, 6, 7, 7, 7, 9, 9, 6, 9, 9, 9, 9, 7, 7, 8, 9, 7, 8, 8, 8, 8, 9, 9, 9, 9, 6, 9, 9, 9, 6, 7, 7, 9, 6, 9, 7, 8, 6, 6, 7, 7, 8, 6, 6, 6, 8, 9, 6, 8, 6, 8, 9, 9, 9, 7, 7, 6, 8, 8, 8, 8, 6, 7, 9, 7, 6, 6, 6, 8, 8, 9, 7, 8, 6, 6, 9, 8, 6, 8, 9, 8, 3]


In [None]:
logit = trained_model((train_id), output_hidden_states=False).logits

In [None]:
print(logittobinary(logit))

['0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000']


This short test will likely be all zeros after training as the model will have at least fitted to the mean

### Unused code I keep around for reference

In [None]:
#Create training data
#input_sequences = df["IVS_SEQ"].to_list()
#input_labels = []

#Sequences is a list of strings, input labels is a list of lists of token labels
#for index, sample in df.iterrows():
#    label_sequence = (sample["BP_POS"]-sample["START"])*[0] + [1] + (sample["END"]-sample["BP_POS"])*[0]
#    input_labels.append(label_sequence)

In [None]:
# use huggerface's official API to use SpliceBERT
# get nucleotide embeddings (hidden states)
#model = AutoModel.from_pretrained(SPLICEBERT_PATH) # load model
#last_hidden_state = model(input_ids).last_hidden_state # get hidden states from last layer
#hiddens_states = model(input_ids, output_hidden_states=True).hidden_states # hidden states from the embedding layer (nn.Embedding) and the 6 transformer encoder layers

# get nucleotide type logits in masked language modeling
#model = AutoModelForMaskedLM.from_pretrained(SPLICEBERT_PATH) # load model
#logits = model(input_ids).logits # shape: (batch_size, sequence_length, vocab_size)