In [None]:
!pip3 install datasets
!pip install --upgrade transformers datasets

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer
import torch
from datasets import Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd "/content/drive/MyDrive/epfl_ml_project/BP_LM"

/content/drive/MyDrive/epfl_ml_project/BP_LM


In [5]:
# Import helper methods
from data_preprocessing import *

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

### Loading and prepping our dataset

In [7]:
# Load our dataset
file_path = '../dataset/Data_GRCh38_canonical_transcript_intron_bp_database_Peng_20241113.txt'
df = pd.read_csv(file_path, delimiter = "\t")

preprocess_data(df)
print(df.shape)
print(f'Example row: \n {df.iloc[0]}\n')

(177980, 13)
Example row: 
 CHR                                                                  chr1
START                                                              924949
END                                                                925921
STRAND                                                                  1
GENE                                                               SAMD11
TRANSCRIPT                                                ENST00000616016
IVS                                                                  IVS1
IVS_SIZE                                                              973
BP_POS                                                             925894
BP_ACC_DIST                                                           -28
BP_ACC_SEQ                                   ACAGGGTCTGCCTCGGCTCTGCTCGCAG
IVS_SEQ                 GTGCCGCCGCCCCTCCCTTCGCTGCCGGGACCCGCGGGCCCCGACC...
BP_POS_WITHIN_STRAND                                                  945
Name: 0, d

  df['STRAND'] = df['STRAND'].replace({'+': 1, '-': 0})


In [8]:
train_chrs = ['chr15']
val_chrs = ['chr13']
test_chrs = ['chr18']

train_df, val_df, test_df = split_train_test_on_chr(df, train_chrs, val_chrs, test_chrs)

Chromosomes in train set: {'chr15'}
Chromosomes in validation set: {'chr13'}
Chromosomes in test set: {'chr18'}

Total data points: 12389
Train set contains 6485 data points (52.34%)
Validation set contains 3158 data points (25.49%)
Test set contains 2746 data points (22.16%)


In [9]:
# Extract intron sequences and labels from each set
# and truncate introns longer than max model input length
max_model_length = 1000

introns_tr, labels_tr = extract_intron_seq_and_labels(train_df, max_model_length)
introns_val, labels_val = extract_intron_seq_and_labels(val_df, max_model_length)
introns_test, labels_test = extract_intron_seq_and_labels(test_df, max_model_length)

# Padding still has to be done, but that can be done automatically
# NOTE: Makes sure to pad ON THE LEFT

In [16]:
print([len(intron) for intron in introns_tr])
print([len(label) for label in labels_tr])
print(introns_tr[3])
print(labels_tr[3])

[415, 1000, 1000, 517, 1000, 1000, 788, 1000, 1000, 1000, 1000, 88, 1000, 335, 1000, 1000, 991, 532, 1000, 1000, 1000, 1000, 1000, 1000, 150, 1000, 1000, 1000, 1000, 1000, 94, 217, 780, 1000, 1000, 1000, 632, 1000, 1000, 283, 1000, 86, 1000, 111, 1000, 1000, 1000, 90, 1000, 1000, 1000, 786, 1000, 585, 1000, 1000, 1000, 1000, 1000, 129, 286, 183, 1000, 112, 1000, 1000, 82, 234, 1000, 1000, 854, 634, 472, 1000, 276, 1000, 1000, 631, 97, 1000, 794, 390, 1000, 1000, 506, 1000, 272, 409, 1000, 1000, 1000, 745, 262, 716, 1000, 1000, 839, 1000, 1000, 574, 753, 1000, 836, 88, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 388, 944, 701, 1000, 1000, 1000, 191, 1000, 1000, 1000, 1000, 1000, 1000, 144, 1000, 1000, 1000, 1000, 106, 1000, 364, 482, 1000, 1000, 941, 1000, 621, 1000, 1000, 1000, 514, 1000, 659, 1000, 84, 1000, 1000, 1000, 1000, 106, 850, 1000, 95, 1000, 106, 885, 222, 1000, 1000, 80, 1000, 599, 1000, 956, 1000, 1000, 1000, 1000, 995, 1000, 1000, 1000, 675, 1000, 1000, 651, 1000, 519

In [None]:
# Some very basic finetuning using the HyenaDNA small model and just the IVS_SEQ and BP as the data

# Instantiate pretrained model
checkpoint = "LongSafari/hyenadna-medium-160k-seqlen-hf"
max_length = 32_000

# bfloat16 for better speed and reduced memory usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=2, torch_dtype=torch.bfloat16, device_map="auto")



sequences = ["AUGCUA", "GCUAAUUGC", "AUCG"]
labels = [
    [0, 0, 1, 0, 0, 0],  # BP at index 2
    [0, 0, 0, 0, 1, 0, 0, 0, 0],  # BP at index 5
    [0, 1, 0, 0]  # BP at index 1
]

# Tokenize the sequences
tokenized_inputs = tokenizer(sequences, padding=True, truncation=True, max_length=max_length, return_tensors='pt', is_split_into_words=True)


# Create a dataset for training
dataset_dict = {
    "input_ids": tokenized_inputs["input_ids"],
    "labels": labels,
}
ds = Dataset.from_dict(dataset_dict)
ds.set_format("torch")

# Initialize Trainer
args = {
    "output_dir": "tmp",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "gradient_checkpointing": True,
    "learning_rate": 2e-5,
}
training_args = TrainingArguments(**args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)

# Train the model
result = trainer.train()

print(result)