# Enhancer Fine Tuning
conda activate nt_finetune_v2

In [96]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import matthews_corrcoef

In [12]:
num_labels_enhancers_types = 3
# Load the model
model = AutoModelForSequenceClassification.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref", num_labels=num_labels_enhancers_types)

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model.to('cuda')

EsmForSequenceClassification(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(4105, 1280, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1002, 1280, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-23): 24 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=1280, out_features=1280, bias=True)
              (key): Linear(in_features=1280, out_features=1280, bias=True)
              (value): Linear(in_features=1280, out_features=1280, bias=True)
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=1280, out_features=1280, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((1280,), eps=1e-12, elementwise_affine=True)
          )
          (intermediate): EsmIntermediate(
            

There are weights and bias that has been newly assigned. Since we have to perform training i.e. Dense classifier weights and classifier bias

## Change the model downstream classifier head
Changes in the classifier head can be used for making different prediction like promotor classificaiton, regression etc.

In [None]:
from transformers import EsmForSequenceClassification

model2 = EsmForSequenceClassification.from_pretrained(
    "InstaDeepAI/nucleotide-transformer-500m-human-ref",
    num_labels=1,
    problem_type="regression"
)


Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model2

EsmForSequenceClassification(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(4105, 1280, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1002, 1280, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-23): 24 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=1280, out_features=1280, bias=True)
              (key): Linear(in_features=1280, out_features=1280, bias=True)
              (value): Linear(in_features=1280, out_features=1280, bias=True)
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=1280, out_features=1280, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((1280,), eps=1e-12, elementwise_affine=True)
          )
          (intermediate): EsmIntermediate(
            

In [2]:
raw_model = AutoModelForSequenceClassification.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")


Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
raw_model

EsmForSequenceClassification(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(4105, 1280, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1002, 1280, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-23): 24 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=1280, out_features=1280, bias=True)
              (key): Linear(in_features=1280, out_features=1280, bias=True)
              (value): Linear(in_features=1280, out_features=1280, bias=True)
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=1280, out_features=1280, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((1280,), eps=1e-12, elementwise_affine=True)
          )
          (intermediate): EsmIntermediate(
            

In [5]:
import torch.nn as nn
raw_model.classifier = nn.Sequential(
    nn.Linear(1280, 512),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(512, 1)
)

raw_model.config.num_labels = 1
raw_model.config.problem_type = "regression"
print(raw_model.classifier)
print(raw_model.config.problem_type)
print(raw_model.config.num_labels)

Sequential(
  (0): Linear(in_features=1280, out_features=512, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.2, inplace=False)
  (3): Linear(in_features=512, out_features=1, bias=True)
)
regression
1


In [60]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
# AutoTokenizer.from_pretrained(...): This command downloads and loads the pre-trained tokenizer associated with the 
# specified model, "InstaDeepAI/nucleotide-transformer-500m-human-ref".
# Purpose: The tokenizer converts raw DNA sequences (strings of 'A', 'C', 'G', 'T') into numerical input IDs and 
# attention masks that the transformer model can understand. For the Nucleotide Transformer, this typically involves 
# k-mer tokenization (breaking the sequence into overlapping or non-overlapping short nucleotide chunks). i.e. into 6 mers
# So the sequence which is currently 300 bp converted into 50 tokens.

In [61]:
def tokenize_function(examples):
    outputs = tokenizer(examples["data"])
    return outputs

In [None]:
from datasets import load_dataset, Dataset

In [8]:
# Set the dataset_name to None or 'default' since the prompt is about the config name
# but 'promoter_all' is actually a value in the 'task' column.
dataset_config_name = None 

# Load the full downstream tasks dataset
full_train_dataset = load_dataset(
    "InstaDeepAI/nucleotide_transformer_downstream_tasks",
    dataset_config_name,  # Pass None to use the default configuration
    split="train",
    streaming=False,
)


In [33]:
full_train_dataset

Dataset({
    features: ['sequence', 'name', 'label', 'task'],
    num_rows: 461850
})

In [None]:
total_task = set(full_train_dataset['task'])
print(total_task)
print(len(total_task))

{'H3K9ac', 'enhancers', 'H3K4me1', 'splice_sites_acceptors', 'promoter_all', 'enhancers_types', 'splice_sites_all', 'H3K4me3', 'H3K36me3', 'promoter_tata', 'promoter_no_tata', 'H3K4me2', 'H3K79me3', 'H4ac', 'H4', 'H3K14ac', 'splice_sites_donors', 'H3'}
18


In [10]:
set(full_train_dataset['label'])

{0, 1, 2}

In [14]:
# Filter the dataset for the specific task "promoter_all"
# Filter the training split
train_dataset_enhancers = full_train_dataset.filter(
    lambda example: example["task"] == "promoter_all"
)

In [26]:
set(train_dataset_enhancers['task'])
set(train_dataset_enhancers['label'])

{0, 1}

In [19]:
# Load the full test split
full_test_dataset = load_dataset(
    "InstaDeepAI/nucleotide_transformer_downstream_tasks",
    None,  # Use 'None' for the default configuration
    split="test",
    streaming=False,
)

In [25]:
print(set(full_test_dataset['task']))
print(set(full_test_dataset['label']))

{'H3K9ac', 'enhancers', 'H3K4me1', 'splice_sites_acceptors', 'promoter_all', 'enhancers_types', 'splice_sites_all', 'H3K4me3', 'H3K36me3', 'promoter_tata', 'promoter_no_tata', 'H3K4me2', 'H3K79me3', 'H4ac', 'H4', 'H3K14ac', 'splice_sites_donors', 'H3'}
{0, 1, 2}


In [23]:
# Filter the test split
test_dataset_promoter = full_test_dataset.filter(
    lambda example: example["task"] == "promoter_all"
)

In [28]:
print(set(test_dataset_promoter['task']))
print(set(test_dataset_promoter['label']))

{'promoter_all'}
{0, 1}


In [None]:
# Get training data
train_sequences_enhancers = train_dataset_enhancers['sequence']
train_labels_enhancers = train_dataset_enhancers['label']

# Split the dataset into a training and a validation dataset
train_sequences_enhancers, validation_sequences_enhancers, train_labels_enhancers, validation_labels_enhancers = train_test_split(train_sequences_enhancers,
                                                                              train_labels_enhancers, test_size=0.10, random_state=42)

print(train_sequences_enhancers[0:2])
print(len(train_sequences_enhancers[0]))
print(train_labels_enhancers[0])
print(validation_sequences_enhancers[0])
print(len(validation_sequences_enhancers[0]))
print(validation_labels_enhancers[0])

['TTTCGTAAAACAACCATGTTGTCTTCTGCACAGTACTGTATTCAGTGGCGAAGATGGATAGCCAATCCTTAAGCTCGCTAAAGGTGTAAACCAGCAAAGGGGGAGCCCGCCTGCGATACAATTTGAGCCCTTGTCGCTGATTTGCTAAGAGAGGGAGGCAGATTGAGAGAGAGAGAGAGATAGAGGACCGAGAGTTAGGGATTGACTTTGGCGTGAGCGCGCGCATGTTGGAACAAGATCGGTGCTTATGGAAAGAGAGAGGGACCAGGACTACTATCCGACTGAAGAAGAAGAAGCCAG', 'CAAATTGGATTTTTCTTTTTCTTTCCTTCCTCCTTCCTTCCTTCTTTTTTTCTTTTAATTGGCAACTCAGATTTTTCGAAGTGTTTTTGCTATCTCACTGCTGGAAAGCCTGGTTCTGCCTTTCCTAAAATCTCGTGTGCAGGTTCGCACTCCGGCTACTTTCAGGCCTCTAGGGAGCCCAGGTAGCGGCGCGCACGCGCACGCGCACACTTCTCCCTCGCTGGTCTTCAGGCCCGGCCCGCCCTGTCCAGAGGCGCCGGGACCCAGGCGCCTGCAGCCGCCCGCCGGGCCGACGTCCCA']
300
0
TGCTCATAAAGGCCTTGGGCGTTTCTCTACCAAGCCCCAGAAGGCCCCCGTGTACGAAAAACCGACGGTGTCAATGAGGCGCGGGCTCCCCGCTATCAAAAAACTAGCGGGGCTGAGTTGATGTCAGTTACGCCTTACAAGTTCCGGGAAAGGACCACCGTATGACTGAGAAGAAGACGTTCAGGCATAGCGCGTTGATGTACGGGCCATACGTATACTTACATTGATTGCCATTCAGTGAGGCGGAGCAGAGTCTGCGGCAACAGCAGTAGCGGGCCGCCGCCGCCGCCATGAACCCCG
300
0


In [34]:
# Get test data
test_sequences_enhancers = test_dataset_promoter['sequence']
test_labels_enhancers = test_dataset_promoter['label']

In [48]:
print(test_sequences_enhancers[0])
print(len(test_sequences_enhancers[0]))
print(test_labels_enhancers[0])

ACCGTTGAGATAGAATAGTCGAGCGACCGTAGCAGTTGTACTACATTTAACATTGGGTTTTACCTAGGCGCTTCCTCAGCTACAGCGGCTAAAGCAGTTACTCAGCTAGATGGGTTGCTAAGTAAGCTCTTCAGATCAGAGCATGACGAAACGACTTGACGATGCGGCAGTGGGAGCTGGGCCACGTGGAGACTACTACATCTATGAGGATAAGGTTCCGGCCCGGTCGACTGCAACAGCGCCACCCCCAGAACCTCCTAAGCTGGTCAACGATAAGCCCCACAATTTCGTCTCTGGAAT
300
0


In [50]:
# Tokenizing the datasets
# Enhancer dataset
ds_train_enhancers = Dataset.from_dict({"data": train_sequences_enhancers,'labels':train_labels_enhancers})
ds_validation_enhancers = Dataset.from_dict({"data": validation_sequences_enhancers,'labels':validation_labels_enhancers})
ds_test_enhancers = Dataset.from_dict({"data": test_sequences_enhancers,'labels':test_labels_enhancers})

In [57]:
print("Train\n",ds_train_enhancers)
print("Validation\n",ds_validation_enhancers)
print("Test\n",ds_test_enhancers)

Train
 Dataset({
    features: ['data', 'labels'],
    num_rows: 47948
})
Validation
 Dataset({
    features: ['data', 'labels'],
    num_rows: 5328
})
Test
 Dataset({
    features: ['data', 'labels'],
    num_rows: 5920
})


In [79]:
tokenizer

EsmTokenizer(name_or_path='InstaDeepAI/nucleotide-transformer-500m-human-ref', vocab_size=4107, model_max_length=1000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '<unk>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<cls>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [63]:
# Creating tokenized enhancer dataset
tokenized_datasets_train_enhancers = ds_train_enhancers.map(
    tokenize_function,
    batched=True,
    remove_columns=["data"],
)

Map: 100%|██████████| 47948/47948 [00:15<00:00, 3026.32 examples/s]


In [None]:
print(tokenized_datasets_train_enhancers['input_ids'])
print(len(tokenized_datasets_train_enhancers['input_ids']))
print(len(tokenized_datasets_train_enhancers['input_ids'][0])) # so by tokenization from 300 it is converted to 50.
# 6 mers 300 / 6 = 50

Column([[3, 1393, 12, 651, 1501, 1660, 2262, 1865, 2275, 2832, 1992, 3717, 2644, 3698, 1043, 1860, 2620, 259, 3310, 3746, 3148, 352, 3753, 3517, 3163, 2320, 3328, 3984, 1488, 3280, 3280, 1235, 695, 855, 3867, 603, 3808, 3826, 3617, 1990, 202, 3965, 1152, 208, 3328, 2624, 2344, 1718, 1808, 784, 935], [3, 2057, 3865, 1433, 1433, 2654, 1690, 2413, 1625, 1385, 1289, 3974, 1589, 1375, 225, 1378, 1130, 637, 3847, 2689, 1662, 1389, 10, 1761, 3649, 1766, 1730, 1177, 2302, 1619, 3310, 983, 3058, 3634, 3634, 3622, 1438, 2493, 3945, 2302, 3054, 3755, 1680, 4030, 4046, 2303, 2684, 3774, 2995, 3766, 3500], [3, 2693, 2429, 1442, 3857, 2750, 2715, 1552, 3236, 564, 3250, 1971, 1646, 2999, 2879, 2322, 3386, 2765, 2766, 1344, 1965, 1710, 170, 1578, 1683, 2962, 1983, 3694, 1665, 3901, 3839, 3973, 3012, 1789, 603, 3745, 2474, 2684, 3826, 3675, 894, 3050, 4010, 2974, 3007, 3459, 890, 2290, 2671, 2841, 4074], [3, 3566, 3759, 3782, 4087, 3918, 2495, 3823, 3536, 3305, 4036, 1390, 1545, 3193, 2041, 637, 346, 8

In [81]:
### First index is alway the beginning of the tokens
print(tokenized_datasets_train_enhancers['input_ids'][0][0])
print(tokenized_datasets_train_enhancers['input_ids'][1][0])
print(tokenized_datasets_train_enhancers['input_ids'][2][0])
print(tokenized_datasets_train_enhancers['input_ids'][3][0])

3
3
3
3


In [84]:
tokenized_datasets_validation_enhancers = ds_validation_enhancers.map(
    tokenize_function,
    batched=True,
    remove_columns=["data"],
)
tokenized_datasets_test_enhancers = ds_test_enhancers.map(
    tokenize_function,
    batched=True,
    remove_columns=["data"],
)

Map: 100%|██████████| 5328/5328 [00:01<00:00, 2899.22 examples/s]
Map: 100%|██████████| 5920/5920 [00:02<00:00, 2927.73 examples/s]


In [88]:
print(tokenized_datasets_test_enhancers['input_ids'][0][0])
print(tokenized_datasets_test_enhancers['input_ids'][1][0])
print(tokenized_datasets_validation_enhancers['input_ids'][0][0])
print(tokenized_datasets_validation_enhancers['input_ids'][1][0])

3
3
3
3


In [90]:
# Fine Tuning 
batch_size = 8
model_name='nucleotide-transformer_enhancer'
args_enhancers = TrainingArguments(
    f"{model_name}-finetuned-NucleotideTransformer",
    remove_unused_columns=False,
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps= 1,
    per_device_eval_batch_size= 64,
    num_train_epochs= 2,
    logging_steps= 100,
    load_best_model_at_end=True,  # Keep the best model according to the evaluation
    metric_for_best_model="mcc_score", # The mcc_score on the evaluation dataset used to select the best model
    label_names=["labels"],
    dataloader_drop_last=True,
    max_steps= 1000
)


In [91]:
# Define the metric for the evaluation
def compute_metrics_mcc(eval_pred):
    """Computes Matthews correlation coefficient (MCC score) for binary classification"""
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    references = eval_pred.label_ids
    r={'mcc_score': matthews_corrcoef(references, predictions)}
    return r

In [97]:
trainer = Trainer(
    model,
    args_enhancers,
    train_dataset= tokenized_datasets_train_enhancers,
    eval_dataset= tokenized_datasets_validation_enhancers,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_mcc,
)

  trainer = Trainer(


In [98]:
train_results = trainer.train()

Step,Training Loss,Validation Loss,Mcc Score
100,0.1724,0.449981,0.79716
200,0.3683,0.514276,0.764191
300,0.3318,0.334206,0.816509
400,0.2977,0.289499,0.818478
500,0.2162,0.275525,0.855552
600,0.2058,0.265742,0.865453
700,0.2302,0.267372,0.860214
800,0.2452,0.232194,0.871608
900,0.2407,0.219733,0.871457
1000,0.1974,0.20971,0.875064
