##### Peptide prediction using antiberta
The code is based on the AntiBERTa [tutorial notebook on GitHub](https://github.com/alchemab/antiberta/blob/master/mlm.ipynb) that accompanies the [paper](https://www.cell.com/patterns/fulltext/S2666-3899(22)00105-2). Because we need a generative model, this notebook trains a masked language model (`RobertaForMaskedLM`).

In [1]:
from transformers import (
    RobertaConfig,
    RobertaTokenizer,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import re
# So progress can be seen: replace the `tqdm` attribute of transformers.trainer_utils
# with the notebook-widget tqdm imported below.
# NOTE(review): presumably the Trainer picks its progress bar up from this attribute —
# confirm against the installed transformers version.
from tqdm.notebook import tqdm
import transformers.trainer_utils as trainer_utils

trainer_utils.tqdm = tqdm
from datasets import Dataset, load_dataset
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    roc_auc_score,
    average_precision_score
)
import pandas as pd
import torch
import numpy as np
import random
import os

#### Tokenizing file with tokenizer from paper

In [2]:
# Load the tokeniser saved under "tokenizer"
tokenizer = RobertaTokenizer.from_pretrained("tokenizer")

# Masked-language-modelling collator, needed for batching;
# tokens are selected for masking with probability 0.15
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

#### Setting seed from notebook

In [3]:
def set_seed(seed: "int | None" = 42):
    """
    Seed every random number generator used in this notebook.

    Seeds PyTorch (CPU and all CUDA devices), NumPy and Python's ``random``,
    sets ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Seed value to apply everywhere. When ``None``, nothing is
            seeded and cuDNN deterministic mode is switched off instead.
    """
    if seed is None:
        # torch.manual_seed() rejects None, so the "no seed" request has to be
        # handled before any of the seeding calls below.
        torch.backends.cudnn.deterministic = False
        return

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels and no auto-tuner, so repeated runs match
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    # Only inherited by subprocesses: the running interpreter read
    # PYTHONHASHSEED at start-up and is not affected by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
set_seed()

#### Select a random HLA to generate peptides

In [4]:
# Read the HLA table from data/fullData/common_hla_sequence.csv
hla_df = pd.read_csv("data/fullData/common_hla_sequence.csv")

# Draw one row at random; keep its sequence and its name as parallel lists
randomRows = hla_df.sample(n=1)
hlaSequences = list(randomRows["HLA_sequence"])
hlaNames = list(randomRows["HLA"])

print("Randomly selected HLA sequences:", hlaSequences, sep="\n")
print("Randomly selected HLAs", hlaNames, sep="\n")

Randomly selected HLA sequences:
['YYSEYRNIYAQTDESNLYLSYDYYTWAERAYEWY']
Randomly selected HLAs
['HLA-B*07:02']


In [5]:
from datasets import load_dataset

# Load the full CSV once; it is filtered per selected HLA inside the loop
full_df = pd.read_csv('data/fullData/data.csv')


def tokenize_peptides(peptide_dataset):
    """Tokenize the "peptide" column of a dataset.

    Each sequence is padded/truncated to 150 tokens and the special-tokens
    mask is returned with it. The raw "peptide" column is removed from the
    result.

    Args:
        peptide_dataset: A dataset with a "peptide" text column.

    Returns:
        The mapped dataset holding the tokenizer outputs.
    """
    return peptide_dataset.map(
        lambda z: tokenizer(
            z["peptide"],
            padding="max_length",
            truncation=True,
            max_length=150,
            return_special_tokens_mask=True,
        ),
        batched=True,
        num_proc=1,
        remove_columns=["peptide"],
    )


tokenizersForTrain = []
tokenizersForEval = []
for hlaName, hlaSeq in zip(hlaNames, hlaSequences):
    # Keep only the peptides recorded for this HLA sequence
    filtered_df = full_df[full_df['HLA_sequence'].isin([hlaSeq])]
    filtered_df = filtered_df[["peptide"]]
    if filtered_df.empty:
        raise ValueError(f"No peptides found in data.csv for {hlaName}")

    # Build the dataset directly from the DataFrame: no temporary CSV that
    # every loop iteration would overwrite and re-read.
    dataset = Dataset.from_pandas(filtered_df, preserve_index=False)

    # Split dataset into train and eval (90% train, 10% eval)
    split = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split['train']
    eval_dataset = split['test']

    # Tokenize both splits with the same settings
    tokenized_train = tokenize_peptides(train_dataset)
    tokenizersForTrain.append(tokenized_train)
    tokenized_eval = tokenize_peptides(eval_dataset)
    tokenizersForEval.append(tokenized_eval)


Generating full split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/23563 [00:00<?, ? examples/s]

Map:   0%|          | 0/2619 [00:00<?, ? examples/s]

#### Model Configuration from paper

In [6]:
# Hyperparameters they used for pre-training.
antiberta_config = dict(
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_size=768,
    d_ff=3072,
    vocab_size=25,
    max_len=150,
    max_position_embeddings=152,
    batch_size=96,
    max_steps=225000,
    weight_decay=0.01,
    peak_learning_rate=0.0001,
)

In [7]:
# Build the RoBERTa configuration from the values in antiberta_config
model_config = RobertaConfig(
    type_vocab_size=1,
    vocab_size=antiberta_config.get("vocab_size"),
    hidden_size=antiberta_config.get("hidden_size"),
    max_position_embeddings=antiberta_config.get("max_position_embeddings"),
    num_attention_heads=antiberta_config.get("num_attention_heads", 12),
    num_hidden_layers=antiberta_config.get("num_hidden_layers", 12),
)

# One newly initialised masked-LM per selected HLA sequence
models = []
for i in range(len(hlaSequences)):
    models.append(RobertaForMaskedLM(model_config))
    # `model` stays bound to the most recently created instance
    model = models[-1]

In [8]:
# Construct the training arguments.
# Hyperparameters that exist in antiberta_config are read from it (with the
# previous literals as fallbacks) so the two cannot drift apart.
# Huggingface uses a default seed of 42
args = TrainingArguments(
    output_dir="test",
    overwrite_output_dir=True,
    per_device_train_batch_size=antiberta_config.get("batch_size", 32),
    per_device_eval_batch_size=antiberta_config.get("batch_size", 32),
    max_steps=antiberta_config.get("max_steps", 225000),
    save_steps=2500,
    logging_steps=2500,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    weight_decay=antiberta_config.get("weight_decay", 0.01),
    warmup_steps=10000,
    learning_rate=antiberta_config.get("peak_learning_rate", 1e-4),
    gradient_accumulation_steps=antiberta_config.get("gradient_accumulation_steps", 1),
    disable_tqdm=False,
    # Mixed precision is only requested when a CUDA device is present, so the
    # notebook still runs (in full precision) on a machine without one.
    fp16=torch.cuda.is_available(),
    eval_strategy="steps",
    remove_unused_columns=False
)

In [None]:
# Train one model per selected HLA and save each under its own directory.
# The per-HLA lists are built by separate loops over hlaSequences, so they
# must line up index by index.
assert len(models) == len(tokenizersForTrain) == len(tokenizersForEval) == len(hlaSequences)

for i in range(len(hlaSequences)):
    # Use the i-th model and datasets; the bare `model` / `tokenized_train` /
    # `tokenized_eval` names only hold whatever the earlier loops created last.
    # NOTE: `args` is shared, so every run writes its intermediate checkpoints
    # to the same args.output_dir.
    trainer = Trainer(
        model=models[i],
        args=args,
        data_collator=collator,
        train_dataset=tokenizersForTrain[i],
        eval_dataset=tokenizersForEval[i],
    )
    trainer.train()

    # Make the HLA sequence safe to use in a folder name
    safe_hla = re.sub(r'[^A-Za-z0-9_\-]', '_', hlaSequences[i])

    # Save the final model under models/HLA_<sequence>
    model_dir = os.path.join("models", f"HLA_{safe_hla}")
    os.makedirs(model_dir, exist_ok=True)
    trainer.save_model(model_dir)