In [6]:
import random

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch  # needed here: torch.manual_seed below runs before the model-loading cell

# Font sizes used for the different parts of a plot.
SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 20

# Global matplotlib configuration built from the sizes above; adjust the
# assignments to suit the figures being produced.
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

# Seed every random number generator used in the notebook with the same value.
seed = 6
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Base colours for the plots.
colors = ['#9BC995', "#083D77", '#9A031E', '#C4B7CB', '#FC7753']

# Create a seaborn palette from these colors
custom_palette = sns.color_palette(colors)
# Request 15 colours from the 5-colour base palette.
palette = sns.color_palette(custom_palette, n_colors=15)

#### Load Model

In [7]:
import os
import pandas as pd
import torch
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaForMaskedLM

# Ensure the "results" directory exists (no error if it is already there).
os.makedirs("results", exist_ok=True)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the masked-LM model from their local directories,
# moving the model onto the selected device straight away.
tokenizer = RobertaTokenizer.from_pretrained("./tokenizer")
model = RobertaForMaskedLM.from_pretrained("./modelsHlaPeptide").to(device)

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(26, 768, padding_idx=1)
      (position_embeddings): Embedding(152, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [9]:
from generatePeptides import generatePeptidesNaive
import pandas as pd 
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
# Number of most frequent HLA alleles to assess.
topN = 5
# Peptide length; passed as `peptide_length` when peptides are generated.
peptideLengths = 9

df = pd.read_csv("data/fullData/data.csv")

# Count how many rows each HLA allele name has. (A previous value_counts() on
# "HLA_sequence" was immediately overwritten by this assignment and was dropped.)
hla_counts = Counter(df["HLA"])

# (allele name, count) pairs sorted from most to least common.
most_common_hlas = hla_counts.most_common()

# Print the topN HLA names with the most samples.
for hla, count in most_common_hlas[:topN]:
    print(f"{hla}: {count}")

HLA-B*15:02: 37616
HLA-A*02:01: 30583
HLA-B*57:01: 29221
HLA-B*07:02: 26182
HLA-A*01:01: 13103


In [10]:
data = pd.read_csv("data/fullData/data.csv")
hlas = most_common_hlas[:topN]  # (allele name, row count) pairs

peptidesReal = {}        # allele name -> array of peptides found in the data
peptidesGenerated = {}   # allele name -> result of generatePeptidesNaive
hla_sequences = {}       # allele name -> the single HLA_sequence value for it
peptidesToGenerate = 50  # forwarded to the generator as `top_k`
for hla, _ in hlas:
    # Filter the rows for this allele once and reuse them for both columns.
    hla_rows = data[data['HLA'] == hla]

    # Real peptides recorded for this HLA.
    peptidesReal[hla] = hla_rows['peptide'].values

    # Every row of an allele must carry the same HLA sequence.
    unique_seqs = hla_rows['HLA_sequence'].unique()
    if len(unique_seqs) != 1:
        # Show only the conflicting sequences and the allele; dumping all real
        # peptides collected so far would flood the output.
        print(unique_seqs, hla)
        raise ValueError(f"Multiple different sequences found for HLA {hla}")
    hla_sequences[hla] = unique_seqs[0]

    # Generate peptides using the extracted sequence.
    peptidesGenerated[hla] = generatePeptidesNaive(
        tokenizer,
        model,
        hla_sequence=hla_sequences[hla],
        peptide_length=peptideLengths,
        top_k=peptidesToGenerate
    )

In [None]:

for hla, _ in hlas:
    real_peps = peptidesReal[hla]
    gen_peps = peptidesGenerated[hla]

    # Tokenize the HLA *sequence* once per allele. `hla` itself is only the
    # allele name (e.g. "HLA-B*15:02"); the sequence given to the model during
    # generation is hla_sequences[hla], so the same string is encoded here.
    hlaTokens = tokenizer.tokenize(hla_sequences[hla])
    hlaTokenIds = tokenizer.convert_tokens_to_ids(hlaTokens)
    # Index of the first peptide token in the layout [CLS] HLA [SEP] peptide [SEP].
    peptideStart = len(hlaTokenIds) + 2

    for pep in gen_peps:
        peptideTokens = tokenizer.tokenize(pep)
        peptideTokenIds = tokenizer.convert_tokens_to_ids(peptideTokens)

        # Assemble the input by hand: cls + HLA ids + sep + peptide ids + sep.
        inputIds = [tokenizer.cls_token_id] + hlaTokenIds + [tokenizer.sep_token_id]
        inputIds += peptideTokenIds + [tokenizer.sep_token_id]
        inputIds = torch.tensor([inputIds]).to(device)  # batch of one sequence
        attentionMask = torch.ones_like(inputIds).to(device)  # no padding, attend everywhere

        with torch.no_grad():
            # Run only the encoder (model.roberta), not the masked-LM head.
            outputs = model.roberta(input_ids=inputIds, attention_mask=attentionMask)
            hidden_states = outputs.last_hidden_state
            # Keep the final-layer vectors at the peptide token positions only.
            peptide_hidden_states = hidden_states[0, peptideStart:peptideStart + len(peptideTokenIds), :]
