In [1]:
import pandas as pd

# Read the spreadsheet of MS-validated peptides and show it as the cell output.
peptides_xlsx = "data/vaccine/MS_validated_peptides.xlsx"
df = pd.read_excel(peptides_xlsx)
df

Unnamed: 0,Allele,Peptide,Mutation
0,A11;A68,VVVGAGDVGK,KRAS-G13D
1,A01,ILDTAGKEEY,KRAS-Q61K
2,A03,ATTTAPPLSGK,CTNNB1-S45P
3,A11;A68,TTAPPLSGK,CTNNB1-S45P
4,A11;A68,TTTAPFLSGK,CTNNB1-S45F
5,A01,ILDEAYVMAY,ERBB2-Y772_A775dup
6,A02,KLVVVGADGV,KRAS-G12D
7,A03;A11;A68,VVVGADGVGK,KRAS-G12D
8,B0702;C0304;C0802,GADGVGKSAL,KRAS-G12D
9,A0101,ILDTAGREEY,KRAS-Q61R


In [2]:
import torch
import pickle
from fennet.mhc.mhc_binding_retriever import MHCBindingRetriever
from fennet.mhc.mhc_binding_model import ModelHlaEncoder, ModelSeqEncoder

model_version = "v0819"

# Prefer the GPU, but fall back to CPU so this cell still runs on a machine
# without CUDA instead of raising while moving/loading the weights.
# NOTE(review): confirm the downstream retriever also honours the encoders'
# device rather than assuming CUDA internally.
device = "cuda" if torch.cuda.is_available() else "cpu"

pept_encoder = ModelSeqEncoder().to(device)
hla_encoder = ModelHlaEncoder().to(device)

# Restore the trained weights for this model version onto the chosen device.
hla_encoder.load_state_dict(
    torch.load(f"model/HLA_model_{model_version}.pt", map_location=device)
)
pept_encoder.load_state_dict(
    torch.load(f"model/pept_model_{model_version}.pt", map_location=device)
)

# The encoders are only used for inference in this notebook, so put them in
# eval mode (harmless if the retriever does the same later).
hla_encoder.eval()
pept_encoder.eval()

# SECURITY: pickle.load can execute arbitrary code. Only open embedding files
# that were produced by a trusted run of this project.
with open(f"embeds/hla_{model_version}_embeds.pkl", "rb") as f:
    data_dict = pickle.load(f)

# The pickle stores the HLA table and its embeddings under these two keys.
hla_df = data_dict["protein_df"]
hla_embeds = data_dict["embeds"]

In [3]:
# FASTA file handed to the retriever.
# NOTE(review): despite the `_list` suffix this is a single path string —
# confirm MHCBindingRetriever accepts a bare string here.
fasta_list = "uniprotkb_UP000005640_AND_reviewed_true_2024_03_01.fasta"
# Build the retriever from the two encoders plus the HLA table and embeddings
# loaded from the pickle in the previous cell.
retriever = MHCBindingRetriever(
    hla_encoder,
    pept_encoder,
    hla_df,
    hla_embeds,
    fasta_list,
    # presumably the (min, max) peptide length used when digesting the FASTA
    # — TODO confirm against MHCBindingRetriever
    digested_pept_lens=(8, 15),
)

In [4]:
# Distances between the peptides in `df` and the HLA embeddings.
# Presumably one row per peptide and one column per HLA entry — TODO confirm.
peptide_sequences = df["Peptide"].tolist()
dists_matrix = retriever.get_binding_distances(hla_embeds, peptide_sequences)
dists_matrix.shape

(28, 16454)

In [5]:
import numpy as np


def filter_alleles_by_distance_for_epitopes(
    distance_matrix, distance_threshold, hla_alleles
):
    """Return, for each row, the alleles whose distance is within the threshold.

    Parameters
    ----------
    distance_matrix : 2-D array-like
        One row per epitope; column ``j`` holds the distance to
        ``hla_alleles[j]``. Nested lists are accepted as well as arrays.
    distance_threshold : float
        Inclusive upper bound: an allele is kept when its distance is
        ``<= distance_threshold``, matching the ``<=`` used in the column
        labels built from this function's output.
    hla_alleles : sequence
        Allele identifiers, indexable by column position.

    Returns
    -------
    list of list
        One list per row of ``distance_matrix``, containing the selected
        entries of ``hla_alleles`` in column order.
    """
    # Coerce once so plain Python lists can be compared element-wise.
    distance_matrix = np.asarray(distance_matrix)
    allele_idx_lists = [
        np.where(row <= distance_threshold)[0] for row in distance_matrix
    ]
    allele_id_lists = [
        [hla_alleles[i] for i in id_list] for id_list in allele_idx_lists
    ]
    return allele_id_lists

In [6]:
# Distance cut-offs to evaluate; each one produces its own column in `df`.
dist_threshold_list = [0.1, 0.2, 0.3]

# The allele names do not change between thresholds, so build the list once.
allele_names = hla_df["allele"].tolist()

for dist_threshold in dist_threshold_list:
    column_name = f"Extend Alleles <={dist_threshold}"
    df[column_name] = filter_alleles_by_distance_for_epitopes(
        dists_matrix, dist_threshold, allele_names
    )

In [7]:
# Inspect the frame now that the per-threshold allele columns have been added.
df

Unnamed: 0,Allele,Peptide,Mutation,Extend Alleles <=0.1,Extend Alleles <=0.2,Extend Alleles <=0.3
0,A11;A68,VVVGAGDVGK,KRAS-G13D,"[A03_420, A11_406, F01_06, F01_11, F01_08, F01...","[A03_12, A11_14, A03_420, A11_406, F01_06, F01...","[B15_128, B14_12, A03_12, A68_01, A01_15N, A11..."
1,A01,ILDTAGKEEY,KRAS-Q61K,"[A01_274, A01_324, A01_342, A01_220, A01_292, ...","[A01_274, A01_324, A30_175, A01_210, A01_342, ...","[A33_171, A31_153, A01_274, A29_37, A29_56, A2..."
2,A03,ATTTAPPLSGK,CTNNB1-S45P,"[A03_420, A11_284, A03_396, A03_243, A03_224, ...","[A03_12, A11_14, A03_420, A11_406, F01_06, F01...","[B15_128, B14_12, A03_12, A01_15N, A11_14, B18..."
3,A11;A68,TTAPPLSGK,CTNNB1-S45P,"[A03_12, A11_406, F01_06, F01_03, F01_05, F01_...","[B14_12, A03_12, A68_01, A11_14, A03_420, A11_...","[B15_128, B14_12, A03_12, A68_01, A01_15N, A11..."
4,A11;A68,TTTAPFLSGK,CTNNB1-S45F,"[A03_12, A11_406, F01_06, F01_03, F01_05, F01_...","[A03_12, A68_01, A03_420, A11_406, A34_23, A34...","[B15_128, B14_12, A03_12, A68_01, A01_15N, A11..."
5,A01,ILDEAYVMAY,ERBB2-Y772_A775dup,"[A01_274, A01_324, A01_342, A01_220, A01_292, ...","[A01_274, A01_324, A30_175, A01_210, A01_342, ...","[A33_171, A31_153, A01_274, A29_37, A29_56, A2..."
6,A02,KLVVVGADGV,KRAS-G12D,"[B13_166, B13_15, B13_120, B13_03]","[A02_884, A02_929, A02_1047, A02_763, A02_874,...","[A02_236, A02_01, A02_132, A02_759, A02_765, A..."
7,A03;A11;A68,VVVGADGVGK,KRAS-G12D,"[A03_420, A11_406, F01_06, F01_01, F01_11, F01...","[A03_12, A11_14, A03_420, A11_406, F01_06, F01...","[B15_128, B14_12, A03_12, A68_01, A01_15N, A11..."
8,B0702;C0304;C0802,GADGVGKSAL,KRAS-G12D,"[C05_171, C05_160, C05_58, C08_239, C05_281, C...","[C12_262, C08_200, C05_171, C08_53, C05_200, C...","[A02_851, A02_704, A02_646, A02_87, A02_136, A..."
9,A0101,ILDTAGREEY,KRAS-Q61R,"[A01_274, A01_324, A01_342, A01_220, A01_292, ...","[A01_274, A01_324, A30_175, A01_210, A01_342, ...","[A33_171, A31_153, A01_274, A29_37, A29_56, A2..."


In [8]:
# Flatten every "Extend Alleles <=..." column from a list of allele names into
# one ";"-separated string (the same convention as the existing "Allele"
# column) so it serialises cleanly.
# Values that are already strings are left untouched, so re-running this cell
# does not interleave ";" between every character of a joined string.
extend_allele_columns = [
    col for col in df.columns if str(col).startswith("Extend Alleles <=")
]
for col in extend_allele_columns:
    df[col] = df[col].apply(
        lambda alleles: alleles if isinstance(alleles, str) else ";".join(alleles)
    )

In [9]:
# Persist the annotated table; the row index is not written to the file.
output_csv = "data/vaccine/predict_extend_allele_for_shared_antigen_nm.csv"
df.to_csv(output_csv, index=False)