In [None]:
import os
import re

import pandas as pd
from tqdm import tqdm

tqdm.pandas()

### Helper functions

In [None]:
# Format IEDB dataset
def format_iedb_dataset(df):
    """Reduce a raw IEDB export to the four columns used downstream.

    Columns are selected by position (index 2, index 4, second-to-last and
    last) and renamed to ``peptide``, ``hla``, ``type`` and ``measure``.
    Exact duplicate rows are removed and the index is renumbered from 0.
    The input frame is left untouched.
    """
    wanted_positions = [2, 4, -2, -1]
    new_names = ['peptide', 'hla', 'type', 'measure']

    subset = df.iloc[:, wanted_positions].copy()
    subset.columns = new_names
    deduplicated = subset.drop_duplicates()
    return deduplicated.reset_index(drop=True)

# Extract peptide and PTM info from annotation
def extract_peptide(txt):
    """Split an annotation string into ``(peptide, ptm, location)``.

    The peptide is the first whitespace-separated token. A modification is
    recognised when the text contains ``+ NAME(LOCATION)`` where NAME consists
    of letters only; otherwise both ``ptm`` and ``location`` are ``None``.
    """
    peptide = txt.split()[0]

    # Shortest span after "+ " that ends with a closing parenthesis.
    annotation = re.search(r"\+ (.*?\))", txt)
    if annotation is None:
        return peptide, None, None

    # The span must start with a letters-only name followed by "(...)".
    parsed = re.match(r'([A-Za-z]+)\(([^)]+)\)', annotation.group(1))
    if parsed is None:
        return peptide, None, None

    ptm_name, location = parsed.groups()
    return peptide, ptm_name, location

# Filter HLA nomenclature and PTM
def filter_df(df):
    """Keep rows with a recognised allele name and normalise the assay type.

    A row is kept when its ``hla`` value contains ``HLA``, ``H2`` or ``H-2``
    (case-insensitive) and its ``ptm`` is not one of the excluded codes.
    The ``type`` column is replaced by a category derived from keywords in
    the lower-cased original text; rows matching no keyword become
    ``'BA_quantitative'``. Duplicates are dropped and the index is reset.
    """
    excluded_ptms = {'SCM', 'MULT', 'INDIST', 'MCM', 'UNK', 'OTH'}

    # (keyword, category) pairs in priority order: the first keyword found wins.
    assay_keywords = (
        ('qualitative', 'BA_qualitative'),
        ('kd', 'BA_KD'),
        ('ic50', 'BA_IC50'),
        ('ec50', 'BA_EC50'),
        ('ka', 'BA_KA'),
        ('life', 'BA_HL'),
        ('3d', '3D'),
        ('spectrometry', 'MS'),
    )

    def categorize(description):
        for needle, category in assay_keywords:
            if needle in description:
                return category
        return 'BA_quantitative'

    allele_ok = df['hla'].astype(str).str.contains(r'(?:HLA|H-?2)', case=False, regex=True)
    ptm_ok = ~df['ptm'].isin(excluded_ptms)
    keep = allele_ok & ptm_ok

    descriptions = df['type'].fillna('').astype(str).str.lower()
    categories = pd.Series(
        [categorize(text) for text in descriptions],
        index=df.index,
        dtype='object',
    )

    kept = df.loc[keep].assign(type=categories.loc[keep].to_numpy())
    return kept.drop_duplicates().reset_index(drop=True)

# Drop rows with conflicting labels
def drop_conflicting_rows(df, labels_only=False):
    """Remove rows whose binder label is contradicted by other rows.

    Step 1 (always): rows that agree on every column except ``label`` but
    carry more than one distinct label are all removed. Missing values are
    replaced by ``'0'`` for this comparison only, so a missing value and the
    string ``'0'`` count as the same key.

    Step 2 (only when ``labels_only`` is False): within each ``(type, hla)``
    group that contains both labels, the first number found in ``measure`` is
    compared across labels. For assay types where lower values are better,
    rows whose value lies between the smallest non-binder and the largest
    binder value (inclusive) are removed when those ranges overlap; the
    mirrored rule applies to types where higher values are better.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label``; step 2 additionally needs ``type``, ``hla``,
        ``measure`` and ``peptide``.
    labels_only : bool, default False
        When True, stop after step 1 and return the surviving rows with
        their original index.

    Returns
    -------
    pandas.DataFrame
        With ``labels_only=False`` the result is sorted by ``peptide`` and
        ``hla`` and re-indexed from 0. An empty frame is returned (instead of
        raising) when nothing survives.
    """
    df = df.copy()
    if df.empty:
        # Nothing to compare; avoids pd.concat([]) raising further down.
        return df if labels_only else df.reset_index(drop=True)

    key_cols = [c for c in df.columns if c != 'label']
    label_nunique = df.fillna('0').groupby(key_cols)['label'].transform('nunique')
    df = df.loc[label_nunique == 1].copy()

    if labels_only:
        return df

    # First unsigned decimal number in the measurement text; NaN when absent.
    df['measure_num'] = (
        df['measure']
        .astype(str)
        .str.extract(r'([0-9]+(?:\.[0-9]+)?)')[0]
        .astype(float)
    )

    lower_better = {'BA_IC50', 'BA_EC50', 'BA_KD'}
    higher_better = {'BA_HL', 'BA_quantitative'}

    cleaned = []
    # dropna=False: groupby would otherwise silently discard every row whose
    # type or hla is missing.
    for (assay_type, _allele), sub in df.groupby(['type', 'hla'], sort=False, dropna=False):
        labels = set(sub['label'])

        # Only groups with numeric measurements and both labels can conflict.
        if sub['measure_num'].isna().all() or labels != {0, 1}:
            cleaned.append(sub)
            continue

        binders = sub.loc[sub['label'] == 1, 'measure_num']
        nonbinders = sub.loc[sub['label'] == 0, 'measure_num']

        if assay_type in lower_better:
            max_binder = binders.max()
            min_nonbinder = nonbinders.min()
            if pd.notna(max_binder) and pd.notna(min_nonbinder) and max_binder > min_nonbinder:
                conflict = sub['measure_num'].between(min_nonbinder, max_binder)
                sub = sub.loc[~conflict]

        elif assay_type in higher_better:
            min_binder = binders.min()
            max_nonbinder = nonbinders.max()
            if pd.notna(min_binder) and pd.notna(max_nonbinder) and max_nonbinder > min_binder:
                conflict = sub['measure_num'].between(min_binder, max_nonbinder)
                sub = sub.loc[~conflict]

        cleaned.append(sub)

    # `cleaned` is empty when every row was removed in step 1.
    result = pd.concat(cleaned, ignore_index=True) if cleaned else df
    return (
        result
        .drop(columns='measure_num')
        .sort_values(['peptide', 'hla'])
        .reset_index(drop=True)
    )

# Filter out peptides with invalid PTM-amino acid combinations
def _loc_has_residue(loc, residues):
    """Return True when any character of the location string is in ``residues``."""
    return isinstance(loc, str) and any(ch in residues for ch in loc)


def _loc_is_nterm(loc):
    """Return True when the first number in the location string equals 1."""
    if not isinstance(loc, str):
        return False
    number = re.search(r'\d+', loc)
    return number is not None and int(number.group()) == 1


def filter_ptm(df: pd.DataFrame) -> pd.DataFrame:
    """Keep rows whose PTM is plausible for the residue/position it is on.

    Reads the ``peptide``, ``ptm`` and ``ptm_location`` columns. A row is
    removed when

    * the peptide contains any of ``_ B O U J Z X`` (or is not a string),
    * the PTM code is in the always-invalid set, or
    * the PTM code has a rule below and the location does not satisfy it.

    PTM codes without a rule (including missing values) are accepted. A known
    PTM whose location is missing, or lacks the position number a rule needs,
    is treated as invalid rather than raising.

    Returns a new frame with the index renumbered from 0.
    """
    def on(residues):
        # Location must mention one of the given residue letters.
        return lambda pep, loc: _loc_has_residue(loc, residues)

    def nterm(pep, loc):
        # Location must be position 1.
        return _loc_is_nterm(loc)

    def on_or_nterm(residues):
        return lambda pep, loc: _loc_has_residue(loc, residues) or _loc_is_nterm(loc)

    def sequon_or(residues):
        # NOTE(review): re.match anchors at the start of the peptide, so the
        # N-x-[ST] motif is only recognised at positions 1-3 and is not tied
        # to the annotated location - confirm this is intended.
        return lambda pep, loc: (
            bool(re.match(r'N[^P][ST]', pep)) or _loc_has_residue(loc, residues)
        )

    ptm_motifs = {
        'DIMETH': on('KR'),
        'METH':   on('CHNQKRILDE'),
        'OX':     on('KPC'),  # M, W, H artifact
        'PHOS':   on('STY'),
        'GLYC':   sequon_or('STY'),
        'NAc':    nterm,
        'NAh':    nterm,
        'ACET':   on_or_nterm('STK'),
        'FORM':   on_or_nterm('K'),
        'SUCC':   on_or_nterm('K'),
        'PALM':   on('CST'),
        'FARN':   on('C'),
        'NIT':    on('STY'),
        'SULF':   on('STYW'),
        'NITSYL': on('CM'),
        'ADP':    on('RKEDSTYCQNH'),
        'DEAM':   on('NQ'),
        'CYSTL':  on('C'),
        'TRIOX':  on('C'),
        'CROTO':  on('K'),
        'UBQ':    on('K'),
        'AMID':   on('DE'),
        'Hydroxykynurenine': on('W'),
        'kynurenine': on('W'),
        'GGHK':   on('K'),
        'PYRE':   on('QE'),
        'AspSA':  on('M'),
        'PLP':    nterm,  # PLP needs free N-terminus
        'DHA':    on('C'),
        'DEHY':   on('SCTY'),
        'SELEN':  on('S'),
        'NITRO':  on('C'),
        'MYRI':   lambda pep, loc: _loc_has_residue(loc, 'G') and _loc_is_nterm(loc),
        'SCHIFF': nterm,
        'DIDEHY': on('T'),  # intermediate in threonine dehydrogenase pathway
        'CRTAL':  on('K'),  # Michael addition
        'CRTNL':  on('K'),  # Kcr
        'AEBS':   on_or_nterm('SYKH'),
        'MAL':    on('K'),
        'ADPR':   on_or_nterm('RDNQEKTCS'),
        'GALA':   sequon_or('STY'),
    }
    invalid_ptms = {'CBML', 'AML', 'GLUCU', 'GLUC', 'DHEX'}
    invalid_aa_pattern = re.compile(r'[_BOUJZX]')

    def is_valid_ptm(pep, loc, ptm):
        if not isinstance(pep, str) or invalid_aa_pattern.search(pep) or ptm in invalid_ptms:
            return False
        return bool(ptm_motifs.get(ptm, lambda *_: True)(pep, loc))

    flags = [
        is_valid_ptm(pep, loc, ptm)
        for pep, loc, ptm in zip(df['peptide'], df['ptm_location'], df['ptm'])
    ]
    # Positional boolean mask: independent of the frame's index labels and
    # well-defined for an empty frame.
    keep = pd.Series(flags, dtype=bool).to_numpy()
    return df.loc[keep].reset_index(drop=True)

# Parse sptxt file
def parse_sptxt_file(data_path):
    """Parse one ``.sptxt`` spectral-library file into a list of record dicts.

    The allele name is taken from the file name without its extension.
    Everything up to the first ``### ===`` marker (when present) is skipped,
    and the remainder is split into records on blank lines. Records without a
    line starting with ``Name:`` are ignored.

    Each returned dict has the keys

    * ``peptide``      - the Name sequence reduced to the 20 standard
      amino-acid letters,
    * ``hla``          - file-name stem,
    * ``ptm``          - ``"0"`` when ``Mods=`` is absent/empty/``0``;
      otherwise the third comma-separated field of ``Mods=`` up to the first
      ``/`` (``None`` if there are fewer than three fields),
    * ``ptm_location`` - index of the first ``[`` in the raw sequence (an
      ``int``), or the string ``"0"`` when the sequence has no ``[``,
    * ``label``        - always ``1.0``.

    Only the first modification of a record is reported.
    """
    AA_SET = set("ACDEFGHIKLMNPQRSTVWY")
    hla = os.path.splitext(os.path.basename(data_path))[0]

    with open(data_path, 'r') as f:
        text = f.read()

    if "### ===" in text:
        text = text.split("### ===", 1)[1]

    out = []
    for rec in re.split(r'\n\s*\n', text):
        lines = rec.splitlines()

        # 1) Raw sequence from Name line (keep the '[...]' part).
        # A default is required here: a record may contain the text "Name:"
        # (e.g. in a "FullName:" line) without having a Name line, and a bare
        # next() would then raise StopIteration.
        raw_name = next((ln for ln in lines if ln.startswith("Name:")), None)
        if raw_name is None:
            continue
        raw_seq = raw_name.split("Name:", 1)[1].strip().split("/", 1)[0]

        # 2) PTM location = position of '[' (0-based index) interpreted as 1-based
        ptm_location = raw_seq.find("[") if "[" in raw_seq else "0"

        # 3) Clean peptide (remove brackets, digits, etc.)
        peptide = "".join(ch for ch in raw_seq if ch in AA_SET)

        # 4) PTM type from Mods= in Comment line
        comment = next((ln for ln in lines if ln.startswith("Comment:")), "")
        mods = next(
            (tok[5:] for tok in comment.split() if tok.startswith("Mods=")),
            "0"
        )

        if mods in ("", "0"):
            ptm = "0"
        else:
            parts = mods.split(",")
            ptm = parts[2].split('/')[0] if len(parts) >= 3 else None

        out.append({
            "peptide":       peptide,
            "hla":           hla,
            "ptm":           ptm,
            "ptm_location":  ptm_location,
            "label":         1.0
        })

    return out

## IEDB dataset
Download the IEDB dataset with the following filter options (CSV, single header, IEDB Website Displayed):
1. Epitope: Linear peptide
2. MHC Assay: Outcome-Positive-Any
3. MHC Restriction: Class I-Resolution-Any
4. MHC Source: Human
5. Disease: Any
6. Reference: Date-to 2024

In [None]:
# Load the two raw IEDB exports (positive and negative files).
pos_df, neg_df = (
    pd.read_csv(csv_name)
    for csv_name in ('raw_positive.csv', 'raw_negative.csv')
)

In [None]:
# Reduce each raw export to the working columns and attach its label
# (1.0 for the positive file, 0.0 for the negative file).
pos_df = format_iedb_dataset(pos_df)
pos_df['label'] = 1.0
neg_df = format_iedb_dataset(neg_df)
neg_df['label'] = 0.0

# ignore_index=True: both inputs are indexed from 0, so a plain concat would
# leave duplicate index labels and make the label-aligned assignment below
# depend on the two indexes happening to be identical.
df = pd.concat([pos_df, neg_df], ignore_index=True)

# Split the annotation into peptide / PTM name / PTM location. The tuples are
# expanded with a single DataFrame construction instead of `.apply(pd.Series)`,
# which builds one Series per row.
parsed = df['peptide'].progress_apply(extract_peptide)
df[['peptide', 'ptm', 'ptm_location']] = pd.DataFrame(
    parsed.tolist(),
    index=df.index,
    columns=['peptide', 'ptm', 'ptm_location'],
)

df = filter_df(df)
df_clean = drop_conflicting_rows(df)
df_clean['length'] = df_clean['peptide'].progress_apply(len)

# Keep peptides of length 8-15 and the columns shared with the SysteMHC data.
length_ok = (df_clean['length'] < 16) & (df_clean['length'] > 7)
final_df = df_clean.loc[
    length_ok,
    ['peptide', 'length', 'hla', 'label', 'ptm', 'ptm_location'],
].drop_duplicates()
iedb_final = filter_ptm(final_df)
iedb_final

In [None]:
# Write binders (label 1) and non-binders (label 0) to separate CSV files.
iedb_outputs = {
    1: 'iedb_positive_preprocessed_to_2024.csv',
    0: 'iedb_negative_preprocessed_to_2024.csv',
}
for label_value, csv_name in iedb_outputs.items():
    label_rows = iedb_final[iedb_final['label'] == label_value]
    label_rows.to_csv(csv_name, index=False)

## SysteMHC Atlas v2 dataset
Download the class I allele-specific libraries from https://systemhc.sjtu.edu.cn/download:
1. All class I alleles except SLAs
2. Save all .sptxt files in the same directory

In [None]:
SPTXT_DIR = "Your path"

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of the parsed table (and of everything derived from it) differ
# between machines and runs.
files = sorted(fn for fn in os.listdir(SPTXT_DIR) if fn.lower().endswith(".sptxt"))
all_recs = []
for fn in tqdm(files, desc="Parsing .sptxt files"):
    path = os.path.join(SPTXT_DIR, fn)
    all_recs.extend(parse_sptxt_file(path))

df = pd.DataFrame(all_recs, columns=["peptide", "hla", "ptm", "ptm_location", "label"])

# File-name stems -> allele names: "HLA-A02_01" becomes "HLA-A*02:01" and a
# leading "H-2-" becomes "H2-". Names not matching either pattern are kept.
df['hla'] = df['hla'].str.replace(
    r'^(HLA-[A-Z])(\d{2})_(\d{2})$',
    r'\1*\2:\3',
    regex=True
).str.replace(
    r'^H-2-',
    'H2-',
    regex=True
)

df['length'] = df['peptide'].apply(len)

# vague ptm filtration
# Ammonia loss is too vague
# Pro->Pyrrolidinone can never exist internally
mask = df['ptm'].str.contains(r'USM|Cation|Ammonia-loss|Pro->Pyrrolidinone', na=False)
df_clean = df[~mask].copy()

In [None]:
# Maps the modification names produced by parse_sptxt_file (taken from the
# `Mods=` field) to the short PTM codes checked by filter_ptm. "0" means
# unmodified. A value of None marks a modification to exclude: the mapping
# cell applies this dict with Series.map and then calls dropna(), so rows
# mapped to None - and rows whose name is not listed here at all - are removed.
# NOTE(review): "Sulfo" maps to "OX" while "Sulfide" maps to "SULF";
# filter_ptm accepts OX only on K/P/C and SULF only on S/T/Y/W - confirm the
# two targets are not swapped.
PTM_MAP = { 
    "0":                         "0",
    "ADP-Ribosyl":               "ADPR",
    "AEBS":                      "AEBS",
    "Acetyl":                    "ACET",
    "Carbamyl":                  "CBML",
    "Carboxymethyl":             "CMETH",
    "Carboxy":                   "CBX",
    "Cys->Dha":                  "DHA",
    "Cysteinyl":                 "CYSTL",
    "Crotonaldehyde":            "CRTAL",
    "Crotonyl":                  "CRTNL",
    "dHex":                      None,         # dHEX is unclear
    "Deamidated":                "DEAM",
    "Dehydrated":                "DEHY",
    "Delta-H(2)C(2)":            "SCHIFF",     # Schiff base with acetaldehyde
    "Delta:S(-1)Se(1)":          "SELEN",
    "Didehydro":                 "DIDEHY",
    "Dimethyl":                  "DIMETH",
    "Farnesyl":                  "FARN",
    "Formyl":                    "FORM",
    "Fucosylation":              "FUCO",
    "Galactosyl":                "GALA",
    "Gln->pyro-Glu":             "PYRE",
    "Glu->pyro-Glu":             "PYRE",
    "Glucosylgalactosyl":        "GGHK",       # Glucosylgalactosyl hydroxylysine
    "Glucuronyl":                None,         # Hexuronic acid is unclear
    "Hex":                       None,         # Hexose is unclear
    "HexNAc":                    None,
    "HexNAc1dHex1":              None,
    "HexNAc2":                   None,
    "Label:2H(4)":               None,         # quantitative tag
    "Malonyl":                   "MAL",
    "Met->AspSA":                "AspSA",      # Aspartic semialdehyde
    "Met-loss+Acetyl":           "NAc",        # Methionine cleaved + N-acetylation
    "Met-loss+acetaldehyde":     "NAh",        # Methionine cleaved + N-acetaldehyde
    "Methyl":                    "METH",
    "Myristoyl":                 "MYRI",
    "Nitro":                     "OX",         # Oxidation to Nitro group
    "Nitrosyl":                  "NITRO",      # Addition of NO typically to C
    "Oxidation":                 "OX",
    "Palmitoyl":                 "PALM",
    "Phospho":                   "PHOS",
    "PyridoxalPhosphate":        "PLP",        # Active form of vitamin B6
    "quinone":                   None,         # Not enough information
    "Succinyl":                  "SUCC",
    "Sulfo":                     "OX",
    "Sulfide":                   "SULF",
    "Thiazolidine":              None,         # Formaldehyde adducts
    "TMT6plex":                  None,         # Experimental tag
    "Trimethyl":                 "TRIMETH",
    "Trioxidation":              "TRIOX",
    "Trp->Hydroxykynurenin":     "Hydroxykynurenine",
    "Trp->Kynurenin":            "kynurenine",
    "Trp->Oxolactone":           None
}

In [None]:
def _residue_position(peptide, location):
    """Return '<residue><position>' for a modified peptide, or '0' when unmodified."""
    if location == '0':
        return '0'
    return f"{peptide[int(location) - 1]}{location}"


# Build a new frame instead of overwriting columns of `df_clean`: the mapping
# is not idempotent (already-mapped codes are mostly absent from PTM_MAP and
# would become NaN), so re-running an in-place version of this cell would
# silently discard nearly every modified peptide in the dropna() below.
sys_mapped = df_clean.assign(
    ptm=df_clean['ptm'].map(PTM_MAP),
    ptm_location=[
        _residue_position(peptide, location)
        for peptide, location in zip(df_clean['peptide'], df_clean['ptm_location'])
    ],
)
# Removes modifications mapped to None as well as names missing from PTM_MAP.
sys_mapped = sys_mapped.dropna()
sys_final = filter_ptm(sys_mapped).drop_duplicates().reset_index(drop=True)
sys_final

In [None]:
# Persist the preprocessed SysteMHC table (every row carries label 1.0).
sys_final.to_csv(
    'systemhc_positive_preprocessed_v230601.csv',
    index=False,
)

In [None]:
# Stack both sources, remove entries that appear with both labels, then
# de-duplicate.
# NOTE(review): unmodified IEDB rows carry a missing ptm / ptm_location while
# unmodified SysteMHC rows carry "0". drop_conflicting_rows equates the two
# only while grouping, so drop_duplicates() can still keep the same
# peptide/allele twice - confirm whether the markers should be unified first.
combined = pd.concat([iedb_final, sys_final])
total = (
    drop_conflicting_rows(combined, labels_only=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Manual allele curation: rename HLA-C*03:01 to HLA-C*03:04 and drop HLA-A*01:191.
total["hla"] = total["hla"].str.replace(r"\bHLA-C\*03:01\b", "HLA-C*03:04", regex=True)
total = total.loc[total["hla"] != "HLA-A*01:191"]

def has_L_after_star(s):
    """Return True when ``s`` has an upper-case ``L`` after its first ``*``.

    Values without a ``*`` are searched as a whole; missing values give False.
    """
    if pd.isna(s):
        return False
    text = str(s)
    _, star, after_star = text.partition("*")
    searched = after_star if star else text
    return "L" in searched

# Drop alleles whose name has an "L" after the "*".
total = total[~total["hla"].map(has_L_after_star)].copy()

# Acetylation on P1 --> NAc
pattern = r'(?<=[A-Za-z])1(?!\d)'
bool_mask = (
    (total['ptm'] == 'ACET')
    # na=False: rows without a location yield False instead of NaN, keeping
    # the mask strictly boolean.
    & total['ptm_location'].str.contains(pattern, regex=True, na=False)
)
total.loc[bool_mask, 'ptm'] = 'NAc'

# Remove repeated entries from comma-separated locations. dict.fromkeys keeps
# first-occurrence order; a set would emit the entries in an order that
# changes between interpreter runs (string hash randomisation), making
# total_data.csv non-reproducible.
total['ptm_location'] = total['ptm_location'].apply(
    lambda x: ','.join(dict.fromkeys(x.split(','))) if pd.notna(x) else x
)

total.to_csv("total_data.csv", index=False)