用于将表格中野生型和突变型肽与HLA型组合为fasta文件，便于netMHCpan输入

In [6]:
import pandas as pd
import os
import re

# 读取文件

In [2]:
# Path to the raw Excel workbook that is loaded below.
table_s5 = '/work/longyh/BY/raw/1-s2.0-S0092867417311224-mmc5.xlsx'

In [3]:
# Load the workbook into a DataFrame. header=3 makes the row at 0-based
# index 3 the column-label row; the rows above it are not used as data.
s5=pd.read_excel(table_s5,header=3)

In [4]:
# Display the column labels of the loaded table.
s5.columns

Index(['Patient', 'Hugo Symbol', 'WT Peptide', 'MT Peptide', 'WT Allele',
       'MT Allele', 'WT Score', 'MT Score'],
      dtype='object')

In [10]:
# Directory that receives the generated FASTA files and the saved table.
output_dir = '/work/longyh/BY/processed'

# 修改HLA表达式

In [5]:
# Check whether the 'WT Allele' and 'MT Allele' columns hold the same value
# in every row, then tally the True/False outcomes.
s5['Allele_equal'] = s5['WT Allele'].eq(s5['MT Allele'])
s5['Allele_equal'].value_counts()

Allele_equal
True    41477
Name: count, dtype: int64

In [7]:
def normalize_hla(x):
    """Rewrite an HLA allele label into the form ``HLA-<GENE><xx>:<yy>``.

    Parameters
    ----------
    x : object
        Allele label such as ``'A0201'``, ``'HLA-A02:01'`` or ``'HLA-A*02:01'``.
        Missing values (as judged by ``pd.isna``) are returned unchanged.

    Returns
    -------
    object
        The normalized label, e.g. ``'HLA-A02:01'``. When neither pattern
        matches, the stripped input is returned upper-cased so that it can be
        inspected manually.
    """
    if pd.isna(x):
        return x

    label = str(x).strip()

    # First attempt: the label is already close to the target layout
    # (optional HLA prefix, gene, two 2-digit fields with optional separators).
    hit = re.match(r'(?i)^(?:hla[-\s:]*)?([A-Za-z0-9]+)[:\-]?(\d{2})[:]?(\d{2})$', label)

    if hit is None:
        # Second attempt: drop the HLA prefix, then every separator character,
        # and match the compact form (e.g. A0201 -> A02:01).
        compact = re.sub(r'(?i)^(hla[-\s:]*)', '', label)
        compact = re.sub(r'[\*\-:\s]', '', compact)
        hit = re.match(r'(?i)^([A-Za-z0-9]+?)(\d{2})(\d{2})$', compact)

    if hit is None:
        # Fallback: upper-cased original text, left for manual review.
        return label.upper()

    gene, first_field, second_field = hit.groups()
    return f'HLA-{gene.upper()}{first_field}:{second_field}'




In [None]:
# Normalize the allele columns into new '<column>_norm' columns.
# Both the wild-type and the mutant column must be listed here: the FASTA
# cells further down read 'WT Allele_norm' as well as 'MT Allele_norm'.
# (The list previously named 'MT Allele' twice, so 'WT Allele_norm' was
# never created when the notebook ran on a fresh kernel.)
for col in ['WT Allele', 'MT Allele']:
    if col in s5.columns:
        s5[col + '_norm'] = s5[col].apply(normalize_hla)

In [9]:
# Preview the first rows of every column whose name contains 'Allele'.
s5.loc[:, [name for name in s5.columns if 'Allele' in name]].head()

Unnamed: 0,WT Allele,MT Allele,Allele_equal,WT Allele_norm,MT Allele_norm
0,C0501,C0501,True,HLA-C05:01,HLA-C05:01
1,C0501,C0501,True,HLA-C05:01,HLA-C05:01
2,C0501,C0501,True,HLA-C05:01,HLA-C05:01
3,A0201,A0201,True,HLA-A02:01,HLA-A02:01
4,A0201,A0201,True,HLA-A02:01,HLA-A02:01


# 生成野生型 fasta

In [11]:
# Write one FASTA record per row: the header is the normalized WT allele and
# the sequence is the WT peptide.
# The columns are indexed directly, before the file is opened, so a missing
# 'WT Allele_norm' column raises KeyError instead of silently writing an empty
# FASTA (row.get() returned None for a missing column and every row was skipped).
wt_alleles = s5["WT Allele_norm"]
wt_peptides = s5["WT Peptide"]
with open(os.path.join(output_dir, "wt_allele_peptide.fasta"), "w") as f:
    # zip over the two columns avoids building a Series per row (iterrows).
    for allele, peptide in zip(wt_alleles, wt_peptides):
        # Skip rows where either the allele or the peptide is missing.
        if pd.notna(allele) and pd.notna(peptide):
            f.write(f">{allele}\n{peptide}\n")

netMHCpan -f wt_allele_peptide.fasta -BA -xls -xlsfile wt_results.xls -l 9

# 生成突变型 fasta

In [12]:
# Write one FASTA record per row: the header is the normalized MT allele and
# the sequence is the MT peptide.
# The columns are indexed directly, before the file is opened, so a missing
# 'MT Allele_norm' column raises KeyError instead of silently writing an empty
# FASTA (row.get() returned None for a missing column and every row was skipped).
mt_alleles = s5["MT Allele_norm"]
mt_peptides = s5["MT Peptide"]
with open(os.path.join(output_dir, "mt_allele_peptide.fasta"), "w") as f:
    # zip over the two columns avoids building a Series per row (iterrows).
    for allele, peptide in zip(mt_alleles, mt_peptides):
        # Skip rows where either the allele or the peptide is missing.
        if pd.notna(allele) and pd.notna(peptide):
            f.write(f">{allele}\n{peptide}\n")

netMHCpan -f mt_allele_peptide.fasta -BA -xls -xlsfile mt_results.xls -l 9

# 保存表格

In [13]:
# Save the table, including the columns added above, without the index column.
normalized_table_path = os.path.join(output_dir, "table_s5_normalized.xlsx")
s5.to_excel(normalized_table_path, index=False)