###### 本代码为dizco的python复现版本
##### #原始代码用R书写（https://github.com/jmwozniak/DizcoProcessing）

# In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import precision_recall_curve,mean_squared_error,log_loss
import datetime
from os import listdir
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from statannotations.Annotator import Annotator
import warnings
warnings.filterwarnings('ignore')

# In [None]:
# Check whether a peptide's modification string contains a target modification.
def label_match(values,modID,col_mod):
    """Return the row's index label if its modifications mention a target label.

    Args:
        values: One row taken from ``DataFrame.reset_index(drop=False).values``;
            element 0 is the original index label and element ``col_mod + 1``
            holds the '; '-separated modification string (or a missing value).
        modID: Iterable of substrings identifying the target modifications.
        col_mod: Position of the modification column before the index column
            was prepended.

    Returns:
        ``values[0]`` when at least one modification contains one of the
        ``modID`` substrings, otherwise ``np.nan``.
    """
    mods = values[col_mod+1]
    # Anything that is not a string (NaN, None, numbers) carries no modification.
    if not isinstance(mods, str):
        return np.nan
    has_target = any(name in m for m in mods.split('; ') for name in modID)
    return values[0] if has_target else np.nan

# Tell whether a value is present: True for a non-NaN value, False for NaN.
def is_not_nan(value):
    """Return False when ``np.isnan(value)`` holds, True otherwise."""
    if np.isnan(value):
        return False
    return True

# Keep only the peptides whose modifications include one of the modID labels.
def extract_labeled(data,modID,col_mod):
    """Return the rows of ``data`` that carry at least one ``modID`` modification.

    Args:
        data: PSM table.
        modID: Iterable of substrings identifying the target modifications.
        col_mod: Position (0-based) of the modification column in ``data``.

    Returns:
        A new DataFrame with the matching rows, the original columns and a
        fresh 0..n-1 index.
    """
    # label_match expects the index label in position 0, so prepend it.
    indexed = data.reset_index(drop=False)
    # Select by row position instead of by the returned index label, so the
    # result is also correct when ``data`` does not have a default RangeIndex.
    keep = [pos for pos,row in enumerate(indexed.values)
            if not pd.isna(label_match(row,modID,col_mod))]
    return indexed.iloc[keep,1:].reset_index(drop=True)

# Read the Master Protein Accessions field of one row: first ID only, or the whole field.
def id_extrac(values,col_num,uni):
    """Return ``values[col_num]``, reduced to its first '; '-separated ID when ``uni`` is truthy."""
    accession = values[col_num]
    return accession.split('; ')[0] if uni else accession

# Collect the Master Protein Accessions ID of every row.
def extract_proteins(data,col_num,uni=True):
    """Return a list with one ``id_extrac`` result per row of ``data``.

    ``col_num`` is the position of the accession column; ``uni`` is passed
    through to ``id_extrac`` (keep only the first ID when True).
    """
    return [id_extrac(row,col_num,uni) for row in data.values]

# Drop the modifications listed in modName; for the remaining ones keep only
# the modified residue and its position (the text before the first '(').
def removed_mod(mod,modName):
    """Strip unwanted modifications from a '; '-separated modification string.

    Args:
        mod: Modification string such as ``'C3(Carbamidomethyl); K5(-AC-Light)'``,
            or a missing value.
        modName: Iterable of substrings; any modification containing one of
            them is discarded.

    Returns:
        The surviving modifications reduced to their residue/position part and
        joined with ';' (e.g. ``'K5'``), or ``np.nan`` when ``mod`` is not a
        string or nothing survives.
    """
    # Missing cells (NaN/None) have no modifications to report.
    if not isinstance(mod, str):
        return np.nan
    kept = [m.split('(')[0] for m in mod.split('; ')
            if not any(name in m for name in modName)]
    return ';'.join(kept) if kept else np.nan

# Extract either the modified residues or their positions from a list of sites.
def extract_aa_site(mod,aa=False,site=False):
    """Split site tokens such as ``'K5'`` into residue letters or positions.

    Args:
        mod: Iterable of site tokens (e.g. ``['K5', 'C12']``), or a missing
            value (NaN/None).
        aa: When truthy, return the first character of every token joined by ';'.
        site: When truthy (and ``aa`` is falsy), return everything after the
            first character of every token joined by ';'.

    Returns:
        The joined string; ``mod`` itself when it is a missing value or a bare
        float; ``None`` when neither ``aa`` nor ``site`` is requested.
    """
    # Missing values are propagated unchanged instead of relying on np.isnan
    # raising for non-numeric input.
    if mod is None or isinstance(mod, (float, np.floating)):
        return mod
    if aa:
        # m[:1] tolerates an empty token where m[0] would raise IndexError.
        return ';'.join(m[:1] for m in mod)
    if site:
        return ';'.join(m[1:] for m in mod)
    return None

# Count the PSMs (rows) of every protein in input_table and attach them to merge_table.
def cal_prot_psm(input_table,merge_table,col_name):
    """Add a per-protein PSM count column to ``merge_table``.

    Args:
        input_table: PSM table with a 'Master Protein Accessions' column; each
            row counts as one PSM.
        merge_table: Table that receives the counts.
        col_name: Name of the new count column.

    Returns:
        The outer merge of ``merge_table`` with the per-protein counts (joined
        on the columns the two frames share).
    """
    # size() counts rows per group; count() would only count the non-null
    # cells of whichever column happens to come first.
    total_psm = (input_table.groupby('Master Protein Accessions')
                 .size()
                 .reset_index(name=col_name))
    return pd.merge(merge_table, total_psm,how='outer')

# Count the unique peptide sequences of every protein in input_table and
# attach them to merge_table.
def cal_uni_pep(input_table,merge_table,col_name):
    """Add a per-protein unique-peptide count column to ``merge_table``.

    Args:
        input_table: PSM table with 'Master Protein Accessions' and
            'Upper_Seq' columns.
        merge_table: Table that receives the counts.
        col_name: Name of the new count column.

    Returns:
        The outer merge of ``merge_table`` with the per-protein counts (joined
        on the columns the two frames share).
    """
    # Grouping by the column name (not a one-element list) keeps the group
    # keys scalar, so they still match the merge key on recent pandas.
    # dropna=False mirrors len(Series.unique()), which counts NaN as a value.
    uni_pep_num = (input_table.groupby('Master Protein Accessions')['Upper_Seq']
                   .nunique(dropna=False)
                   .reset_index(name=col_name))
    return pd.merge(merge_table, uni_pep_num,how='outer')

# In [None]:
# Process isotope-labelled PSM data and train the labelled-peptide classifier.
# Steps:
# 1. Resolve one UniProt accession per PSM and extract the bare peptide sequence.
# 2. Extract the label information (modified residue and position) that is left
#    after removing the common modifications.
# 3. Build scan/peptide identifiers from First Scan and File ID for PSM counting.
# 4. Split the PSMs into labeled_psms / unlabeled_psms according to modID, and
#    the labelled ones into light and heavy.
# 5. Mark peptides seen with both the light and the heavy label as paired = 1.
# 6. Count PSMs per scan (numPSMs), compute agreePSMs (share of a scan's PSMs
#    that agree on the same annotated sequence) and scoreDiff, and standardise
#    them into numPSMs_scaled / scoreDiff_scaled.
# 7. Average the retention time per peptide length for unlabelled and paired
#    PSMs; RT_Diff_fromUPL is the unlabelled average minus the PSM's own
#    retention time, standardised into RT_Diff_fromUPL_scaled.
# 8. Count PSMs and unique peptides per protein.
# 9. Build the train/test sets: positives are paired == 1 (train) and labelled
#    but unpaired (test); unlabelled PSMs are split 1:1 as negatives.
# 10. Fit an unpenalised logistic regression on numPSMs_scaled,
#     scoreDiff_scaled, agreePSMs and RT_Diff_fromUPL_scaled and score both sets.
def processPSMs_isotope(data,modID,ptmRS=True):
    """Process an isotope PSM table and score every PSM with a logistic model.

    Args:
        data: Raw PSM table; the columns used are selected by position, so the
            column layout of the export must match the hard-coded positions.
        modID: Two label substrings, light first and heavy second.
        ptmRS: Selects which hard-coded column positions are kept (the True
            variant additionally keeps position 32).

    Returns:
        DataFrame holding the training rows followed by the test rows, with
        the added columns 'lg_probs', 'lg_pred', 'model_set' and
        'label_confidence'.
    """
    if ptmRS: use_col = [1,3,4,5,7,8,9,10,11,12,15,16,17,18,19,24,25,26,27,28,32]
    else: use_col = [1,3,4,5]+list(range(7,13))+list(range(15,20))+list(range(24,29))
    # Explicit copy: the assignments below must not depend on view/copy semantics.
    data = data.iloc[:,use_col].copy()

    #1. Resolve one UniProt accession per PSM and extract the peptide sequence
    # fillna replaces the original chained assignment, which is not guaranteed
    # to write back into the frame.
    data['Master Protein Accessions'] = data['Master Protein Accessions'].fillna(data['Protein Accessions'])
    data['Master Protein Accessions'] = extract_proteins(data,col_num=4,uni=True)
    data['Upper_Seq'] = data['Annotated Sequence'].str.split('.',expand=True)[1].str.upper()

    #2. Extract the labelled residue and position
    modName = ['Oxidation','Carbamidomethyl']
    data['label_site'] = [removed_mod(mod,modName) for mod in data['Modifications'].to_list()]
    data['label_AA'] = [extract_aa_site(sites,aa=True) for sites in data['label_site'].str.split(';')]
    data['label_loc'] = [extract_aa_site(sites,site=True) for sites in data['label_site'].str.split(';')]

    #3. Build the scan / peptide identifiers used for PSM counting
    data['scanID'] = data['First Scan'].map(str).str.cat([data['File ID']],sep='_')
    data['scanID_pep'] = data['scanID'].map(str).str.cat([data['Annotated Sequence'].str.upper()],sep='_')
    data['uniqueID'] = data['scanID_pep'].map(str).str.cat([data['Modifications'].replace(np.nan,'-')],sep='_').str.strip('-')
    data['pepLength'] = [len(seq) for seq in data['Upper_Seq'].to_list()]

    #4. Split into labelled / unlabelled PSMs, and labelled ones into light / heavy
    labeled_psms = extract_labeled(data,modID,3)
    labeled_psms['labeled'] = 1
    data = pd.merge(data, labeled_psms,how='outer')
    data['labeled'] = data['labeled'].replace(np.nan,0)
    unlabeled_psms = data[data['labeled']==0].reset_index(drop=True)
    unlabeled_psms['paired'] = 0
    light_mod,heavy_mod = modID[0],modID[1]
    light_psms = extract_labeled(labeled_psms,[light_mod],3)
    heavy_psms = extract_labeled(labeled_psms,[heavy_mod],3)

    #5. Mark peptides observed with both the light and the heavy label
    data.loc[data['scanID'].isin(heavy_psms['scanID']),'label_type'] = 'H'
    data.loc[data['scanID'].isin(light_psms['scanID']),'label_type'] = 'L'
    data['label_type'] = data['label_type'].replace(np.nan,'NA')
    labeled_psms.loc[(labeled_psms['Upper_Seq'].isin(heavy_psms['Upper_Seq']))&(labeled_psms['Upper_Seq'].isin(light_psms['Upper_Seq'])),'paired'] = 1
    labeled_psms['paired'] = labeled_psms['paired'].replace(np.nan,0)
    paired_psms = labeled_psms[labeled_psms['paired']==1].reset_index(drop=True)
    data = pd.merge(data, paired_psms,how='outer')
    data['paired'] = data['paired'].replace(np.nan,0)

    #6. PSM counts per scan, agreement ratio and score difference (+ scaled versions)
    scan_freq = pd.DataFrame([tuple((name,count)) for name,count in Counter(data['scanID']).items()],columns=['scanID','numPSMs'])
    data = pd.merge(data, scan_freq,how='outer')
    data['numPSMs_scaled'] = StandardScaler().fit_transform(data['numPSMs'].values.reshape(len(data),1))
    scan_psm_freq = pd.DataFrame([tuple((name,count)) for name,count in Counter(data['scanID_pep']).items()],columns=['scanID_pep','agreePSMs'])
    data = pd.merge(data, scan_psm_freq,how='outer')
    data['agreePSMs'] = data['agreePSMs']/data['numPSMs']
    # Use Delta Score where present, otherwise fall back to Delta Cn.
    data['scoreDiff'] = data['Delta Score'].fillna(data['Delta Cn'])
    data['scoreDiff_scaled'] = StandardScaler().fit_transform(data['scoreDiff'].values.reshape(len(data),1))

    #7. Average retention time per peptide length and the difference to each PSM
    RT_mat = pd.DataFrame()
    RT_mat['pepLength'] = data['pepLength'].unique()
    # Select the column before averaging: mean() over the whole frame fails on
    # the text columns with recent pandas.
    upl = unlabeled_psms.groupby(by=['pepLength'])['RT in min'].mean().reset_index(drop=False)
    upl.columns = ['pepLength','RTavg_fromUPL']
    RT_mat = pd.merge(RT_mat, upl,how='outer')
    lpl = paired_psms.groupby(by=['pepLength'])['RT in min'].mean().reset_index(drop=False)
    lpl.columns = ['pepLength','RTavg_fromLPL']
    RT_mat = pd.merge(RT_mat, lpl,how='outer')
    data = pd.merge(data, RT_mat,how='outer')
    data['RT_Diff_fromUPL'] = data['RTavg_fromUPL']-data['RT in min']
    data['RT_Diff_fromUPL_scaled'] = StandardScaler().fit_transform(data['RT_Diff_fromUPL'].values.reshape(len(data),1))

    #8. PSM and unique-peptide counts per protein
    data['ProtID_Pep'] = data['Master Protein Accessions'].map(str).str.cat([data['Upper_Seq']],sep='_')
    data = cal_prot_psm(data,data,'Prot_totalPSMs')
    data = cal_prot_psm(labeled_psms,data,'Prot_labeledPSMs')
    data = cal_prot_psm(paired_psms,data,'Prot_pairedPSMs')
    data = cal_uni_pep(data,data,'Prot_uniquePeptides')
    data = cal_uni_pep(labeled_psms,data,'Prot_labeledPeptides')
    data = cal_uni_pep(paired_psms,data,'Prot_pairedPeptides')

    #9. Train / test sets
    # Rows without RT_Diff_fromUPL are dropped because the model cannot handle NaN features.
    data = data[~data['RT_Diff_fromUPL'].isna()].reset_index(drop=True)
    # Alternate unlabelled PSMs between the training and the test negatives.
    trainNegative = unlabeled_psms.iloc[0::2,:]
    testNegative = unlabeled_psms.iloc[1::2,:]
    trainSet = pd.concat([data[data['paired']==1],
                          data[data['uniqueID'].isin(trainNegative['uniqueID'].to_list())]],axis=0).reset_index(drop=True)
    testSet = pd.concat([data[(data['paired']==0)&(data['labeled']==1)],
                          data[data['uniqueID'].isin(testNegative['uniqueID'].to_list())]],axis=0).reset_index(drop=True)

    #10. Fit the logistic regression and score both sets
    use_name = ['numPSMs_scaled','scoreDiff_scaled','agreePSMs','RT_Diff_fromUPL_scaled']
    # C=np.inf requests an unpenalised fit; the string penalty='none' used
    # before is no longer accepted by current scikit-learn releases.
    lg_clf = LogisticRegression(solver='lbfgs',C=np.inf,n_jobs=-1)
    X = trainSet[use_name].values.astype(np.float64)
    y = trainSet['labeled'].values.astype(np.float64)
    lg_clf = lg_clf.fit(X,y)

    y_pre_train = lg_clf.predict_proba(X)[:,1]
    y_pre_test = lg_clf.predict_proba(testSet[use_name].values.astype(np.float64))[:,1]

    train_out = trainSet.copy()
    train_out['lg_probs'] = y_pre_train
    train_out.loc[train_out['lg_probs']>0.5,'lg_pred'] = 'Labeled'
    train_out.loc[train_out['lg_probs']<=0.5,'lg_pred'] = 'Unlabeled'
    train_out['model_set'] = 'train'

    test_out = testSet.copy()
    test_out['lg_probs'] = y_pre_test
    test_out.loc[test_out['lg_probs']>0.5,'lg_pred'] = 'Labeled'
    test_out.loc[test_out['lg_probs']<=0.5,'lg_pred'] = 'Unlabeled'
    test_out['model_set'] = 'test'

    # Confidence bands: >= 0.85 High, [0.5, 0.85) Medium, < 0.5 Low.
    merge_out = pd.concat([train_out,test_out],axis=0)
    merge_out.loc[merge_out['lg_probs']>=0.85,'label_confidence'] = 'High'
    merge_out.loc[(merge_out['lg_probs']>=0.5)&(merge_out['lg_probs']<0.85),'label_confidence'] = 'Medium'
    merge_out.loc[merge_out['lg_probs']<0.5,'label_confidence'] = 'Low'

    return(merge_out.reset_index(drop=True))

# In [None]:
# For a given peptide ID, collect the labelled-site information found in processedPSMs.
def extract_site(uni_id,processedPSMs):
    """Summarise the label sites recorded for one peptide ID.

    Args:
        uni_id: Value to look up in the 'pep_label' column.
        processedPSMs: PSM table with 'pep_label', 'label_site' and
            'label_loc' columns.

    Returns:
        Tuple ``(uni_id, sites, aas, locs, site_psms)``. Each of the last four
        items is a ';'-joined string with one entry per distinct
        'label_site' value, ordered by the numeric value of 'label_loc';
        ``aas`` holds the first character of each site and ``site_psms`` the
        number of rows showing it. All four are ``np.nan`` when the peptide
        has no row with a label site.
    """
    table = processedPSMs[processedPSMs['pep_label']==uni_id]
    # Rows without a label site are ignored, so a peptide with both labelled
    # and unlabelled PSMs no longer fails on a NaN site.
    labeled = table[~table['label_site'].isna()]
    if labeled.empty:
        return tuple((uni_id,np.nan,np.nan,np.nan,np.nan))

    # Take site and location from the same row so the two lists stay aligned.
    pairs = labeled[['label_site','label_loc']].drop_duplicates(subset='label_site')
    siteDF = pd.DataFrame({'sites':pairs['label_site'].to_list(),
                           'aas':[site[:1] for site in pairs['label_site']],
                           'locs':pairs['label_loc'].to_list()})
    site_freq = pd.DataFrame(list(Counter(labeled['label_site']).items()),columns=['sites','site_freq'])
    siteDF = pd.merge(siteDF, site_freq,on='sites',how='outer')
    # Locations that are not a single number (e.g. '7;12') coerce to NaN and sort last.
    siteDF = siteDF.sort_values(by=['locs'],ascending=True,key=lambda x: pd.to_numeric(x, errors='coerce')).reset_index(drop=True)
    siteDF = siteDF.astype(str)
    return tuple([uni_id])+tuple([';'.join(siteDF[col].to_list()) for col in siteDF.columns])

# In [None]:
# Summarise the isotope PSM-level results at the peptide level:
# 1. Give every PSM a unique_scanID (First Scan + Spectrum File) and a
#    pep_label (peptide sequence + probe ID, the latter only for labelled PSMs).
# 2. For every unique_scanID keep the PSM with the highest Prot_totalPSMs,
#    then XCorr, as uniqueScan_DF.
# 3. Count spectra (from uniqueScan_DF) and PSMs (from all PSMs) per peptide.
# 4. Average lg_probs, XCorr, scoreDiff, numPSMs and RT_Diff_fromUPL per
#    peptide and record the maximum lg_probs.
# 5. Summarise the label sites of every peptide and how often each was seen.
def processPeps_isotope(processedPSMs):
    """Aggregate processed isotope PSMs into one row per (peptide label, protein).

    Args:
        processedPSMs: Output of ``processPSMs_isotope``. The columns
            'unique_scanID', 'probeID' and 'pep_label' are added to this
            frame in place. The probe ID is taken from the fourth
            '_'-separated field of 'Spectrum File'.

    Returns:
        DataFrame with 'Unique_ID', 'Uniprot_ID', '#_Spectra', '#_PSMs', the
        averaged feature columns, 'max_lg_probs', the site summary columns
        ('sites', 'aas', 'locs', 'site_psms') and 'PEPTIDE_SEQUENCE'.
    """
    #1. Scan and peptide identifiers
    processedPSMs['unique_scanID'] = processedPSMs['First Scan'].map(str).str.cat([processedPSMs['Spectrum File']],sep='_')
    processedPSMs['probeID'] = processedPSMs['Spectrum File'].str.split('_',expand=True)[3]
    processedPSMs.loc[processedPSMs[processedPSMs['labeled']==0].index,['probeID']] = np.nan
    processedPSMs['pep_label'] = processedPSMs['Upper_Seq'].map(str).str.cat([processedPSMs['probeID'].replace(np.nan,'-')],sep='_').str.strip('-')
    processedPSMs_sorted = processedPSMs.sort_values(by=['unique_scanID','Prot_totalPSMs','XCorr'],ascending=[False,False,False]).reset_index(drop=True)

    #2. Best PSM per scan
    # The frame is already ordered best-first within each scan, so the first
    # row per unique_scanID is the wanted one; this replaces a per-scan re-sort
    # loop. The result is ordered by unique_scanID (ascending) and already
    # carries the probeID / pep_label columns computed above.
    uniqueScan_DF = (processedPSMs_sorted.dropna(subset=['unique_scanID'])
                     .drop_duplicates(subset='unique_scanID',keep='first')
                     .sort_values(by='unique_scanID')
                     .reset_index(drop=True))

    #3. Spectrum and PSM counts per peptide
    pepDF = pd.DataFrame()
    pepDF['Unique_ID'] = uniqueScan_DF['pep_label'].unique()
    pepDF = pd.merge(pepDF, uniqueScan_DF[['Master Protein Accessions','pep_label']],left_on=['Unique_ID'],right_on=['pep_label'],how='inner').drop_duplicates().iloc[:,:-1]
    pepDF.columns = ['Unique_ID','Uniprot_ID']
    spec_freq = pd.DataFrame([tuple((name,count)) for name,count in Counter(uniqueScan_DF['pep_label']).items()],columns=['Unique_ID','#_Spectra'])
    pepDF = pd.merge(pepDF, spec_freq,how='outer')
    psm_freq = pd.DataFrame([tuple((name,count)) for name,count in Counter(processedPSMs['pep_label']).items()],columns=['Unique_ID','#_PSMs'])
    pepDF = pd.merge(pepDF, psm_freq,how='inner')

    #4. Per-peptide feature averages and maximum probability
    featureCols = ['lg_probs','XCorr','scoreDiff','numPSMs','RT_Diff_fromUPL']
    # Select the feature columns before averaging: mean() over the whole frame
    # fails on the text columns with recent pandas.
    averagedFeatures = processedPSMs.groupby(by='pep_label')[featureCols].mean().reset_index(drop=False)
    averagedFeatures.rename(columns={'pep_label':'Unique_ID'},inplace=True)
    pepDF = pd.merge(pepDF, averagedFeatures,on='Unique_ID',how='inner')
    max_lg = processedPSMs[['pep_label','lg_probs']].groupby(by='pep_label').max().reset_index(drop=False)
    max_lg.columns = ['Unique_ID','max_lg_probs']
    pepDF = pd.merge(pepDF, max_lg,on='Unique_ID',how='inner')

    #5. Label-site summary per peptide
    # One lookup per distinct ID: pepDF can list the same ID for several
    # proteins, and duplicated keys on both sides would multiply rows in the merge.
    unique_ids = pepDF['Unique_ID'].drop_duplicates().to_list()
    values = [extract_site(uni_id,processedPSMs) for uni_id in unique_ids]
    values = pd.DataFrame(values,columns=['Unique_ID','sites','aas','locs','site_psms'])
    pepDF = pd.merge(pepDF, values,on='Unique_ID',how='inner')
    pepDF['PEPTIDE_SEQUENCE'] = pepDF['Unique_ID'].str.split('_',expand=True)[0]

    return pepDF

# In [None]:
# Count the TMT modifications carried by one peptide.
def count_tmt(mod):
    """Return, as a string, how many '; '-separated modifications contain 'TMT'.

    A missing (non-string) ``mod`` counts as zero modifications and yields '0'.
    """
    if not isinstance(mod, str):
        return '0'
    return str(sum('TMT' in m for m in mod.split('; ')))

# Process TMT PSM data and score it with a model trained on isotope PSM data.
# Steps:
# 1. Select the columns needed for the later statistics (by position).
# 2. Resolve the protein accession and extract the label information
#    (modified residue and position) and one 0/1 flag column per modID entry.
# 3. Build scan/peptide identifiers from First Scan and File ID for PSM counting.
# 4. Remove peptides carrying a CP266 modification, then split the PSMs into
#    labeled_psms / unlabeled_psms according to modID.
# 5. Count PSMs per scan (numPSMs), compute agreePSMs (share of a scan's PSMs
#    that agree on the same annotated sequence) and scoreDiff, and standardise
#    them into numPSMs_scaled / scoreDiff_scaled.
# 6. Average the retention time per (peptide length, number of TMT tags) for
#    unlabelled and labelled PSMs; RT_Diff_fromUPL is the unlabelled average
#    minus the PSM's own retention time, standardised into RT_Diff_fromUPL_scaled.
# 7. Count PSMs and unique peptides per protein.
# 8. Re-standardise the features of train_set_forPred and fit the model on it.
#    (Per the original notes the training set should hold isotope PSMs with
#    paired == 1 or labeled == 0; that selection is left to the caller.)
# 9. Predict the TMT PSMs with the fitted model.
# 10. Attach prediction, model_set and confidence columns and return the result.
def processPSMs_TMT(data, modID, train_set_forPred, ptmRS=True, plex='10plex', quant_level='MS3'):
    """Process a TMT PSM table and score it with an isotope-trained model.

    Args:
        data: Raw TMT PSM table; the columns used are selected by position.
        modID: List of label substrings; one '1'/'0' flag column is created
            per entry and appended to the peptide label.
        train_set_forPred: Training table with 'RT_Diff_fromUPL', 'scoreDiff',
            'numPSMs', 'agreePSMs' and 'labeled' columns. Its three
            ``*_scaled`` columns are recomputed and written back in place.
        ptmRS: When True, column position 45 is also kept.
        plex: '10plex' or '16plex'; selects the abundance column positions.
        quant_level: When 'MS3', column position 23 is also kept.

    Returns:
        The processed TMT PSMs with the added columns 'lg_probs', 'lg_pred',
        'model_set' (always 'test') and 'label_confidence'. PSMs without an
        RT_Diff_fromUPL value are not included.

    Raises:
        ValueError: If ``plex`` is neither '10plex' nor '16plex'.
    """
    #1. Column selection
    infor_col = [1,3,4,5,7,8,9,10,11,12,15,16,17,18,19,22,24,25,26,27,28,29,40,41]
    if plex == '16plex': abund_col = list(range(30,46))
    elif plex == '10plex': abund_col = list(range(30,40))
    else: raise ValueError("plex must be '10plex' or '16plex', got {!r}".format(plex))
    use_col = infor_col+abund_col
    if quant_level == 'MS3': use_col.append(23)
    if ptmRS: use_col.append(45)
    # For 16plex the abundance range overlaps positions 40, 41 and 45; drop the
    # repeats so no column is selected twice.
    # NOTE(review): confirm which positions hold the non-abundance columns in a 16plex export.
    use_col = list(dict.fromkeys(use_col))
    # Explicit copy: the assignments below must not depend on view/copy semantics.
    data = data.iloc[:,use_col].copy()

    #2. Protein accession and label information
    # fillna replaces the original chained assignment, which is not guaranteed
    # to write back into the frame.
    data['Master Protein Accessions'] = data['Master Protein Accessions'].fillna(data['Protein Accessions'])
    data['Master Protein Accessions'] = extract_proteins(data,col_num=4,uni=True)
    data['Upper_Seq'] = data['Annotated Sequence'].str.split('.',expand=True)[1].str.upper()
    modName = ['TMT6plex','TMTpro','Carbamidomethyl','Oxidation']
    data['label_site'] = [removed_mod(mod,modName) for mod in data['Modifications'].to_list()]
    data['label_AA'] = [extract_aa_site(sites,aa=True) for sites in data['label_site'].str.split(';')]
    data['label_loc'] = [extract_aa_site(sites,site=True) for sites in data['label_site'].str.split(';')]
    # Missing modification cells are treated as "no modification" in the
    # substring tests below.
    mods = data['Modifications'].fillna('')
    for m in modID: data[m] = ['1' if m in mod else '0' for mod in mods]

    #3. Scan / peptide identifiers
    data['pep_label'] = data[['Upper_Seq']+modID].apply(lambda x: '_'.join(x), axis=1)
    data['scanID'] = data['First Scan'].map(str).str.cat([data['File ID']],sep='_')
    data['scanID_pep'] = data['scanID'].map(str).str.cat([data['Annotated Sequence'].str.upper()],sep='_')
    data['uniqueID'] = data['scanID_pep'].map(str).str.cat([data['Modifications'].replace(np.nan,'-')],sep='_').str.strip('-')
    data['pepLength'] = [len(seq) for seq in data['Upper_Seq'].to_list()]
    data['numTMT'] = [count_tmt(mod) for mod in mods.to_list()]
    data['pepLength_numTMT'] = data['pepLength'].map(str).str.cat([data['numTMT']],sep='_')

    #4. Remove CP266 peptides, then split into labelled / unlabelled PSMs
    data = data[~mods.str.contains('CP266',regex=False)].reset_index(drop=True)
    labeled_psms = extract_labeled(data,modID,3)
    labeled_psms['labeled'] = 1
    data = pd.merge(data, labeled_psms,how='outer')
    data['labeled'] = data['labeled'].replace(np.nan,0)
    unlabeled_psms = data[data['labeled']==0].reset_index(drop=True)

    #5. PSM counts per scan, agreement ratio and score difference (+ scaled versions)
    scan_freq = pd.DataFrame([tuple((name,count)) for name,count in Counter(data['scanID']).items()],columns=['scanID','numPSMs'])
    data = pd.merge(data, scan_freq,how='outer')
    data['numPSMs_scaled'] = StandardScaler().fit_transform(data['numPSMs'].values.reshape(len(data),1))
    scan_psm_freq = pd.DataFrame([tuple((name,count)) for name,count in Counter(data['scanID_pep']).items()],columns=['scanID_pep','agreePSMs'])
    data = pd.merge(data, scan_psm_freq,how='outer')
    data['agreePSMs'] = data['agreePSMs']/data['numPSMs']
    # Use Delta Score where present, otherwise fall back to Delta Cn.
    data['scoreDiff'] = data['Delta Score'].fillna(data['Delta Cn'])
    data['scoreDiff_scaled'] = StandardScaler().fit_transform(data['scoreDiff'].values.reshape(len(data),1))

    #6. Average retention time per (length, #TMT) and the difference to each PSM
    RT_mat = pd.DataFrame()
    RT_mat['pepLength_numTMT'] = data['pepLength_numTMT'].unique()
    # Select the column before averaging: mean() over the whole frame fails on
    # the text columns with recent pandas.
    upl = unlabeled_psms.groupby(by=['pepLength_numTMT'])['RT in min'].mean().reset_index(drop=False)
    upl.columns = ['pepLength_numTMT','RTavg_fromUPL']
    RT_mat = pd.merge(RT_mat, upl,how='outer')
    lpl = labeled_psms.groupby(by=['pepLength_numTMT'])['RT in min'].mean().reset_index(drop=False)
    lpl.columns = ['pepLength_numTMT','RTavg_fromLPL']
    RT_mat = pd.merge(RT_mat, lpl,how='outer')
    RT_mat = RT_mat.sort_values(by=['pepLength_numTMT']).reset_index(drop=True)
    data = pd.merge(data, RT_mat,how='outer')
    data['RT_Diff_fromUPL'] = data['RTavg_fromUPL']-data['RT in min']
    data['RT_Diff_fromUPL_scaled'] = StandardScaler().fit_transform(data['RT_Diff_fromUPL'].values.reshape(len(data),1))

    #7. PSM and unique-peptide counts per protein
    data['ProtID_Pep'] = data['Master Protein Accessions'].map(str).str.cat([data['Upper_Seq']],sep='_')
    data = cal_prot_psm(data,data,'Prot_totalPSMs')
    data = cal_prot_psm(labeled_psms,data,'Prot_labeledPSMs')
    data = cal_uni_pep(data,data,'Prot_uniquePeptides')
    data = cal_uni_pep(labeled_psms,data,'Prot_labeledPeptides')
    # As in processPSMs_isotope: rows without RT_Diff_fromUPL are dropped
    # because the model cannot score NaN features.
    data = data[~data['RT_Diff_fromUPL'].isna()].reset_index(drop=True)

    #8. Re-standardise the training features and fit the model
    train_set_forPred['RT_Diff_fromUPL_scaled'] = StandardScaler().fit_transform(train_set_forPred['RT_Diff_fromUPL'].values.reshape(len(train_set_forPred),1))
    train_set_forPred['scoreDiff_scaled'] = StandardScaler().fit_transform(train_set_forPred['scoreDiff'].values.reshape(len(train_set_forPred),1))
    train_set_forPred['numPSMs_scaled'] = StandardScaler().fit_transform(train_set_forPred['numPSMs'].values.reshape(len(train_set_forPred),1))
    use_name = ['numPSMs_scaled','scoreDiff_scaled','agreePSMs','RT_Diff_fromUPL_scaled']
    # C=np.inf requests an unpenalised fit; the string penalty='none' used
    # before is no longer accepted by current scikit-learn releases.
    lg_clf = LogisticRegression(solver='lbfgs',C=np.inf,n_jobs=-1)
    X = train_set_forPred[use_name].values.astype(np.float64)
    y = train_set_forPred['labeled'].values.astype(np.float64)
    lg_clf = lg_clf.fit(X,y)

    #9. Predict the TMT PSMs
    y_pre_test = lg_clf.predict_proba(data[use_name].values.astype(np.float64))[:,1]

    #10. Attach predictions and confidence bands
    test_out = data.copy()
    test_out['lg_probs'] = y_pre_test
    test_out.loc[test_out['lg_probs']>0.5,'lg_pred'] = 'Labeled'
    test_out.loc[test_out['lg_probs']<=0.5,'lg_pred'] = 'Unlabeled'
    test_out['model_set'] = 'test'

    # Confidence bands: >= 0.85 High, [0.5, 0.85) Medium, < 0.5 Low.
    test_out.loc[test_out['lg_probs']>=0.85,'label_confidence'] = 'High'
    test_out.loc[(test_out['lg_probs']>=0.5)&(test_out['lg_probs']<0.85),'label_confidence'] = 'Medium'
    test_out.loc[test_out['lg_probs']<0.5,'label_confidence'] = 'Low'

    return test_out

# In [None]:
# Summarise the TMT PSM-level results at the peptide level:
# 1. For every scanID keep the PSM with the highest Prot_totalPSMs, then
#    XCorr, as uniqueScan_DF (also written to an Excel file).
# 2. Count spectra (from uniqueScan_DF) and PSMs (from all PSMs) per peptide.
# 3. Sum the reporter abundances of every peptide over its unique scans
#    (10-plex or 16-plex channel set).
# 4. Average lg_probs, XCorr, scoreDiff, numPSMs and RT_Diff_fromUPL per
#    peptide and record the maximum lg_probs.
# 5. Summarise the label sites of every peptide and how often each was seen.

def processPeps_TMT(processedPSMs, modID, plex='10plex'):
    """Aggregate processed TMT PSMs into one row per (peptide label, protein).

    Args:
        processedPSMs: Output of ``processPSMs_TMT``.
        modID: List of label names; used to name the flag columns split out
            of 'Unique_ID'.
        plex: '10plex' or '16plex'; selects the abundance columns.

    Returns:
        DataFrame with 'Unique_ID', 'Uniprot_ID', '#_Spectra', '#_PSMs', the
        summed abundance columns, the averaged feature columns,
        'max_lg_probs', the site summary columns, 'PEPTIDE_SEQUENCE' and one
        column per ``modID`` entry.

    Raises:
        ValueError: If ``plex`` is neither '10plex' nor '16plex'.

    Side effects:
        Writes 'uniqueScanDF_TMTmerged_<date>.xlsx' under the module-level
        ``psm_path``.
    """
    # Resolve the abundance columns first so an invalid plex fails before any file is written.
    if plex=='10plex':
        use_col = ["Abundance 126", "Abundance 127N", "Abundance 127C", "Abundance 128N", "Abundance 128C",
                   "Abundance 129N", "Abundance 129C", "Abundance 130N", "Abundance 130C", "Abundance 131", "pep_label"]
    elif plex=='16plex':
        use_col = ["Abundance 126", "Abundance 127N", "Abundance 127C", "Abundance 128N", "Abundance 128C",
                   "Abundance 129N", "Abundance 129C", "Abundance 130N", "Abundance 130C", "Abundance 131N",
                   "Abundance 131C", "Abundance 132N", "Abundance 132C", "Abundance 133N", "Abundance 133C",
                   "Abundance 134N", "pep_label"]
    else:
        raise ValueError("plex must be '10plex' or '16plex', got {!r}".format(plex))

    #1. Best PSM per scan
    processedPSMs_sorted = processedPSMs.sort_values(by=['scanID','Prot_totalPSMs','XCorr'],ascending=[False,False,False]).reset_index(drop=True)
    # The frame is already ordered best-first within each scan, so the first
    # row per scanID is the wanted one; this replaces a per-scan re-sort loop.
    # The result is ordered by scanID (ascending).
    uniqueScan_DF = (processedPSMs_sorted.dropna(subset=['scanID'])
                     .drop_duplicates(subset='scanID',keep='first')
                     .sort_values(by='scanID')
                     .reset_index(drop=True))
    uniqueScan_DF.to_excel(psm_path+'uniqueScanDF_TMTmerged_%s.xlsx'%datetime.datetime.now().strftime('%Y-%m-%d'),index=False)

    #2. Spectrum and PSM counts per peptide
    pepDF = pd.DataFrame()
    pepDF['Unique_ID'] = uniqueScan_DF['pep_label'].unique()
    pepDF = pd.merge(pepDF, uniqueScan_DF[['Master Protein Accessions','pep_label']],left_on=['Unique_ID'],right_on=['pep_label'],how='inner').drop_duplicates().iloc[:,:-1]
    pepDF.columns = ['Unique_ID','Uniprot_ID']
    spec_freq = pd.DataFrame([tuple((name,count)) for name,count in Counter(uniqueScan_DF['pep_label']).items()],columns=['Unique_ID','#_Spectra'])
    pepDF = pd.merge(pepDF, spec_freq,how='outer')
    psm_freq = pd.DataFrame([tuple((name,count)) for name,count in Counter(processedPSMs['pep_label']).items()],columns=['Unique_ID','#_PSMs'])
    pepDF = pd.merge(pepDF, psm_freq,how='inner')

    #3. Summed reporter abundances per peptide
    tmtData = uniqueScan_DF.loc[:,use_col].copy()
    # Missing abundances are replaced by 1 before summing.
    tmtData = tmtData.replace(np.nan,1)
    summedTMT = tmtData.groupby(by='pep_label').sum().reset_index(drop=False)
    # rename(..., inplace=True) returns None, so assigning its result discarded
    # the table; use the returned copy instead.
    summedTMT = summedTMT.rename(columns={'pep_label':'Unique_ID'})
    pepDF = pd.merge(pepDF, summedTMT,how='outer',on=['Unique_ID'])

    #4. Per-peptide feature averages and maximum probability
    feats_toAvg = ['lg_probs','XCorr','scoreDiff','numPSMs','RT_Diff_fromUPL']
    # Select the feature columns before averaging: mean() over the whole frame
    # fails on the text columns with recent pandas.
    averagedFeatures = processedPSMs.groupby(by='pep_label')[feats_toAvg].mean().reset_index(drop=False)
    averagedFeatures.rename(columns={'pep_label':'Unique_ID'},inplace=True)
    pepDF = pd.merge(pepDF, averagedFeatures,on='Unique_ID',how='inner')
    max_lg = processedPSMs[['pep_label','lg_probs']].groupby(by='pep_label').max().reset_index(drop=False)
    max_lg.columns = ['Unique_ID','max_lg_probs']
    pepDF = pd.merge(pepDF, max_lg,on='Unique_ID',how='inner')

    #5. Label-site summary per peptide
    # One lookup per distinct ID: pepDF can list the same ID for several
    # proteins, and duplicated keys on both sides would multiply rows in the merge.
    unique_ids = pepDF['Unique_ID'].drop_duplicates().to_list()
    values = [extract_site(uni_id,processedPSMs) for uni_id in unique_ids]
    values = pd.DataFrame(values,columns=['Unique_ID','sites','aas','locs','site_psms'])
    pepDF = pd.merge(pepDF, values,on='Unique_ID',how='inner')
    # Unique_ID is '<sequence>_<flag>_<flag>...' with one flag per modID entry.
    temp = pepDF['Unique_ID'].str.split('_',expand=True)
    temp.columns = ['PEPTIDE_SEQUENCE']+modID
    pepDF = pd.concat([pepDF,temp],axis=1)

    return pepDF

# In [None]:
# Combine the UniProt / AlphaFold annotation table with the processed peptides.
def matchUniFeatures(processedPeps, featureTab):
    """Annotate peptides with protein features and their position in the sequence.

    Args:
        processedPeps: Peptide table with 'Uniprot_ID' and 'PEPTIDE_SEQUENCE'.
        featureTab: Annotation table with 'Entry', 'Gene Names', 'Reviewed'
            and the feature columns listed in ``reviewed_feature_cols``.
            A 'Gene' column is added to this frame in place.

    Returns:
        ``processedPeps`` left-joined with the gene name and the reviewed
        feature columns, plus 'Labeled Peptide' (``'<start>_<end>'``) and
        'TARG_PEPRANGE' (``'<Uniprot_ID>_<start>_<end>'``). ``start`` is the
        1-based position of the peptide in 'Sequence' and ``end`` is
        ``start + len(peptide)``. Both columns are empty strings when the
        sequence is missing or does not contain the peptide. All NaN values
        in the result are replaced by ''.
    """
    # First gene name, falling back to the accession when no gene name exists.
    featureTab['Gene'] = featureTab['Gene Names'].str.split(' ',expand=True)[0]
    featureTab['Gene'] = featureTab['Gene'].fillna(featureTab['Entry'])

    featureTab_reviewed = featureTab[featureTab['Reviewed']=='reviewed'].reset_index(drop=True)

    processedPeps = pd.merge(processedPeps, featureTab[['Entry','Gene']],
                             left_on='Uniprot_ID',right_on='Entry',how='left')
    reviewed_feature_cols = ["Protein names", "Gene Names", "Active site", "Binding site", "Cofactor", "DNA binding", "Site", "PDB", "ChEMBL", "DrugBank", "Sequence", "Gene", "AlphaFoldDB"]
    processedPeps = pd.merge(processedPeps, featureTab_reviewed[reviewed_feature_cols],on='Gene',how='left').reset_index(drop=True)

    labeled_peptide,targ_peprange = [],[]
    for uniprot_id,pep_seq,seq in zip(processedPeps['Uniprot_ID'],processedPeps['PEPTIDE_SEQUENCE'],processedPeps['Sequence']):
        found = seq.find(pep_seq) if isinstance(seq, str) and isinstance(pep_seq, str) else -1
        if found < 0:
            # No sequence, or peptide not present: leave the range empty
            # rather than reusing the previous row's start or reporting 0.
            labeled_peptide.append(np.nan)
            targ_peprange.append(np.nan)
            continue
        start = found+1
        end = start+len(pep_seq)
        peprange = '_'.join([str(start),str(end)])
        labeled_peptide.append(peprange)
        targ_peprange.append('_'.join([uniprot_id,peprange]))
    processedPeps['Labeled Peptide'] = labeled_peptide
    processedPeps['TARG_PEPRANGE'] = targ_peprange
    processedPeps = processedPeps.replace(np.nan,'')

    return processedPeps

# In [None]:
def roc_plot(processedPSMs,probe,model_set):
    """Draw the ROC curve of one model set ('train' or 'test') on the current axes.

    The curve is computed from the 'labeled' (truth) and 'lg_probs' (score)
    columns of the rows whose 'model_set' equals ``model_set``; the legend
    entry carries the probe name, the set name and the AUC rounded to 4 digits.
    """
    subset = processedPSMs[processedPSMs['model_set']==model_set].reset_index(drop=True)
    y_true = subset['labeled'].values
    y_score = subset['lg_probs'].values
    fpr,tpr,_ = roc_curve(np.float64(y_true),y_score)
    auc_score = roc_auc_score(y_true,y_score)
    curve_label = probe+'_{:}_{:}'.format(model_set,round(auc_score,4))

    plt.plot(fpr,tpr,linewidth=2,label=curve_label)
    # Chance-level diagonal for reference.
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel('False Positive Rate', fontsize=14,labelpad=10)
    plt.ylabel('True Positive Rate', fontsize=14,labelpad=10)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(fontsize=12,loc=4)
    plt.title(probe,fontsize=12)
    
def pr_plot(processedPSMs,probe):
    """Plot precision and recall against the decision threshold for the test set.

    Uses the 'labeled' (truth) and 'lg_probs' (score) columns of the rows
    whose 'model_set' is 'test' and draws on the current axes.
    """
    test_rows = processedPSMs[processedPSMs['model_set']=='test'].reset_index(drop=True)
    y_true = test_rows['labeled'].values
    y_score = test_rows['lg_probs'].values
    precision,recall,threshold = precision_recall_curve(np.float64(y_true),y_score)

    # precision/recall hold one more entry than threshold; drop the last one to align.
    for values,name in ((precision,'Precision'),(recall,'Recall')):
        plt.plot(threshold,values[:-1],linewidth=2,label=name)
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel('Threshold', fontsize=14,labelpad=5)
    plt.ylabel('Score', fontsize=14,labelpad=5)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(fontsize=12,loc=4)
    plt.title('{:}'.format(probe),fontsize=12)

def metric_stat(table_dict,thre):
    """Compute MSE and log loss per probe and model set.

    Args:
        table_dict: Mapping probe name -> processed PSM table with
            'model_set', 'labeled' and 'lg_probs' columns.
        thre: Probability threshold above which a PSM is called labelled.

    Returns:
        DataFrame with the columns 'probe', 'model_set', 'mse' and 'loss'.
        'mse' compares 'lg_probs' with the true 'labeled' values; 'loss' is
        the log loss of the probabilities against the calls made at ``thre``.
    """
    metric_data = []
    for probe,processedPSMs in table_dict.items():
        for model_set,table in processedPSMs.groupby(by='model_set'):
            y_data,y_pre = table['labeled'].values,table['lg_probs'].values
            # Column 0 = P(unlabelled), column 1 = P(labelled).
            y_pre_array = np.c_[1-y_pre,y_pre]
            mse = mean_squared_error(y_data, y_pre)
            # Encode the calls as 0/1 and pass labels=[0, 1] so the probability
            # columns line up with the classes: with the strings
            # 'Labeled'/'Unlabeled' the classes sort the other way round and
            # the columns were matched to the wrong class. The explicit labels
            # also keep log_loss working when only one class is called.
            # NOTE(review): this scores the probabilities against the model's
            # own thresholded calls, not against 'labeled' - confirm that is intended.
            lg_pred = (y_pre>thre).astype(int)
            loss = log_loss(lg_pred,y_pre_array,labels=[0,1])
            metric_data.append(tuple((probe,model_set,mse,loss)))
    metric_data = pd.DataFrame(metric_data,columns=['probe','model_set','mse','loss'])
    return metric_data

def metrics_plot(table_dict,thre):
    """Scatter-plot log loss against MSE for every probe, coloured by model set.

    Args:
        table_dict: Mapping probe name -> processed PSM table, passed to
            ``metric_stat``.
        thre: Probability threshold passed to ``metric_stat`` and shown in the title.
    """
    metric_data = metric_stat(table_dict,thre)

    plt.figure(figsize=(4,4),dpi=100)
    plt.rcParams['font.sans-serif'] = 'Arial'
    sns.scatterplot(data=metric_data,x='mse',y='loss',hue='model_set',s=60)
    plt.xlabel('MSE',fontsize=14,labelpad=5)
    # The y axis shows the log loss (the label previously read 'Loose').
    plt.ylabel('Loss',fontsize=14,labelpad=5)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(fontsize=12)
    plt.title('Threshold = {:}'.format(thre),fontsize=16)
    plt.show()

# In [None]:
# Root directory for all input and output files; the sub-folders used below
# are appended to it by plain string concatenation, hence the trailing '/'.
psm_path = 'D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/Test files/'
# Substrings identifying the probe modifications: light label first, heavy
# label second (processPSMs_isotope reads them in that order).
modID = ['-AC-Light','-AC-Heavy']

# In [None]:
# Process every raw isotope-labelled PSM table and export the processed
# PSM-level and peptide-level results, one workbook of each per input file.
# Output names carry the third '_'-separated token of the input file name
# and the current date (YYYYMMDD).
for file_name in tqdm(listdir(psm_path+'isotope_data')):
    psm_table_iso = pd.read_excel(psm_path+'isotope_data/'+file_name)

    # PSM level
    processedPSMs = processPSMs_isotope(psm_table_iso,modID,ptmRS=True)
    psm_out = 'processedPSMs_{:}_{:}.xlsx'.format(
        file_name.split('_')[2],
        datetime.datetime.now().strftime('%Y%m%d'))
    processedPSMs.to_excel(psm_path+'isotope_processedPSMs/'+psm_out,index=False)

    # Peptide level, derived from the processed PSM table
    processedPEPs = processPeps_isotope(processedPSMs)
    pep_out = 'processedPEPs_{:}_{:}.xlsx'.format(
        file_name.split('_')[2],
        datetime.datetime.now().strftime('%Y%m%d'))
    processedPEPs.to_excel(psm_path+'isotope_processedPEPs/'+pep_out,index=False)

In [None]:
# Load every processed isotope PSM table, keep one table per probe for the
# per-probe diagnostics below, and stack all tables (tagged with their probe)
# into a single merged table that is written to CSV.
table_dict = {}
merge_data = pd.DataFrame()
for file_name in tqdm(listdir(psm_path+'isotope_processedPSMs')):
    processedPSMs = pd.read_excel(psm_path+'isotope_processedPSMs/'+file_name)
    # The probe name is the second '_'-separated token of the file name.
    probe = file_name.split('_')[1]
    # Only the first table seen for a probe is stored in table_dict.
    if probe not in table_dict:
        table_dict[probe] = processedPSMs
    processedPSMs_ = processedPSMs.assign(probe=probe)
    if merge_data.empty:
        merge_data = processedPSMs_.copy()
    else:
        merge_data = pd.concat([merge_data,processedPSMs_],axis=0)
merge_data.to_csv(psm_path+'merge_isoTOP_PSM_data.csv',index=False)

In [None]:
# ROC curves: one panel per probe on a 4x4 grid, train and test overlaid.
# plt.figure is used instead of plt.subplots so that no full-size Axes is
# created underneath the grid (recent Matplotlib no longer removes an Axes
# that a later plt.subplot call overlaps).
fig = plt.figure(figsize=(12,12),dpi=600)
plt.rcParams['font.sans-serif'] = 'Arial'
for i,(probe,processedPSMs) in enumerate(table_dict.items(),start=1):
    plt.subplot(4,4,i)
    roc_plot(processedPSMs,probe,'train')
    roc_plot(processedPSMs,probe,'test')
plt.tight_layout()
plt.show()

# Precision-recall curves: same layout, one panel per probe.
fig = plt.figure(figsize=(12,12),dpi=600)
plt.rcParams['font.sans-serif'] = 'Arial'
for i,(probe,processedPSMs) in enumerate(table_dict.items(),start=1):
    plt.subplot(4,4,i)
    pr_plot(processedPSMs,probe)
plt.tight_layout()
plt.show()

# Model metrics (MSE vs. log loss) at a 0.5 probability threshold.
metrics_plot(table_dict,thre=0.5)

In [None]:
# Box plots of the scaled score difference and the scaled PSM count from the
# merged table, split by the 'labeled' flag, with t-test annotations.
score_1 = (merge_data[['scoreDiff_scaled','labeled']]
           .rename(columns={'scoreDiff_scaled':'Score'})
           .assign(Name='ScoreDiff'))
score_2 = (merge_data[['numPSMs_scaled','labeled']]
           .rename(columns={'numPSMs_scaled':'Score'})
           .assign(Name='numPSMs'))
score_table = pd.concat([score_1,score_2],axis=0)

fig,ax = plt.subplots(figsize=(4,4),dpi=600)
plt.rcParams['font.sans-serif'] = 'Arial'
ax = sns.boxplot(
    data=score_table,
    x='Name',
    y='Score',
    hue='labeled',
    width=0.5,
    showfliers=False,
)

# Within each score type, compare labeled (1) against unlabeled (0).
box_pairs = [((name,1),(name,0)) for name in ('ScoreDiff','numPSMs')]
annot = Annotator(ax, data=score_table,x='Name',y='Score',hue='labeled',pairs=box_pairs)
annot.configure(test='t-test_ind', text_format='star',line_height=0.03,line_width=1)
annot.apply_and_annotate()

plt.xlabel('',fontsize=14,labelpad=5)
plt.ylabel('Score',fontsize=14,labelpad=5)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# NOTE(review): the legend text assumes the handles come back in the order
# labeled=0, labeled=1 - confirm against the rendered figure.
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles,['Unlabeled','Labeled'],fontsize=12,bbox_to_anchor=(0.50,0.5))
plt.show()

In [None]:
# Per probe, count how often each amino acid is modified and average the PSM
# count per amino acid. Only peptides with lg_probs > 0.5 and a non-missing
# 'aas' entry are used.
processedPEP = pd.read_csv(psm_path+'merge_isoTOP_PEP_data.csv')
processedPEP_ = processedPEP[processedPEP['lg_probs']>0.5]
processedPEP_ = processedPEP_[~processedPEP_['aas'].isna()].reset_index(drop=True)

freq_data,avg_psm_data = pd.DataFrame(),pd.DataFrame()
for probe,table in processedPEP_.groupby(by='probe'):
    # Work on a copy so the helper columns are not written into a slice of
    # the grouped frame.
    table = table.copy()
    # 'aas' and 'site_psms' are ';'-separated lists; explode each to one row
    # per entry. The two exploded frames are joined side by side, which
    # presumes both lists have the same length on every row - TODO confirm.
    table['aas_list'] = table['aas'].str.split(';')
    exploded_df = table.explode('aas_list')
    table['psm_count'] = table['site_psms'].str.split(';')
    exploded_df = pd.concat([exploded_df,table.explode('psm_count')['psm_count']],axis=1)
    exploded_df['psm_count'] = exploded_df['psm_count'].astype(np.float64)

    freq = exploded_df['aas_list'].value_counts().reset_index(drop=False)
    freq.columns = ['amino acid',probe]
    # Inner merge: only amino acids seen for every probe are kept.
    if freq_data.empty: freq_data = freq.copy()
    else: freq_data = pd.merge(freq_data,freq,on=['amino acid'])

    # Average only the PSM count; a mean over every column fails on the
    # non-numeric columns in recent pandas.
    avg_psm = exploded_df.groupby(by='aas_list')['psm_count'].mean().reset_index(drop=False)
    avg_psm.columns = ['amino acid',probe]
    if avg_psm_data.empty: avg_psm_data = avg_psm.copy()
    else: avg_psm_data = pd.merge(avg_psm_data,avg_psm,on=['amino acid'])


# Z-score the counts within each probe column, whatever the number of probes.
probe_cols = freq_data.columns[1:]
freq_data[probe_cols] = StandardScaler().fit_transform(freq_data[probe_cols].values)

In [None]:
def heatmap_plot(table,cbar_label='Average PSM for per a.a',vmin=2,vmax=6):
    """Draw a probes-by-amino-acids heatmap.

    Parameters
    ----------
    table : pandas.DataFrame
        Transposed summary table: the first row supplies the x tick labels,
        the remaining rows are plotted with their index as the y tick labels.
    cbar_label : str
        Text shown next to the colour bar.
    vmin, vmax : float or None
        Colour scale limits handed to seaborn; None lets seaborn derive the
        limit from the data.
    """
    plt.figure(figsize=(15,6),dpi=600)
    plt.rcParams['font.sans-serif'] = 'Arial'
    ax=sns.heatmap(table.iloc[1:,:].values.astype(np.float64),square=True,
                cbar_kws={"shrink": 0.8,"label": cbar_label},cmap='vlag',
                yticklabels=table.index[1:],xticklabels=table.iloc[0,:],
                vmin=vmin,vmax=vmax)
    ax.tick_params(axis='x', labelsize=16)
    ax.tick_params(axis='y', labelsize=16,rotation=0)
    cbar = ax.collections[0].colorbar
    cbar.ax.tick_params(labelsize=16)
    font = {'family':'Arial','size':16,}
    # Re-apply the label so it picks up the font settings.
    cbar.set_label(cbar_label,fontdict=font)
    plt.xlabel('Amino acid',fontsize=18,labelpad=5)
    plt.ylabel('Probes',fontsize=18,labelpad=5)
    plt.show()

# freq_data holds z-scores, so it gets its own label and a data-driven colour
# range instead of the 2..6 range used for the average PSM counts.
heatmap_plot(freq_data.T,cbar_label='Z-score',vmin=None,vmax=None)
heatmap_plot(avg_psm_data.T)