##### 直接从质谱数据中读取给定蛋白的肽段匹配信息

In [None]:
import sys
sys.path.append('E:/Proteomics/PhD_script/1. Dizco/1.1 RawFileProcessing')
from os import listdir
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from ms_readout import MSPiepline,ParseMS2
from statannotations.Annotator import Annotator
from adjustText import adjust_text
import os
import warnings
warnings.filterwarnings('ignore')

#### 1. 从质谱数据中获取修饰与不修饰的肽段信息

In [None]:
# For every sample of every group, look up how the target protein's peptides
# (with and without the given modification) match the MS data.
def generate_result(ms_path,mod):
    """Collect peptide matches for each sample directory below ``ms_path``.

    ``ms_path`` is joined to directory names by plain string concatenation,
    so it must end with a path separator. ``mod`` maps each sample directory
    name to the value handed to ``MSPiepline.generate_mod`` as its second
    argument. The protein sequence is read from the module-level
    ``sirt3_seq``.

    Returns a nested dict ``{group: {sample: {'non_mod': ..., 'mod': ...}}}``
    holding the two objects returned by ``generate_mod``.
    """
    total_result = {}
    for group in listdir(ms_path):
        per_sample = {}
        for sample in listdir(ms_path+group):
            pipeline = MSPiepline(f'{ms_path}{group}/{sample}/',f'MS1_output_{sample}_{group}')
            unmodified,modified = pipeline.generate_mod(sirt3_seq, mod[sample])
            per_sample[sample] = {'non_mod':unmodified,'mod':modified}
        total_result[group] = per_sample
    return total_result

In [None]:
# Load the reviewed human UniProt table (tab-separated export).
uniprot = pd.read_csv('D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/Test files/uniprotkb_human_AND_reviewed_true_AND_m_2024_09_12.tsv',sep='\t')
# Keep only the first space-separated token of 'Gene Names' so it can be compared to a single gene symbol.
uniprot['Gene Names'] = uniprot['Gene Names'].str.split(' ',expand=True)[0]
# First row whose gene is SIRT3, last column — presumably the protein sequence column; verify against the TSV.
sirt3_seq = uniprot[uniprot['Gene Names']=='SIRT3'].iloc[0,-1]

# Per-sample value passed to generate_mod, keyed by sample directory name
# (presumably the probe modification mass in Da — TODO confirm).
mod = {'sta429':640.233,'ncc':294.206}
# Root folder scanned by generate_result; must end with '/' because paths are built by concatenation.
ms_path = 'F:/Dizco/MS_files/'

In [None]:
# Run the matching for every group/sample under ms_path.
# NOTE(review): this uses the two-argument generate_result defined in section 1; a later cell
# redefines generate_result with three parameters, so this cell must be run before that one.
total_result = generate_result(ms_path,mod)

In [None]:
# Print, for the 'FREE' group only, how many unmodified matches each
# (file, mass, peptide) combination has, restricted to peptides whose first
# row has start_site >= 118.
# The innermost dict key is bound to ``mod_type`` rather than ``mod`` so this
# loop no longer overwrites the module-level ``mod`` dictionary that
# generate_result() needs when the notebook is re-run.
for sample,probe in total_result.items():
    if sample != 'FREE':continue
    for probe_name,modify in probe.items():
        for mod_type,table in modify.items():
            if mod_type != 'non_mod':continue
            print(f'{sample}-{probe_name}-{mod_type}:\n')
            for (fileName,mw,pep),table_ in table.groupby(by=['fileName','M.W','match_pep']):
                if table_['start_site'].values[0] >= 118:
                    print(f'{fileName} {pep}({mw}): {len(table_)}')

#### 2. 仅获取未修饰的肽段信息，并进一步绘制MS2谱图

In [None]:
# Return only the unmodified peptide matches, one entry per group directory.
def generate_result(prot_seq,ms_path,mod):
    """Match ``prot_seq`` against the MS data of every group directory in ``ms_path``.

    Parameters
    ----------
    prot_seq : protein sequence handed to ``MSPiepline.generate_mod``.
    ms_path : root folder; joined to names by string concatenation, so it
        must end with a path separator.
    mod : dict whose ``'Ide'`` entry is passed as the second argument of
        ``generate_mod``.

    Returns
    -------
    dict ``{group: {'non_mod': result}}`` where ``result`` is the first of
    the two objects returned by ``generate_mod``; the second one is discarded.

    Entries of ``ms_path`` that are not directories are skipped: the same
    folder also receives this notebook's spreadsheet outputs, which would
    otherwise be treated as groups on a re-run.
    """
    total_result = {}
    for group in listdir(ms_path):
        if not os.path.isdir(f'{ms_path}{group}'):
            continue
        mrd = MSPiepline(f'{ms_path}{group}/',f'MS1_output_{group}')
        result,_ = mrd.generate_mod(prot_seq, mod['Ide'])
        total_result[group] = {'non_mod':result}
    return total_result

# Combine the matches from several MS files into one MS1 summary table per group.
def integrate_pep_infor(total_result):
    """Summarise how often each matched peptide occurs in each MS file.

    For every table stored under ``total_result[group]`` the distinct
    (match_pep, start_site, end_site) rows are kept, and one column per
    ``fileName`` is added holding the number of rows of the source table with
    that peptide and file name (cells stay empty where a file has no row).
    A leading 'Condition' column carries the group name. Each summary is
    written to ``{ms_path}{group}.xlsx`` using the module-level ``ms_path``.

    Returns a dict ``{group: summary}``; if a group holds several tables, the
    first summary is the one kept in the dict.
    """
    site_cols = ['match_pep','start_site','end_site']
    pep_infor = {}
    for sample,condition in total_result.items():
        for table in condition.values():
            summary = table.drop_duplicates(subset=site_cols)[site_cols].reset_index(drop=True)
            # Snapshot the peptides first: the loop below adds columns to ``summary``.
            for row,pep in enumerate(summary['match_pep'].tolist()):
                per_file = table[table['match_pep']==pep].groupby(by='fileName')
                for fileName,hits in per_file:
                    summary.loc[row,fileName] = len(hits)
            summary.insert(0,'Condition',sample)
            summary.to_excel(f'{ms_path}{sample}.xlsx',index=False)
            pep_infor.setdefault(sample,summary)
    return pep_infor

In [None]:
# Compare the experimental MS2 peaks of every matched MS1 entry with the
# theoretical MS2 of its peptide; when at least one peak matches, draw the
# spectrum with the b/y ion peaks annotated.
def ms2_plot(data, uv_infor, non_uv_infor, remain=1, tolerance=1):
    """Match experimental MS2 peaks to theoretical b/y fragments and plot hits.

    Parameters
    ----------
    data : DataFrame of matched MS1 entries. Positional columns 0, 1, 2, 5
        and 6 are read as file name, scan, RT, charge and peptide.
    uv_infor, non_uv_infor : MS2 peak tables with 'fileName' and 'scan'
        columns. Positional columns 6 and 7 are read as fragment mass and
        intensity (presumably 'Fragment m/z' / 'Fragment Inmtensity' — verify
        against the CSV layout). ``non_uv_infor`` is used when the file name
        contains 'non-UV', ``uv_infor`` otherwise.
    remain : number of decimals both experimental and theoretical masses are
        rounded to before comparison.
    tolerance : a peak matches when it lies strictly within
        ``theoretical ± tolerance``.

    The theoretical tables come from the module-level ``pep_ms2`` dict
    (peptide -> {charge: table}); each matching spectrum is handed to
    ``matched_ms2_plot``. Nothing is returned.

    Fixes relative to the earlier version: the ``data`` argument is actually
    used (instead of the global ``table_concat``), rows are addressed purely
    by position, and the precursor charge unpacked from ``data`` is no longer
    overwritten by the loop over theoretical charge states before it is
    passed to the plot.
    """
    for i in tqdm(range(len(data))):
        fileName, scan, rt, charge, pep = data.iloc[i,[0,1,2,5,6]].values
        ms2_source = non_uv_infor if 'non-UV' in fileName else uv_infor
        target_ms2_ = ms2_source[(ms2_source['fileName']==fileName)&(ms2_source['scan']==scan)]
        target_ms2_ = target_ms2_.reset_index(drop=True)
        if target_ms2_.empty:
            # No MS2 peaks for this scan: nothing to match or plot.
            continue

        # Round the theoretical masses once per peptide rather than once per peak.
        theo_tables = []
        for theo_table in pep_ms2[pep].values():
            theo_table_ = theo_table.copy()
            theo_table_['frag_mass (b)'] = theo_table_['frag_mass (b)'].round(remain)
            theo_table_['frag_mass (y)'] = theo_table_['frag_mass (y)'].round(remain)
            theo_tables.append(theo_table_)

        match_peak = []
        for j in range(len(target_ms2_)):
            frag_mass = round(target_ms2_.iloc[j,6],remain)
            intensity = target_ms2_.iloc[j,7]
            for theo_table_ in theo_tables:
                for k in theo_table_.index:
                    theo_b = theo_table_.loc[k,'frag_mass (b)']
                    theo_y = theo_table_.loc[k,'frag_mass (y)']
                    # Per theoretical row a b-ion hit takes precedence over a y-ion hit.
                    if theo_b-tolerance < frag_mass < theo_b+tolerance:
                        match_peak.append((frag_mass,theo_table_.loc[k,'site (b)'],theo_b,intensity))
                    elif theo_y-tolerance < frag_mass < theo_y+tolerance:
                        match_peak.append((frag_mass,theo_table_.loc[k,'site (y)'],theo_y,intensity))

        if not match_peak:
            continue
        match_peak = pd.DataFrame(match_peak,columns=['frag_mass','site','theo_mass','frag_intensity'])
        matched_ms2_plot(target_ms2_,match_peak,fileName,scan,rt,charge,pep)

In [None]:
# Split ion labels into ion number and charge.
def _split_ion_sites(ion_match, ion):
    """Return ``(numbers, charges)`` parsed from ``ion_match['site']``.

    Labels are split on '^' and the left part on ``ion`` (e.g. a label of the
    form 'b3^2' yields number '3' and charge '2'). When the table is empty or
    the labels do not contain the expected separators, two empty lists are
    returned so the caller simply finds no pairs.
    """
    try:
        parts = ion_match['site'].str.split('^',expand=True)
        numbers = parts[0].str.split(ion,expand=True)[1].values
        charges = parts[1].values
    except (KeyError, AttributeError):
        return [], []
    return numbers, charges


# Draw one MS2 spectrum; if it contains two or more complementary b/y ion
# pairs, print its identifier.
def matched_ms2_plot(target_ms2_,match_peak,fileName,scan,rt,charge,pep):
    """Plot an MS2 spectrum with its matched b/y peaks and save it as PNG.

    Parameters
    ----------
    target_ms2_ : peak table with 'Fragment m/z' and 'Fragment Inmtensity'
        columns (all peaks, drawn in light gray).
    match_peak : table with 'frag_mass', 'site' and 'frag_intensity' columns;
        rows whose 'site' contains 'b' / 'y' (case-insensitive) are drawn in
        black and labelled in blue / red.
    fileName, scan, rt, charge : shown in the info box; ``fileName`` and
        ``scan`` also form the output file name.
    pep : peptide sequence, used as title and output sub-folder.

    A b/y pair counts as complementary when the ion numbers add up to
    ``len(pep)`` and the charges are equal; ``{pep}/{fileName}_{scan}`` is
    printed when at least two such pairs exist.

    The figure is written to ``{ms_path}ms2_plot/{pep}/{fileName}_{scan}.png``
    (``ms_path`` is the module-level variable). The output folder, including
    the ``ms2_plot`` parent, is created when missing, and the figure is closed
    after saving so a long batch does not keep every figure in memory.
    """
    b_match = match_peak[match_peak.site.str.contains('b',case=False)].reset_index(drop=True)
    y_match = match_peak[match_peak.site.str.contains('y',case=False)].reset_index(drop=True)

    b_index,b_charge = _split_ion_sites(b_match,'b')
    y_index,y_charge = _split_ion_sites(y_match,'y')
    count = 0
    for b,bc in zip(b_index,b_charge):
        for y,yc in zip(y_index,y_charge):
            if int(b)+int(y) == len(pep) and int(bc)==int(yc): count += 1
    if count >= 2: print(f'{pep}/{fileName}_{scan}')

    # Anchor of the info box, relative to the largest m/z and intensity.
    x_site = target_ms2_['Fragment m/z'].max()*0.4
    y_site = target_ms2_['Fragment Inmtensity'].max()*0.7

    text = []
    fig = plt.figure(figsize=(4,3),dpi=100)
    plt.rcParams['font.sans-serif'] = 'Arial'
    plt.bar(target_ms2_['Fragment m/z'], target_ms2_['Fragment Inmtensity'],
            color='lightgray',width=3,zorder=0)
    # b ions first, then y ions, so labels are collected in that order.
    for ion_match,label_color in ((b_match,'blue'),(y_match,'red')):
        if ion_match.empty:
            continue
        plt.bar(ion_match['frag_mass'], ion_match['frag_intensity'],
                color='black',width=3,zorder=10)
        for m in ion_match.index:
            text.append(plt.text(ion_match.loc[m,'frag_mass'],ion_match.loc[m,'frag_intensity'],
                                  ion_match.loc[m,'site'],zorder=200,fontsize=12,color=label_color))
    adjust_text(text,arrowprops=dict(arrowstyle='-',color='black'))
    plt.xlabel('m/z',fontsize=12)
    plt.ylabel('Intensity',fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.title(pep,fontsize=14)
    plt.text(x_site,y_site,f'{fileName}\nscan: {scan}\nRT: {rt:.3f}\nCharge: {charge}',
              fontsize=10,color='black',bbox=dict(boxstyle='round,pad=0.5',fc='white',alpha=0.5))
    out_dir = f'{ms_path}ms2_plot/{pep}'
    os.makedirs(out_dir,exist_ok=True)
    plt.savefig(f'{out_dir}/{fileName}_{scan}.png',dpi=300,format='png',bbox_inches='tight')
    plt.close(fig)

In [None]:
# Load the reviewed mouse UniProt table (tab-separated export).
uniprot = pd.read_csv('D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/Test files/uniprotkb_mouse_reviewed_2024_11_20.tsv',sep='\t')
# Keep only the first space-separated token of 'Gene Names' so it can be compared to a single gene symbol.
uniprot['Gene Names'] = uniprot['Gene Names'].str.split(' ',expand=True)[0]
# First row whose gene is Trim14, last column — presumably the protein sequence column; verify against the TSV.
prot_seq = uniprot[uniprot['Gene Names']=='Trim14'].iloc[0,-1]

# Root folder of this experiment; must end with '/' because paths are built by concatenation.
ms_path = 'F:/Dizco/MS_files/2024.11.20/'
# Value passed to generate_mod under the 'Ide' key (0 here — presumably "no mass shift"; TODO confirm).
mod = {'Ide':0}

In [None]:
# Compute the theoretical peptides and their masses, and find the MS1 entries
# in the supplied MS files whose mass matches them.
total_result = generate_result(prot_seq,ms_path,mod)
pep_infor = integrate_pep_infor(total_result)
# Outer merge keeps peptides that were found in only one of the two conditions.
pep_infor = pd.merge(pep_infor['uv'], pep_infor['non_uv'],on=['match_pep','start_site','end_site'],how='outer')
pep_infor.to_excel(f'{ms_path}pep_infor_Pcx.xlsx',index=False)
# Tag every match table with its group name, then stack them with a single
# concat call (instead of growing a frame inside the loop from an empty seed).
match_tables = []
for sample,condition in total_result.items():
    for cond,table in condition.items():
        table['type'] = sample
        match_tables.append(table)
table_concat = pd.concat(match_tables,axis=0)
# NOTE(review): the summary above is written to pep_infor_Pcx.xlsx but a different
# file, pep_infor_TRIM14.xlsx, is read back here, and it must contain an 'Index'
# column that the written table does not have — presumably a hand-curated copy; confirm.
pep_infor = pd.read_excel(f'{ms_path}pep_infor_TRIM14.xlsx')
# Attach the peptide index to every match (inner join on the peptide sequence).
table_concat = pd.merge(table_concat, pep_infor[['Index','match_pep']],on='match_pep')
table_concat['Seq_Intensity'] = np.log2(table_concat['Seq_Intensity'])

In [None]:
# Plot the RT distribution of the matched theoretical peptides under each
# condition (the original note also mentions intensity; only RT is drawn here).
plt.figure(dpi=300,figsize=(20,3))
plt.rcParams['font.sans-serif'] = 'Arial'
ax=sns.boxplot(data=table_concat, x='Index', y='RT', hue='type',
            width=0.5,showfliers=False,)
# Build the comparison pairs from the data itself: only peptide indices that
# actually have rows for both 'uv' and 'non_uv' can be tested. The peptide
# table comes from an outer merge, so some peptides may exist in one condition only.
types_per_index = table_concat.groupby('Index')['type'].unique()
box_pairs = [((idx,'uv'),(idx,'non_uv')) for idx,types in types_per_index.items()
             if {'uv','non_uv'} <= set(types)]
if box_pairs:
    annot = Annotator(ax, data=table_concat, x='Index', y='RT', hue='type',pairs=box_pairs)
    annot.configure(test='t-test_ind', text_format='star',line_height=0.03,line_width=1)
    annot.apply_and_annotate()
plt.xlabel('Peptide index',fontsize=14,labelpad=5)
plt.ylabel('RT',fontsize=14,labelpad=5)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Compute the theoretical MS2 spectrum of every theoretical peptide
# (per the original note, only charge states 2-4 are computed by default).
pms2 = ParseMS2()
# Maps peptide sequence -> result of theoMS2; ms2_plot iterates it as {charge: table}.
pep_ms2 = {pep:pms2.theoMS2(pep) for pep in pep_infor['match_pep']}

# Compare the computed theoretical MS2 with the matched MS1 entries to see
# whether the MS2 scans in the MS files contain matching peaks.
uv_infor = pd.read_csv(f'{ms_path}uv/MS1_output_uv.csv')
non_uv_infor = pd.read_csv(f'{ms_path}non_uv/MS1_output_non_uv.csv')

# Saves one PNG per spectrum that has at least one matching b/y peak.
ms2_plot(table_concat,uv_infor,non_uv_infor)