#### 该部分是为了以修饰到的氨基酸位点为中心，计算与邻接氨基酸的距离，以构建30x20的矩阵

In [None]:
import sys
sys.path.append('D:/All_for_paper/Proteomics/PhD_project/1. Dizco/')
from os import listdir
import gzip
from tqdm import tqdm
import pickle
from multiprocessing import Pool
from ExtractCoordinate import gain_prot_loc
from scipy.spatial import distance
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

##### AlphaFold: https://alphafold.com/download

In [None]:
#解压从AlphaFold上下载的.gz文件
def untar(file,tar_path,des_path):
    """Decompress one gzip archive into the destination folder.

    Args:
        file: File name of the archive; the output file name is this name
            with a trailing ``.gz`` removed.
        tar_path: Full path of the ``.gz`` archive to read.
        des_path: Destination folder prefix. The output name is appended by
            plain string concatenation, so it must end with a path separator.
    """
    # Strip only the trailing suffix; a '.gz' elsewhere in the name is kept.
    out_name = file[:-len('.gz')] if file.endswith('.gz') else file
    # Context managers close both handles even when decompression fails,
    # instead of leaving them open until garbage collection.
    with gzip.open(tar_path, 'rb') as src, open(des_path + out_name, 'wb') as dst:
        dst.write(src.read())

# Folder holding the archives downloaded from AlphaFold.
path = 'D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/AlphaFold_pdb/'
# Folder that receives the decompressed files.
des_path = 'D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/AlphaFold_pdbFiles/'

# Decompress every entry of the download folder, with a progress bar.
for file in tqdm(listdir(path)):
    untar(file, f'{path}{file}', des_path)

In [None]:
# Read the PDB files and collect the spatial information of every residue.
# AlphaFold provides 23,395 protein PDB files in total, 20,508 of them unique
# (new PDB files were downloaded on 2024-11-05, but they were not checked for
# duplicated data).
# Roughly three thousand proteins are duplicated, i.e. share one UniProt ID;
# for those we keep the PDB file with the longest sequence.
# Some proteins are not predicted at full length in AlphaFold, so some binding
# sites have no spatial information there and are therefore omitted.
fileName = listdir(des_path)
with Pool(4) as p:
    output = p.map(gain_prot_loc,tqdm(fileName))

prot_loc_dic = {}
for prot in output:
    # NOTE(review): each result is treated as a one-entry mapping whose first
    # key identifies the protein — confirm against gain_prot_loc.
    prot_id, table = next(iter(prot.items()))
    # Dict membership is O(1); the original rebuilt a list of all keys for
    # every protein. Keep an entry only if it is new or strictly longer than
    # the one already stored.
    if prot_id not in prot_loc_dic or len(table) > len(prot_loc_dic[prot_id]):
        prot_loc_dic.update(prot)

# The context manager guarantees the pickle is flushed and the handle closed.
with open('D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/pkl_files/Coordinates of residues of proteins.pkl','wb') as pkl_file:
    pickle.dump(prot_loc_dic,pkl_file)

##### 开始计算氨基酸的patterns，并保存至相应的文件夹下

In [None]:
#计算蛋白的距离矩阵
def dist_matrix_cal(prot,method='euclidean'):
    """Return the all-against-all residue distance matrix of one protein.

    Args:
        prot: Key into the module-level ``prot_loc_dic``; the stored table is
            read through its ``Residue``, ``Res_id``, ``x``, ``y`` and ``z``
            columns.
        method: Metric name forwarded to ``scipy.spatial.distance.cdist``.

    Returns:
        A square DataFrame whose rows and columns are both labelled
        ``<Residue>_<Res_id>`` (axis name ``resName``).
    """
    residues = prot_loc_dic[prot].copy()
    # Residue label, e.g. residue name and residue id joined by an underscore.
    labels = residues['Residue'].map(str) + '_' + residues['Res_id'].astype(str)
    residues['resName'] = labels.tolist()

    xyz_rows = list(residues[['x', 'y', 'z']].values)
    pairwise = distance.cdist(xyz_rows, xyz_rows, metric=method)
    return pd.DataFrame(pairwise, index=residues['resName'], columns=residues['resName'])

#根据距离矩阵计算某一氨基酸0-15 A的范围内的邻近氨基酸，以生成pattern
def generate_pattern(res_table,res_site,res_id_trans):
    """Count neighbouring residues per distance shell around one residue.

    The distances in ``res_table[res_site]`` are split into 30 shells
    ``(d, d + 0.5]`` for ``d = 0, 0.5, ..., 14.5``. For every shell the
    neighbours are counted per residue type, the type being the part of
    ``resName`` before the first underscore.

    Args:
        res_table: DataFrame with a ``resName`` column and a ``res_site``
            column holding the distance of each residue to the centre residue.
        res_site: Name of the distance column.
        res_id_trans: Mapping whose values are the residue type names that
            become the output columns, in this order.

    Returns:
        DataFrame with one row per shell and one integer column per residue
        type (30 x 20 with the standard amino-acid table).
    """
    residues = list(res_id_trans.values())

    # A distance of exactly 0 is the centre residue itself; leave it out.
    neighbours = res_table[res_table[res_site] > 0]
    distances = neighbours[res_site].to_numpy()
    # `.str[0]` also works when no neighbour is left, where indexing the
    # result of `split(expand=True)` with [0] raises a KeyError.
    res_types = neighbours['resName'].str.split('_').str[0]

    rows = []
    for lower in np.arange(0, 15, 0.5):
        in_shell = (distances > lower) & (distances <= lower + 0.5)
        counts = res_types[in_shell].value_counts()
        # Reindexing onto the known residue types fixes the column set and
        # order, so residue names outside the table cannot add columns.
        rows.append(counts.reindex(residues, fill_value=0).to_numpy())

    # Build the frame once rather than concatenating inside the loop.
    return pd.DataFrame(rows, columns=residues, dtype='int64')

#将生成的pattern根据标签保存至相应路径
def generate_dist_matrix(probe,prot,dist_matrix,label_site,res,label_loc,labeled=None):
    """Compute the neighbour pattern of one residue and save it as CSV.

    The output folders ``labeled_site``, ``unlabeled_site`` and
    ``predictive_labeling`` are created below ``dist_path + probe`` (the
    module-level ``dist_path``). The pattern is written to the folder
    selected by ``labeled`` as ``<probe>_<prot>_<label_site>.csv``.

    Args:
        probe: Probe name; used as sub-folder name and file-name prefix.
        prot: Protein identifier; used in the file name.
        dist_matrix: Square residue distance matrix whose columns are labelled
            with three-letter residue name, underscore, position, and whose
            index is named ``resName``.
        label_site: Site label built from the one-letter code, e.g. ``C_123``.
        res: One-letter code of the residue at ``label_site``.
        label_loc: Not used by this function; kept for call compatibility.
        labeled: ``1`` -> ``labeled_site``, ``0`` -> ``unlabeled_site``,
            ``None`` -> ``predictive_labeling``. Any other value writes nothing.

    Returns:
        None. Nothing is written when the residue code is unknown or the site
        is not a column of ``dist_matrix``.
    """
    res_id_trans = {'A':'ALA','R':'ARG','N':'ASN','D':'ASP',
                    'C':'CYS','E':'GLU','Q':'GLN','G':'GLY',
                    'H':'HIS','I':'ILE','L':'LEU','K':'LYS',
                    'M':'MET','F':'PHE','P':'PRO','S':'SER',
                    'T':'THR','W':'TRP','Y':'TYR','V':'VAL'}

    # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
    # creates missing parent folders.
    probe_dir = dist_path + probe
    for sub_dir in ('labeled_site', 'unlabeled_site', 'predictive_labeling'):
        os.makedirs(probe_dir + '/' + sub_dir, exist_ok=True)

    # A one-letter code outside the table has no three-letter name to look up
    # in the matrix, so treat it like a missing site instead of raising.
    if res not in res_id_trans:
        return None
    res_site = label_site.replace(res, res_id_trans[res])

    # Only a missing column means "site not in the structure"; any other
    # error should surface rather than be swallowed.
    try:
        res_table = dist_matrix.loc[:, res_site].reset_index(drop=False)
    except KeyError:
        return None
    pattern = generate_pattern(res_table, res_site, res_id_trans)

    if labeled == 1:
        folder = 'labeled_site'
    elif labeled == 0:
        folder = 'unlabeled_site'
    elif labeled is None:
        folder = 'predictive_labeling'
    else:
        return None
    pattern.to_csv('{:}/{:}/{:}_{:}_{:}.csv'.format(probe_dir, folder, probe, prot, label_site), index=False)

#提取探针修饰的氨基酸位点
def extract_all_site(table):
    """Expand ';'-separated label sites into one row per labelled site.

    Args:
        table: DataFrame with the columns ``Master Protein Accessions``,
            ``Upper_Seq`` and ``label_site``. It must have exactly five
            columns, because the result reuses ``table.columns`` as header.

    Returns:
        DataFrame with ``table``'s column names whose rows are
        ``(protein, peptide, site, residue letter, position)``. The position
        is kept as a string.
    """
    # NOTE(review): each site is assumed to look like "<letter><position>",
    # e.g. "C12" — confirm against the PSM export.
    rows = []
    site_columns = zip(table['Master Protein Accessions'], table['Upper_Seq'], table['label_site'])
    for prot, pep, site_field in site_columns:
        for site in site_field.split(';'):
            site = site.strip()
            if not site:
                # A trailing or doubled ';' leaves an empty entry.
                continue
            # Take the whole run of digits after the residue letter; reading
            # only site[1] cut positions >= 10 down to their first digit.
            end = 1
            while end < len(site) and site[end].isdigit():
                end += 1
            position = site[1:end] or site[1:2]
            rows.append((prot, pep, site, site[0], position))
    return pd.DataFrame(rows, columns=table.columns)

#dizco中阳性结果里探针标记的氨基酸位点的距离矩阵的计算
def labeled_dist_matrix(probe,psm_table):
    """Write neighbour patterns for every residue of probe-labelled peptides.

    Every residue of each peptide gets a pattern: residues at a labelled
    position are saved with ``labeled=1``, all others with ``labeled=0``.

    Args:
        probe: Probe name, forwarded to ``generate_dist_matrix``.
        psm_table: DataFrame with ``Master Protein Accessions``, ``Upper_Seq``
            and ``label_site`` columns, laid out as ``extract_all_site``
            expects.
    """
    psm_table = psm_table[psm_table['Master Protein Accessions'].isin(uniprot_infor['Entry'])].reset_index(drop=True)

    for (prot,pep),table in psm_table.groupby(by=['Master Protein Accessions','Upper_Seq']):
        # NOTE(review): columns 1 and 5 of uniprot_infor are read by position
        # as review status and sequence — confirm against the TSV layout.
        review,seq = uniprot_infor[uniprot_infor['Entry']==prot].iloc[0,[1,5]]
        if review == 'unreviewed': continue
        # Without coordinates there is no distance matrix to build.
        if prot not in prot_loc_dic: continue

        offset = seq.find(pep)
        # find() returns -1 when the peptide is not in the sequence; going on
        # would number the sites from 0 and could match the wrong residues.
        if offset < 0: continue
        start = offset+1

        table = extract_all_site(table.reset_index(drop=True))
        # 1-based positions inside the peptide, built once per peptide.
        labeled_positions = set(table['label_loc'].astype(int).tolist())

        dist_matrix = dist_matrix_cal(prot)
        for num,res in enumerate(pep):
            labeled = 1 if num+1 in labeled_positions else 0
            label_site = '{:}_{:}'.format(res,start+num)
            label_loc = start+num-1
            generate_dist_matrix(probe,prot,dist_matrix,label_site,res,label_loc,labeled)

#dizco中阴性结果中氨基酸位点的距离矩阵的计算
def unlabeled_dist_matrix(probe,psm_table):
    """Write neighbour patterns for every residue of peptides without a label.

    Each residue is passed to ``generate_dist_matrix`` with ``labeled=None``,
    so the patterns are stored in the ``predictive_labeling`` folder.

    Args:
        probe: Probe name, forwarded to ``generate_dist_matrix``.
        psm_table: DataFrame with ``Master Protein Accessions`` and
            ``Upper_Seq`` columns.
    """
    psm_table = psm_table[psm_table['Master Protein Accessions'].isin(uniprot_infor['Entry'])].reset_index(drop=True)

    for (prot,pep),_ in psm_table.groupby(by=['Master Protein Accessions','Upper_Seq']):
        # NOTE(review): columns 1 and 5 of uniprot_infor are read by position
        # as review status and sequence — confirm against the TSV layout.
        review,seq = uniprot_infor[uniprot_infor['Entry']==prot].iloc[0,[1,5]]
        if review == 'unreviewed': continue
        # Without coordinates there is no distance matrix to build.
        if prot not in prot_loc_dic: continue

        offset = seq.find(pep)
        # find() returns -1 when the peptide is not in the sequence; going on
        # would number the sites from 0 and could match the wrong residues.
        if offset < 0: continue
        start = offset+1

        dist_matrix = dist_matrix_cal(prot)
        for num,res in enumerate(pep):
            label_site = '{:}_{:}'.format(res,start+num)
            label_loc = start+num-1
            generate_dist_matrix(probe,prot,dist_matrix,label_site,res,label_loc,labeled=None)


In [None]:
# Generate the residue distance matrices for every protein.
# Folder with the PSM tables and folder receiving the pattern CSV files.
psm_path = 'D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/Test files/'
dist_path = 'D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/distance_matrix/'
# Load the residue coordinates; the context manager closes the file handle.
# NOTE: pickle executes code on load — only open files produced by this notebook.
with open('D:/All_for_paper/1. PhD Work Program/3. Research project/1. Dizco/pkl_files/Coordinates of residues of proteins.pkl','rb') as pkl_file:
    prot_loc_dic = pickle.load(pkl_file)
processedPSM = pd.read_csv(psm_path+'merge_isoTOP_PSM_data.csv')
uniprot_infor = pd.read_csv('D:/All_for_paper/Microprotein/Database/Uniprot/uniprotkb_human_2024_09_28.tsv',sep='\t')

In [None]:
# Q9C0D9 and Q9NZV5 have no predicted structure in AlphaFold, so they are
# removed from the list.
filter_name = ['Q9C0D9','Q9NZV5']
# Rows are kept only when their 'lg_probs' value exceeds this threshold.
pre_prob = 0.85
for (probe,label),psm_table in tqdm(processedPSM.groupby(by=['probe','labeled'])):
    has_structure = ~psm_table['Master Protein Accessions'].isin(filter_name)
    psm_table = psm_table.loc[has_structure].reset_index(drop=True)

    if label == 1:
        # Labelled group: additionally require paired == 1, keep the site columns.
        confident = (psm_table['paired']==1)&(psm_table['lg_probs']>pre_prob)
        sites = psm_table.loc[confident, ['Master Protein Accessions','Upper_Seq','label_site','label_AA','label_loc']]
        psm_table = sites.drop_duplicates().reset_index(drop=True)
        labeled_dist_matrix(probe,psm_table)
    elif label == 0:
        # Unlabelled group: only protein and peptide are needed.
        confident = psm_table['lg_probs']>pre_prob
        peptides = psm_table.loc[confident, ['Master Protein Accessions','Upper_Seq']]
        psm_table = peptides.drop_duplicates().reset_index(drop=True)
        unlabeled_dist_matrix(probe,psm_table)