In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
import logging
import joblib
import pickle
import lmdb
from Bio import PDB
from Bio.PDB import PDBExceptions
from torch.utils.data import Dataset
from tqdm.auto import tqdm

import torch
import matplotlib.pyplot as plt
import seaborn as sns

import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1

def get_fasta_from_pdb(pdb_file):
    parser = PDBParser()
    structure = parser.get_structure("pdb", pdb_file)
    
    fasta_sequence = ""
    for chain in structure.get_chains():
        for residue in chain.get_residues():
                fasta_sequence += seq1(residue.get_resname())
    
    return fasta_sequence

In [4]:
seq1('ASP')

'D'

In [5]:
structure_dir = "/datapool/data2/home/jiahan/Data/PepMerge_new/"
seqs_dir = "/datapool/data2/home/jiahan/ResProj/PepDiff/frame-flow/Data/seqs"
bind_dic = torch.load("/datapool/data2/home/jiahan/ResProj/PepDiff/frame-flow/misc/affinity_dict.pt")

In [6]:
all_pdbs = os.listdir(structure_dir)
print(len(all_pdbs))
all_pdbs = [x for x in all_pdbs if x in bind_dic]
print(len(all_pdbs))

10348
9464


In [7]:
get_fasta_from_pdb(os.path.join(structure_dir,all_pdbs[0],'pocket.pdb'))

'YDQTLFSIVEWARSSFRELKVDDQMKLLQNCWSLLPYNNLLIEMLHAK'

In [None]:
with open(os.path.join(seqs_dir,'seqs.fasta'),'w') as f:
    for pdb in tqdm(all_pdbs):
        fasta = get_fasta_from_pdb(os.path.join(structure_dir,pdb,'receptor.pdb'))
        f.write(f'>{pdb}\n')
        f.write(fasta+'\n')
# mmseqs easy-cluster seqs.fasta clusterRes tmp --min-seq-id 0.4 -c 0.8 --cov-mode 1

In [4]:
tab = pd.read_csv('/datapool/data2/home/jiahan/ResProj/PepDiff/frame-flow/Data/seqs/clusterRes_cluster.tsv',sep='\t',header=None)
tab.columns = ['center','id']
tab['cnts'] = tab.groupby('center')['id'].transform('count')
tab.sort_values('cnts',ascending=False,inplace=True)
tab

Unnamed: 0,center,id,cnts
720,6u3n_C,3kla_C,657
561,6u3n_C,5ib3_C,657
563,6u3n_C,5nmk_C,657
564,6u3n_C,3mrm_P,657
565,6u3n_C,6g9r_P,657
...,...,...,...
8617,1g0y_I,1g0y_I,1
4209,3c01_A,3c01_A,1
4198,4tjx_B,4tjx_B,1
4184,6ybb_1_III_C,6ybb_1_III_C,1


In [13]:
cnts = tab.drop_duplicates('center')

In [21]:
cnts[cnts['cnts']<5]['cnts'].sum()

2019

In [22]:
cnts[cnts['cnts']<5]['cnts'].sum()

2019

In [23]:
10384-2019

8365

In [11]:
tab.to_csv('/datapool/data2/home/jiahan/ResProj/PepDiff/frame-flow/Data/seqs/center.csv',index=None)

In [5]:
len(set(tab['center']))

1557

In [10]:
len(set(tab['center']))

1557

In [7]:
cnts = pd.DataFrame(tab['center'].value_counts())
cnts = cnts.drop_duplicates(subset='center')
cnts.to_csv('/datapool/data2/home/jiahan/ResProj/PepDiff/frame-flow/Data/seqs/center.csv',index=None)

In [7]:
samples = pd.read_csv("/datapool/data2/home/jiahan/Res Proj/PepDiff/frame-flow/misc/231220/sample_all.csv")
samples

Unnamed: 0,id,len,tran,aar,rot,trans_loss,rot_loss
0,1a0n_A,14,8.903144,0.214286,2.382832,0.097863,1.207208
1,1a1m_C,9,0.920723,0.555556,0.370409,0.419278,0.570159
2,1a1n_C,8,0.678300,0.875000,0.452785,0.224458,0.172276
3,1a1o_C,9,0.713562,0.777778,0.573104,0.211861,0.160051
4,1a1r_C,16,1.512850,0.875000,1.068279,0.422904,0.476659
...,...,...,...,...,...,...,...
9459,8s9i_1_III_B,22,4.103814,0.272727,2.341589,2.404889,1.574362
9460,8siu_1_III_B,18,22.693531,0.111111,2.687794,0.371103,0.328657
9461,8sm5_5_III_J,17,3.152646,0.117647,1.861484,0.964529,0.384649
9462,8t0p_1_III_C,18,12.888019,0.055556,2.724181,0.546045,0.609222


In [8]:
res = pd.merge(tab,samples,on='id')
# res[['center','id','cnts','len']].to_csv('/datapool/data2/home/jiahan/Res Proj/PepDiff/frame-flow/Data/seqs/meta_data.csv',index=False)
res.to_csv('/datapool/data2/home/jiahan/Res Proj/PepDiff/frame-flow/Data/seqs/meta_data.csv',index=False)

In [9]:
res = pd.read_csv('/datapool/data2/home/jiahan/Res Proj/PepDiff/frame-flow/Data/seqs/meta_data.csv')
res

Unnamed: 0,center,id,cnts,len,tran,aar,rot,trans_loss,rot_loss
0,6u3n_C,3kla_C,657,9,0.917654,0.666667,0.654739,0.060250,0.475746
1,6u3n_C,5ib3_C,657,9,0.668258,0.444444,0.365615,0.129841,0.102632
2,6u3n_C,5nmk_C,657,9,0.797159,0.888889,0.278266,0.109927,0.236671
3,6u3n_C,3mrm_P,657,10,0.709478,0.600000,0.199372,0.101338,0.088611
4,6u3n_C,6g9r_P,657,9,0.816782,0.888889,0.291310,0.237174,0.124762
...,...,...,...,...,...,...,...,...,...
9459,1g0y_I,1g0y_I,1,21,13.338346,0.047619,2.600826,0.386976,0.615135
9460,3c01_A,3c01_A,1,20,9.313918,0.300000,2.057358,0.151006,0.600791
9461,4tjx_B,4tjx_B,1,3,0.425327,1.000000,0.507535,0.018409,1.391891
9462,6ybb_1_III_C,6ybb_1_III_C,1,22,7.948393,0.272727,2.600759,0.083613,0.395743


In [10]:
centers = set((res[(res['cnts']>=10)&(res['cnts']<=100)])['center'])
len(centers)

158

In [23]:
tests = random.sample(centers, 10)

since Python 3.9 and will be removed in a subsequent version.
  tests = random.sample(centers, 10)


In [24]:
tmp = res[res['center'].isin(tests)]
tmp

Unnamed: 0,center,id,cnts,len,tran,aar,rot,trans_loss,rot_loss
3941,8dgo_1_III_C,1r1s_B,36,6,0.679843,0.833333,0.430853,0.337473,0.250865
3943,8dgo_1_III_C,8dgo_1_III_C,36,4,0.425329,1.000000,0.482743,0.090849,0.363330
3944,8dgo_1_III_C,1jyr_L,36,9,0.946899,0.888889,0.403077,0.162988,0.471346
3945,8dgo_1_III_C,3mxy_L,36,7,0.897764,1.000000,1.040277,0.093215,0.483877
3946,8dgo_1_III_C,2h5k_C,36,3,0.386013,1.000000,0.431408,0.270294,0.130450
...,...,...,...,...,...,...,...,...,...
6194,2r02_B,3c3o_B,10,13,3.789058,0.230769,2.498516,0.384786,0.289986
6201,2r02_B,2r05_B,10,11,1.030670,0.727273,0.303218,0.352307,0.148563
6211,2r02_B,5v3r_B,10,13,3.138704,0.384615,2.109781,0.197829,0.174575
6212,2r02_B,2xs1_B,10,15,3.713773,0.066667,2.264070,0.378179,0.197648


In [25]:
tmp['tran'].mean()

2.3520436403266887

In [26]:
with open("/datapool/data2/home/jiahan/Res Proj/PepDiff/frame-flow/Data/RF_samples/names.txt",'w') as f:
    for i,row in tmp.iterrows():
        f.write(row['id']+'\n')