In [6]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Read the per-gene PDB table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../paper_data/pdbs.csv')
# Preview the first five rows to confirm the columns loaded as expected.
data.head(n=5)

Unnamed: 0,1_Group,2_Gene,3_HGNC,4_Uni_entry,5_Uni_acc,6_Domain_begin,7_Domain_end,8_Domain_length,9_Largest_insert_length,10_PDB_validation,11_Conformational_state,12_Dihedral_state,13_Group_in_Uni,14_Group_in_Manning,15_Synonymn
0,AGC,AKT1,HGNC:391,AKT1_HUMAN,P31749,150,408,259,9,6NPZB,DFGin,BLAminus,AGC,AGC,"PKB,RAC"
1,AGC,AKT2,HGNC:392,AKT2_HUMAN,P31751,152,409,258,9,3E8DB,DFGin,BLAminus,AGC,AGC,
2,AGC,AKT3,HGNC:393,AKT3_HUMAN,Q9Y243,148,405,258,9,,,,AGC,AGC,PKBG
3,AGC,CDC42BPA,HGNC:1737,MRCKA_HUMAN,Q5VT25,77,343,267,14,,,,AGC,AGC,KIAA0451
4,AGC,CDC42BPB,HGNC:1738,MRCKB_HUMAN,Q9Y5S2,76,342,267,14,5OTFA,DFGin,BLAminus,AGC,AGC,KIAA1124


In [3]:
# Final column order, expressed with the original (pre-rename) state column name.
columns_titles = ['PDB_ID', 'Chain', '11_Conformational_state']

# Build the cleaned table in one pass instead of mutating it step by step:
#   1. keep the combined PDB/chain code and the conformational state,
#   2. discard rows where either value is missing,
#   3. split the combined code into its first four characters (PDB_ID)
#      and everything after them (Chain),
#   4. drop the combined code, order the columns, rename the state column,
#      and renumber the rows from zero.
data_filtered = (
    data[['10_PDB_validation', '11_Conformational_state']]
    .dropna()
    .assign(
        PDB_ID=lambda frame: frame['10_PDB_validation'].str.slice(0, 4),
        Chain=lambda frame: frame['10_PDB_validation'].str.slice(4),
    )
    .drop('10_PDB_validation', axis=1)
    .reindex(columns=columns_titles)
    .rename(columns={'11_Conformational_state': 'conf_state'})
    .reset_index(drop=True)
)

data_filtered

Unnamed: 0,PDB_ID,Chain,conf_state
0,6NPZ,B,DFGin
1,3E8D,B,DFGin
2,5OTF,A,DFGin
3,2VD5,A,DFGin
4,5UVC,A,DFGin
...,...,...,...
261,2OSC,A,DFGout
262,3EQP,B,DFGin
263,3ZON,A,DFGin
264,4GVJ,A,DFGin


In [None]:
# Save the cleaned PDB_ID / Chain / conf_state table, omitting the row index.
data_filtered.to_csv(path_or_buf='aligned_pdb_list.csv', index=False)

In [4]:
# Tally how many rows carry each conformational-state label.
value_counts = data_filtered.loc[:, 'conf_state'].value_counts()
print(value_counts)


conf_state
DFGin       240
DFGout       17
DFGinter      9
Name: count, dtype: int64


In [None]:
import pandas as pd
import requests, json, datetime
from tqdm import tqdm


# Per-request timeout in seconds, so a stalled connection raises instead of
# hanging the kernel indefinitely.
REQUEST_TIMEOUT = 120

# Number of entry IDs sent per GraphQL request.
CHUNK_SIZE = 100

# Step 1 -- RCSB search API: list every entry whose
# rcsb_entry_info.nonpolymer_entity_count is greater than 0.
search_query = '''{ "query":{
      "type": "terminal",
      "label": "text",
      "service": "text",
      "parameters": {
        "attribute": "rcsb_entry_info.nonpolymer_entity_count",
        "operator": "greater",
        "negation": false,
        "value": 0
      }
    },
  "request_options": {
    "return_all_hits": true
  },
    "return_type": "entry"
  }'''

search_response = requests.get(
    'https://search.rcsb.org/rcsbsearch/v2/query',
    params={'json': search_query},
    timeout=REQUEST_TIMEOUT,
)
# Fail here with a clear HTTP error rather than later with a JSON/KeyError.
search_response.raise_for_status()
PDBs = [hit['identifier'] for hit in search_response.json()['result_set']]

# Step 2 -- RCSB data API (GraphQL): for each entry fetch the deposit date and,
# per polymer entity, its cluster memberships, author chain IDs and reference
# sequence identifiers. `%s` is filled with a comma-separated list of quoted IDs.
query = """
{
  entries(entry_ids:[%s]){
    rcsb_id
    rcsb_accession_info {
      deposit_date
      }
    polymer_entities {
    rcsb_cluster_membership {
        cluster_id
        identity
    }
      rcsb_polymer_entity_container_identifiers {
        auth_asym_ids
        reference_sequence_identifiers {
          database_accession
          database_name
        }
      }
    }
  }
}"""

# One dict per polymer entity that has cluster membership data.
info = []

for i in tqdm(range(0, len(PDBs), CHUNK_SIZE)):
    chunk = PDBs[i:i + CHUNK_SIZE]
    entry_ids = ','.join(f'"{p}"' for p in chunk)
    response = requests.post(
        'https://data.rcsb.org/graphql',
        json={'query': query % entry_ids},
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    payload = response.json()

    for e in payload['data']['entries']:
        pdb = e['rcsb_id']
        date = e['rcsb_accession_info']['deposit_date']
        year = datetime.datetime.strptime(date,'%Y-%m-%dT%H:%M:%SZ').year

        # `polymer_entities` may be empty/None; treat that as "no entities".
        for entity in e['polymer_entities'] or []:
            memberships = entity['rcsb_cluster_membership']
            if not memberships:
                continue

            # One column per identity threshold, e.g. clust30, clust50, ...
            datum = {
                f'clust{cluster["identity"]}': cluster['cluster_id']
                for cluster in memberships
            }

            # NOTE: chain IDs are concatenated with no separator, so the
            # individual IDs cannot be recovered unambiguously from `name`
            # when any of them is longer than one character.
            name = f'{pdb}_' + ''.join(entity['rcsb_polymer_entity_container_identifiers']['auth_asym_ids'])
            datum['name'] = name
            datum['year'] = year
            datum['pdb'] = pdb
            info.append(datum)

info = pd.DataFrame(info)

In [21]:
# Load the saved sequence-cluster table from disk.
# NOTE: this rebinds `info`, replacing any DataFrame held under that name.
info = pd.read_csv(filepath_or_buffer='../paper_data/pdb_sequence_info.csv')
print(info.shape)
# Preview the first five rows.
info.head(n=5)

(320334, 10)


Unnamed: 0.1,Unnamed: 0,clust100,clust95,clust90,clust70,clust50,clust30,name,year,pdb
0,0,98329,11,18,46,98,226,142L_A,1993,142L
1,1,98341,11,18,46,98,226,157L_A,1994,157L
2,2,730,11,18,46,98,226,182L_A,1995,182L
3,3,2,2,2,11,33,40,193L_A,1995,193L
4,4,15695,18253,19957,17829,8978,426,1A0F_AB,1997,1A0F


In [23]:
# Sanity check: does the exact value '5CEN' appear in any cell of `info`?
(
    info
    .isin(['5CEN'])  # element-wise membership mask
    .any()           # collapse to one flag per column
    .any()           # collapse to a single flag
)

False

In [25]:
# Rows of the merged table: one tuple per (structure, matching info row).
data_list = []
# PDB IDs from data_filtered for which no info row matched.
unaccounted = []

# Restrict `info` once to the PDB IDs we actually look up, so the per-row
# matching below scans a small frame instead of the full table on every
# iteration. Row order is preserved, so the matches and their order are the
# same as when filtering the full table.
wanted_info = info[info['pdb'].isin(data_filtered['PDB_ID'])]

# The text after the last '_' in `name`, computed once outside the loop.
chain_field = wanted_info['name'].str.split('_').str[-1]

for _, row in tqdm(data_filtered.iterrows(), total=len(data_filtered)):
    pdb_id = row['PDB_ID']
    chain = row['Chain']
    conf = row['conf_state']

    # NOTE(review): this is a plain substring test against the chain field, so
    # a chain such as 'A' also matches a field that contains a longer ID like
    # 'AA'. Kept as-is to preserve results -- confirm this is acceptable.
    is_match = (wanted_info['pdb'] == pdb_id) & chain_field.str.contains(chain, regex=False, na=False)
    matching_info_rows = wanted_info[is_match]

    if matching_info_rows.empty:
        unaccounted.append(pdb_id)

    for _, info_row in matching_info_rows.iterrows():
        data_tuple = (pdb_id, chain, conf, info_row['clust30'], info_row['clust50'], info_row['clust70'])
        data_list.append(data_tuple)

final_df = pd.DataFrame(data_list, columns=['pdb_id', 'chain', 'conf_state', 'clust30', 'clust50', 'clust70'])

final_df.head()

266it [00:36,  7.34it/s]


Unnamed: 0,pdb_id,chain,conf_state,clust30,clust50,clust70
0,6NPZ,B,DFGin,2,162227,178319
1,3E8D,B,DFGin,2,8382,7890
2,5OTF,A,DFGin,2,2798,21423
3,2VD5,A,DFGin,2,151147,165748
4,5UVC,A,DFGin,59579,83463,85983


In [26]:
# Save the merged table. This is the same path that data_filtered is written
# to earlier in the notebook, so this call replaces that file.
# index=False matches that earlier write and keeps the positional index out of
# the file, so reloading it does not produce an extra unnamed column.
final_df.to_csv('aligned_pdb_list.csv', index=False)

In [13]:
# Report the merged table's size as (rows, columns).
n_rows, n_cols = final_df.shape
print((n_rows, n_cols))

(243, 6)


In [11]:
# Count the distinct non-null clust30 values among the matched rows.
unique_values_count = len(final_df['clust30'].dropna().unique())
print(unique_values_count)

37
