In [27]:
import concurrent
import json
import os
import pickle
import re
import time
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from glob import glob

# modules for getting JSONs from a URL / parsing HTML
import requests
from bs4 import BeautifulSoup

# NOTE(review): absolute, machine-specific working directory; every relative
# path used in this notebook resolves against it.
os.chdir("/home/adri/Projects/phd/pipeline")
class_a_path = "data/raw/class_a.json"
ligand_type_path = "data/raw/lig_type.json"

# Load both JSON documents. The encoding is pinned to UTF-8 so the result does
# not depend on the locale's default text encoding.
with open(class_a_path, encoding="utf-8") as f:
    class_a = json.load(f)

with open(ligand_type_path, encoding="utf-8") as f:
    ligand_type = json.load(f)

complex_dynids = ligand_type[1]['dyn_id']
class_a_dyn_ids = class_a[0]["dyn_id"]

# Keep the ids from complex_dynids that also occur in class_a_dyn_ids, in the
# order (and with the multiplicity) they have in complex_dynids.
# Membership is tested against a set so the scan is linear rather than
# list-in-list; this assumes the ids are hashable (the ids shown in this
# notebook's output are ints).
class_a_dyn_id_set = set(class_a_dyn_ids)
overlapping = [dyn_id for dyn_id in complex_dynids if dyn_id in class_a_dyn_id_set]

In [2]:
def fetch_model_pdb(dynid, timeout=30):
    """Return the ``pdb_namechain`` field the GPCRmd API reports for ``dynid``.

    Args:
        dynid: Dynamics id interpolated into the ``search_dyn`` API URL.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block its worker thread indefinitely.

    Returns:
        The ``"pdb_namechain"`` value of the first element of the JSON reply.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a non-success HTTP status (via ``raise_for_status``).
        IndexError, KeyError: If the JSON reply is empty or lacks the field.
    """
    url = f"https://devel.gpcrmd.org/api/search_dyn/{dynid}"
    response = requests.get(url, timeout=timeout)
    # Fail with the HTTP status instead of a confusing JSON/Key error later.
    response.raise_for_status()
    return response.json()[0]["pdb_namechain"]

pdb_dict = {}

# Fan the lookups out over a thread pool. `max_workers` caps how many requests
# run at once; lower it if the server's rate limits require.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which dynid each future belongs to, so results can be keyed by it.
    future_to_dynid = {}
    for dynid in overlapping:
        future_to_dynid[executor.submit(fetch_model_pdb, dynid)] = dynid

    # Results are collected in completion order, not submission order.
    for future in concurrent.futures.as_completed(future_to_dynid):
        dynid = future_to_dynid[future]
        try:
            fetched = future.result()
        except Exception as exc:
            # A failed lookup is reported and simply left out of pdb_dict.
            print(f'{dynid} generated an exception: {exc}')
        else:
            pdb_dict[dynid] = fetched

In [3]:
# Keep only the text before the first "." of every value (drops the chain part)
pdb_dict = {
    dynid: namechain.partition(".")[0]
    for dynid, namechain in pdb_dict.items()
}


In [None]:
# Persist pdb_dict (dynid -> PDB code) as a pickle file
with open("data/interim/dynid_to_pdb.pickle", "wb") as pickle_file:
    pickle.dump(pdb_dict, pickle_file)

In [26]:
# Display the dynid -> PDB code mapping
pdb_dict

{6: '2R4R',
 19: '2Y02',
 9: '2R4R',
 10: '2R4R',
 8: '4N6H',
 15: '2Y00',
 4: '4N6H',
 20: '2Y02',
 7: '3EML',
 16: '2Y00',
 38: '3RFM',
 25: '2Y03',
 22: '5U09',
 40: '3UZA',
 26: '2Y03',
 31: '2YDO',
 29: '2Y04',
 33: '2YDV',
 36: '3REY',
 46: '4UHR',
 57: '2YCW',
 55: '2Y04',
 48: '5G53',
 42: '3UZC',
 58: '4BVN',
 50: '5IU4',
 44: '4EIY',
 67: '4MBS',
 60: '4DJH',
 83: '3D4S',
 80: '4PY0',
 76: '4PHU',
 89: '4RWS',
 90: '4IAR',
 86: '4RWS',
 71: '4GRV',
 70: '4EA3',
 69: '4MQS',
 78: '4PXZ',
 94: '4IB4',
 99: '3NY9',
 102: '3ODU',
 112: '3UON',
 103: '3ODU',
 115: '4LDL',
 117: '4LDO',
 106: '3PBL',
 109: '3RZE',
 119: '4MBS',
 120: '4ZUD',
 121: '4LDE',
 123: '3NY8',
 126: '3RZE',
 129: '3VW7',
 130: '3VW7',
 124: '3NYA',
 127: '3V2Y',
 146: '4ZUD',
 134: '3ZPQ',
 160: '5JQH',
 164: '5U09',
 149: '5A8E',
 140: '4AMI',
 150: '5A8E',
 145: '4N6H',
 133: '3ZPQ',
 167: '4AMJ',
 169: '5C1M',
 172: '4S0V',
 173: '5DSG',
 176: '4U16',
 181: '4XNW',
 174: '4U15',
 182: '4XNW',
 168: '5GL

In [4]:
def get_prot_name(pdb):
    """Return the protein name linked from the GPCRdb structure page of ``pdb``.

    Downloads ``https://gpcrdb.org/structure/{pdb}`` and returns the word
    characters captured by the first ``../protein/<name>"`` link in the HTML.

    Args:
        pdb: PDB code interpolated into the structure URL.

    Returns:
        The captured ``<name>`` of the first matching link.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` (``HTTPError`` for
            HTTP error statuses).
        UnicodeDecodeError: If the page is not valid UTF-8.
        IndexError: If the page contains no ``../protein/<name>"`` link.
    """
    url = f"https://gpcrdb.org/structure/{pdb}"
    # The context manager closes the response even when read/decode raises.
    with urllib.request.urlopen(url) as fp:
        mystr = fp.read().decode("utf8")

    # Raw string: "\." and "\w" are regex escapes, not string escapes.
    match = re.search(r'\.\./protein/(\w+)"', mystr)
    if match is None:
        # Same exception type as indexing an empty findall() result, but with
        # a message that says which structure page was the problem.
        raise IndexError(f"no protein link found on the structure page of {pdb}")

    return match.group(1)

In [5]:
def get_bw_from_protname(prot_name):
    """Scrape the ``title='...'`` attributes of a GPCRdb protein page into a dict.

    Downloads ``https://gpcrdb.org/protein/{prot_name}``, collects every
    single-quoted ``title`` attribute, and splits each on whitespace:

    * two tokens  -> ``{first: second}`` (position and Ballesteros id),
    * one token   -> ``{token: token}`` (residue without a Ballesteros id),
    * anything else cannot form a key/value pair and is skipped (previously
      such a title made ``dict()`` raise ``ValueError`` for the whole page).

    Args:
        prot_name: Protein name interpolated into the protein URL.

    Returns:
        Dict built from the titles as described above; a repeated key keeps
        the value of its last occurrence.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` (``HTTPError`` for
            HTTP error statuses).
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    # url of the protein page
    url = f"https://gpcrdb.org/protein/{prot_name}"
    # The context manager closes the response even when read/decode raises.
    with urllib.request.urlopen(url) as fp:
        mystr = fp.read().decode("utf8")

    # Replace newlines with spaces: "." in the pattern below does not match a
    # newline, so a title broken across lines would otherwise be missed.
    mystr = " ".join(mystr.splitlines())

    # strings holding position and Ballesteros information
    matches = re.findall("title='(.+?)'", mystr)

    bw_by_position = {}
    for title in matches:
        tokens = title.split()
        if len(tokens) == 2:
            position, bw_id = tokens
        elif len(tokens) == 1:
            # residues without a BW id use the same string as key and value
            position = bw_id = tokens[0]
        else:
            # not a "<position> [<bw id>]" title; cannot become a dict item
            continue
        bw_by_position[position] = bw_id

    return bw_by_position

In [6]:
# Invert pdb_dict: each value of pdb_dict becomes a key that maps to the list
# of dynids (the original keys) that shared that value
reversed_pdb_dict = {}
for dynid, pdb_code in pdb_dict.items():
    reversed_pdb_dict.setdefault(pdb_code, []).append(dynid)

In [None]:
# PDB code -> dict returned by get_bw_from_protname. The loop that fills it
# skips codes already present, so it only fetches what is still missing.
pdb_to_bw_dict = {}

In [16]:
# Only the PDB codes (the keys) are needed here; the dynid lists are unused.
for pdb in reversed_pdb_dict:

    # Already resolved by an earlier run of this cell
    if pdb in pdb_to_bw_dict:
        continue

    try:
        prot_name = get_prot_name(pdb)
        bw_dict = get_bw_from_protname(prot_name)
    except (urllib.error.URLError, IndexError, ValueError) as e:
        # Report and move on so one bad entry does not abort the whole run:
        # - URLError: network failure; its subclass HTTPError covers HTTP
        #   error statuses such as 500
        # - IndexError: get_prot_name found no protein link on the page
        # - ValueError: the page could not be decoded (UnicodeDecodeError)
        #   or its titles could not be turned into a dict
        print(f"Error with {pdb}: {e}")
        continue

    pdb_to_bw_dict[pdb] = bw_dict


Error with XXXX: HTTP Error 500: Internal Server Error


In [23]:
# format bw_dicts
# Patterns are raw strings: "\." inside a normal string literal is an invalid
# escape sequence, which newer Python versions warn about.
uppercase_letters = re.compile(r"[A-Z]")
dot_and_digits = re.compile(r"\.\d+")

formated_bw_dict = {}
for pdb, bw_dict in pdb_to_bw_dict.items():
    formated_bw_dict[pdb] = {
        # key: uppercase letters removed
        # value: X.YxY -> XxY (every ".<digits>" run removed)
        uppercase_letters.sub("", k): dot_and_digits.sub("", v)
        for k, v in bw_dict.items()
    }


In [24]:
# Display the formatted mapping: per PDB code, cleaned key -> cleaned value
formated_bw_dict

{'2R4R': {'148': '4x40',
  '149': '4x41',
  '150': '4x42',
  '151': '4x43',
  '152': '4x44',
  '153': '4x45',
  '154': '4x46',
  '155': '4x47',
  '156': '4x48',
  '157': '4x49',
  '158': '4x50',
  '159': '4x51',
  '160': '4x52',
  '161': '4x53',
  '162': '4x54',
  '163': '4x55',
  '164': '4x56',
  '165': '4x57',
  '166': '4x58',
  '167': '4x59',
  '168': '4x60',
  '169': '4x61',
  '170': '4x62',
  '171': '4x63',
  '172': '4x64',
  '196': '5x36',
  '197': '5x37',
  '198': '5x38',
  '199': '5x39',
  '200': '5x40',
  '201': '5x41',
  '202': '5x42',
  '203': '5x43',
  '204': '5x44',
  '205': '5x45',
  '206': '5x46',
  '207': '5x461',
  '208': '5x47',
  '209': '5x48',
  '210': '5x49',
  '211': '5x50',
  '212': '5x51',
  '213': '5x52',
  '214': '5x53',
  '215': '5x54',
  '216': '5x55',
  '217': '5x56',
  '218': '5x57',
  '219': '5x58',
  '220': '5x59',
  '221': '5x60',
  '222': '5x61',
  '223': '5x62',
  '224': '5x63',
  '225': '5x64',
  '226': '5x65',
  '227': '5x66',
  '228': '5x67',
  '22

In [25]:
# Save formated_bw_dict as a pickle.
# `pickle` comes from the import cell at the top of the notebook; it is not
# re-imported here so that all dependencies stay declared in one place.
with open("data/interim/formated_bw_dict.pickle", "wb") as f:
    pickle.dump(formated_bw_dict, f)