In [4]:
import pandas as pd
import json
import os
import urllib.request
import numpy as np
from collections import defaultdict
from Bio import AlignIO, SeqIO, pairwise2
import re
import html
from Bio.PDB.DSSP import make_dssp_dict
from Bio import PDB
import pandas as pd 
import subprocess
import mapping_uniprot_pdb
import requests, sys

In [5]:
# Load the local input tables and refresh the data downloaded from GPCRdb

#UniProt ID of all proteins on GPCRdb and their classification
filename_listGPCRdb = "../data/250220_Classification_GPCRdb.xlsx"
listGPCRdb_df = pd.read_excel(filename_listGPCRdb)

#Load the excel file of your protein entry from https://gpcrdb.org/mutational_landscape/ #check for updates?!
filename_mutagenesis = "../data/GPCRdb_variants.xlsx"
mutagenesis_GPCRdb_raw_data = pd.read_excel(filename_mutagenesis)

#Load the excel file with Gprot and Barr coupling data from https://gproteindb.org/signprot/statistics_venn (filtered with own code)
#this info is also on GtoP but not as complete
filename_Gprot = "../data/GproteinDB_table.xlsx"
Gprot_GPCRdb = pd.read_excel(filename_Gprot)
filename_Barr = "../data/BarrDB_table.xlsx"
Barr_GPCRdb = pd.read_excel(filename_Barr)

#chimeric design info
filename_chimeric_designs = "../data/previous_designs.xlsx"
chimeric_design_df = pd.read_excel(filename_chimeric_designs)

#Download all structures on GPCRdb to access the GPCR state
requestURL = "https://gpcrdb.org/services/structure/"
r = requests.get(requestURL)
# raise_for_status() raises requests.HTTPError for any 4xx/5xx answer, i.e. exactly when `r.ok` is False
r.raise_for_status()

structures_chain = json.loads(r.text)
# keep a local copy so the notebook can be re-run offline (see the commented line below)
with open("../data/structures_data.json", "w") as json_file:
    json.dump(structures_chain, json_file, indent=2)
# structures_chain= json.load(open("../data/structures_data.json"))

#Download all endogenous ligands on GPCRdb
requestURL = "https://gpcrdb.org/services/ligands/endogenousligands/"
r = requests.get(requestURL)
r.raise_for_status()

endogenous_ligands = json.loads(r.text)
with open("../data/endogenous_ligands.json", "w") as json_file:
    json.dump(endogenous_ligands, json_file, indent=2)
# endogenous_ligands = json.load(open("../data/endogenous_ligands.json"))

#alignment with all the natural GPCRs
MSA = "../data/MSA_all_mammalian.fasta"

# file with the representative experimental 3D structures (lowest resolution independent of activation state)
# file generated with the notebook "reference_structure.ipynb", updated Feb 2025
with open("../data/250225_representative_structures_exp_pdbID_uniprotID.json") as json_file:
    representative_structures_json = json.load(json_file)

#chimeras
chimeric_entry_data = "../data/all_designs.fasta"
chimeras_record_dict = SeqIO.index(chimeric_entry_data, "fasta")

In [6]:
# UniProt IDs of interest (demonstration purposes).
# The filter on this list is currently disabled below, so every record of the alignment is loaded.
of_interest = ["P20309"]

# Aligned sequence (gaps included) of every natural GPCR, keyed by FASTA record id
naturals_entry_data = "../data/MSA_all_mammalian.fasta"
entry_uniprotID_seq = {}
for record in SeqIO.parse(naturals_entry_data, "fasta"):
    # if record.id in of_interest:
    entry_uniprotID_seq.update({record.id: str(record.seq)})
print(len(entry_uniprotID_seq))
print(entry_uniprotID_seq)

1755
{'Q8NGE7': '--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLGSKPRVHLYILPCASQQVSTMGDRGTSNHSEMTDFILAGFRVRPELHILLFLLFLFVYAMILLGNVGMMTIIMTDPR-----------LNTPMYF-FLGNLSF-IDLFYSSVIEPKAMINFWSEN-----------KSISFAG-CVAQLFLFALLI-VTEGFLLAAMAYDRFIAICNPLLY----------------SVQMSTRLC-TQLVAGSYFCGCISSVIQTSMTFTLSFCASRAVDHFY--------------------------------------------------------------------------------------------------------------------------------------------CDSRPLQRLSCSDLFIHRMISFSLSCIIILPTIIVIIVSYMYIVSTVLKIHS---------------------------------

In [7]:
def _coupling_entry(coupling_table, uniprot_id, reference):
    """Build the one-element coupling annotation of `uniprot_id` from one coupling table."""
    # first row whose 'Uniprot ID' matches; IndexError if the receptor is not in the table
    row = coupling_table[coupling_table['Uniprot ID'] == uniprot_id].iloc[0]
    # every column after the first one holds a partner name or a missing value;
    # pd.isna catches any NaN/None, not only the np.nan singleton
    partners = [str(cell) for cell in row.iloc[1:] if not pd.isna(cell)]
    return [{"value": ", ".join(partners), "reference": reference}]


def coupling_Gprot_Barr(uniprot_id):
    """
    Collect the G protein and beta-arrestin coupling annotations of one receptor.

    The data are read from the module-level tables `Gprot_GPCRdb` and `Barr_GPCRdb`.

    Parameters:
    uniprot_id (str): value looked up in the 'Uniprot ID' column of both tables.

    Returns:
    tuple: (Gprot_coupling_data, Barr_coupling_data). Each is a list holding one dict
    with the keys "value" (non-missing cells of the row joined with ", ") and "reference".

    Raises:
    IndexError: if `uniprot_id` is absent from one of the tables.
    """
    Gprot_coupling_data = _coupling_entry(
        Gprot_GPCRdb, uniprot_id, "https://gproteindb.org/signprot/statistics_venn")
    Barr_coupling_data = _coupling_entry(
        Barr_GPCRdb, uniprot_id, "https://arrestindb.org/signprot/arrestin_venn")

    return Gprot_coupling_data,Barr_coupling_data

In [8]:
# One-letter codes of the 20 standard amino acids
_TYPICAL_AA = "AVLIPMCFYWSTQNHKRDEG"

# Consequence types whose "alternative residue" is a fixed label instead of a value read from the feature
_FIXED_ALTERNATIVE_RESIDUE = {
    "stop gained": "termination",
    "inframe deletion": "missing",
    "frameshift": "",
}

# Errors raised when an optional annotation is absent or does not have the expected shape
_ANNOTATION_ERRORS = (KeyError, IndexError, TypeError, AttributeError)


def _clinical_significance_effect(feature, consensus):
    """Derive an effect entry from the first clinical significance of `feature`.

    Appends "+" (tolerated) or "-" (deleterious) to `consensus` when the annotation
    is conclusive and returns the effect dict (empty predictor and value).
    """
    effect_entry = {"predictor": "", "value": ""}
    try:
        effect = feature["clinicalSignificances"][0]["type"].lower()
    except _ANNOTATION_ERRORS:
        effect_entry["prediction"] = "not indicated"
        return effect_entry

    if "benign" in effect or "tolerated" in effect:
        effect_entry["prediction"] = "tolerated"
        consensus.append("+")
    elif "deleterious" in effect or "damaging" in effect:
        effect_entry["prediction"] = "deleterious"
        consensus.append("-")
    else:
        effect_entry["prediction"] = "not indicated"
    return effect_entry


def _append_predictor_effects(feature, predictions, consensus):
    """Append one effect entry per PolyPhen/SIFT prediction of a missense `feature`.

    Other algorithms are ignored. Any missing key or non-numeric score raises one of
    `_ANNOTATION_ERRORS`; entries appended before the error are kept.
    """
    for predictor in feature["predictions"]:
        algorithm = predictor["predAlgorithmNameType"].lower()
        score = predictor["score"]
        if algorithm == "polyphen":
            # NOTE(review): cut-offs kept as they were; confirm them against the PolyPhen documentation
            if score >= 0.2:
                label, vote = "probably damaging", "-"
            elif score >= 0.1:
                label, vote = "possibly damaging", "-"
            else:
                label, vote = "benign", "+"
            predictions.append({"predictor": "polyphen", "value": str(score), "prediction": label})
            consensus.append(vote)
        elif algorithm == "sift":
            if score <= 0.05:
                label, vote = "deleterious", "-"
            else:
                label, vote = "tolerated", "+"
            predictions.append({"predictor": "SIFT", "value": str(score), "prediction": label})
            consensus.append(vote)


def _consensus_label(consensus):
    """Summarise the "+"/"-" votes: unanimous votes give a label, anything else "no consensus"."""
    votes = set(consensus)
    if votes == {"+"}:
        return "tolerated"
    if votes == {"-"}:
        return "deleterious"
    return "no consensus"


def retrieve_mutagenesis_info_Uniprot(variations_uniprot_json,uniprot_id):
    """
    Convert the variation features of a UniProt entry into mutation annotations.

    Only missense, stop gained, inframe deletion, frameshift and stop lost variants are
    kept; features whose wild type is not found in `_TYPICAL_AA` are skipped.

    Parameters:
    variations_uniprot_json (dict): variation record with a "features" list. Every feature
        must provide "begin", "end", "consequenceType" and "wildType" (plus "mutatedType"
        for missense and stop lost variants); "predictions" and "clinicalSignificances"
        are optional.
    uniprot_id (str): accession used to build the reference URL.

    Returns:
    list: one dict per retained variant with the keys "start", "end", "type",
    "original residue", "alternative residue", "effect(s)", "consensus effect" and "reference".

    Raises:
    KeyError / ValueError: if a mandatory field is missing or a position is not an integer.
    """
    mutations_Uniprot = []
    for feature in variations_uniprot_json["features"]:
        mutation = {
            "start": int(feature["begin"]),
            "end": int(feature["end"]),
            "type": feature["consequenceType"],
            "original residue": feature["wildType"],
        }
        # NOTE(review): this is a substring test, so a multi-residue wild type such as "LI"
        # (or an empty one) is accepted when it happens to occur in the string; confirm intent
        if mutation["original residue"] not in _TYPICAL_AA:
            continue

        mutation_type = mutation["type"].lower()
        predictions = []
        consensus = []
        if mutation_type == "missense":
            mutation["alternative residue"] = feature["mutatedType"]
            try:
                _append_predictor_effects(feature, predictions, consensus)
            except _ANNOTATION_ERRORS:
                # no usable predictor score: fall back on the clinical significance
                predictions.append(_clinical_significance_effect(feature, consensus))
        elif mutation_type in _FIXED_ALTERNATIVE_RESIDUE:
            mutation["alternative residue"] = _FIXED_ALTERNATIVE_RESIDUE[mutation_type]
            predictions.append(_clinical_significance_effect(feature, consensus))
        elif mutation_type == "stop lost":
            mutation["alternative residue"] = feature["mutatedType"]
            predictions.append(_clinical_significance_effect(feature, consensus))
        else: #skip if it is not a missense, frameshift, stop gained, stop lost, inframe deletion
            continue

        mutation["effect(s)"] = predictions
        mutation["consensus effect"] = _consensus_label(consensus)
        mutation["reference"] = f"https://www.uniprot.org/uniprotkb/{uniprot_id}/variant-viewer"
        mutations_Uniprot.append(mutation)

    return mutations_Uniprot

In [9]:
def retrieve_pharmaco_info_GPCRdb(pharmaco_data_json):
    """
    Reshape mutant pharmacology records into annotation dicts.

    Parameters:
    pharmaco_data_json (iterable of dict): records providing "mutation_pos", "mutation_from",
        "mutation_to", "exp_type", "ligand_id", "ligand_name", "exp_fold_change" and "reference".

    Returns:
    list: one dict per record with the keys "start", "end", "original residue",
    "alternative residue", "studied parameter", "ligand", "link ligand", "ligand type",
    "effect" and "reference".
    """
    records = []
    for entry in pharmaco_data_json:
        position = entry["mutation_pos"]
        annotation = {
            "start": position,
            "end": position,
            "original residue": entry["mutation_from"],
            "alternative residue": entry["mutation_to"],
            "studied parameter": entry["exp_type"],
        }

        ligand_id = entry["ligand_id"]
        annotation["ligand"] = entry["ligand_name"]
        if "CHEMBL" in ligand_id: #chembl
            ligand_link = f"https://www.ebi.ac.uk/chembl/compound_report_card/{ligand_id}/"
        elif ligand_id.isnumeric(): #pubchem
            ligand_link = f"https://pubchem.ncbi.nlm.nih.gov/compound/{ligand_id}"
        else:
            ligand_link = ""
        annotation["link ligand"] = ligand_link
        annotation["ligand type"] = ""

        # a negative fold change is reported as an increase, shown with its sign removed
        fold_change = round(entry["exp_fold_change"], 1)
        if fold_change < 0:
            direction, fold_change = "increase", fold_change * -1
        else:
            direction = "decrease"
        annotation["effect"] = "None" if fold_change == 0.0 else str(fold_change) + " fold " + direction

        annotation["reference"] = entry["reference"]
        records.append(annotation)
    return records

In [10]:
def calculate_sq_atom_distance(i, j):
    """Return the squared euclidean distance between the 3d points `i` and `j`."""
    dx = i[0] - j[0]
    dy = i[1] - j[1]
    dz = i[2] - j[2]
    return dx * dx + dy * dy + dz * dz

def identify_gaps(pdb_file, chain_pdb, offset, end): #code modified from pdb_gap.py file from pdbtools Copyright 2018 João Pedro Rodrigues
    """
    Split one chain of a PDB file into the contiguous segments separated by gaps.

    A gap is recorded between two consecutive CA atoms of the same model and chain when
    they are more than 4 Å apart or when their residue numbers are not consecutive.

    Parameters:
    pdb_file (str): path to a PDB-format file.
    chain_pdb (str): chain identifier whose gaps are reported.
    offset (int): start of the first segment.
    end (int): end of the last segment.

    Returns:
    list: [start, stop] pairs; the first pair starts at `offset` and the last one ends at `end`.
    """
    centroid = ' CA '  # respect spacing. 'CA  ' != ' CA '
    distance_threshold = 4.0 * 4.0  # compared against squared distances
    prev_at = None
    model = 0
    gap = []
    # the context manager guarantees the file is closed, also when a malformed line raises
    with open(pdb_file, 'r') as fhandle:
        for line in fhandle:

            if line.startswith('MODEL'):
                model = int(line[10:14])

            elif line.startswith('ATOM'):
                atom_name = line[12:16]
                if atom_name != centroid:
                    continue

                resn = line[17:20]
                resi = int(line[22:26])
                chain = line[21]
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])

                at_uid = (model, chain, resi, resn, atom_name, (x, y, z))
                #Detects gaps both by a distance criterion or discontinuous residue numbering. Only applies to protein residues.
                if prev_at is not None and prev_at[0] == at_uid[0] and prev_at[1] == at_uid[1]:
                    d = calculate_sq_atom_distance(at_uid[5], prev_at[5])
                    if d > distance_threshold or prev_at[2] + 1 != at_uid[2]:
                        gap.append([prev_at[1],prev_at[2],at_uid[1],at_uid[2]])

                prev_at = at_uid

    gaps_cleaned = []
    start = offset
    for section in gap:
        if section[0] == chain_pdb and section[2] == chain_pdb:
            stop = section[1]
            # NOTE(review): segments touching residue numbers >= 1000 are dropped;
            # presumably those numbers belong to inserted fusion partners - confirm
            if start < 1000 and stop < 1000:
                gaps_cleaned.append([start,stop])
            start = section[3]
    gaps_cleaned.append([start,end])
    return gaps_cleaned

In [None]:
def merge_duplicates(dicts):
    """
    Merge the entries that share the same 'start' value.

    Entries with a unique 'start' are passed through unchanged. Entries sharing a 'start'
    and a single 'type' are merged into a copy of the first one: the 'description' values
    are concatenated (the last character of the result is dropped) and 'reference' is set
    to the PISA URL. Entries sharing a 'start' but not the 'type' are reported as conflicts.

    Returns:
    tuple: (result, conflicts), two lists of dicts.
    """
    by_start = {}
    for entry in dicts:
        by_start.setdefault(entry['start'], []).append(entry)

    result, conflicts = [], []
    for group in by_start.values():
        if len(group) <= 1:
            result.append(group[0])
            continue

        if len({entry['type'] for entry in group}) != 1:
            # same position annotated with different types: leave the decision to the caller
            conflicts.extend(group)
            continue

        joined_descriptions = "".join(entry['description'] for entry in group)
        merged = group[0].copy()
        merged['description'] = joined_descriptions[:-1]
        merged['reference'] = "https://www.ebi.ac.uk/pdbe/pisa/"
        result.append(merged)

    return result, conflicts

def retrieve_interacting_residues_PDB(pdb_id, chain_pdb,mapping_uniprot_PDB_dict,pdb_file_path):
    """
    Collect, per PISA interface, the receptor residues buried in that interface.

    Queries the PDBe PISA API for assembly 1 of `pdb_id`, then every interface of that
    assembly. For molecules matching `chain_pdb`, residues with a buried surface area
    > 0 are translated through the inverse of `mapping_uniprot_PDB_dict`; residues that
    cannot be translated are skipped.

    Parameters:
    pdb_id (str): PDB identifier (lower-cased for the API calls).
    chain_pdb (str): chain identifier; a value containing "/" is split into a list of chains.
    mapping_uniprot_PDB_dict (dict): mapping that is inverted before use
        (presumably UniProt position -> PDB residue number, as the name suggests - confirm).
    pdb_file_path (str): PDB file passed to `extract_name_binders` to name the partner chain.

    Returns:
    tuple: (interacting_residues_list, binders_chain) - a list of residue lists and the
    list of partner names returned by `extract_name_binders`, in matching order.
    """
    # inverse of the mapping received as argument (values become keys)
    mapping_PDB_uniprot = {v: k for k, v in mapping_uniprot_PDB_dict.items()} #gives position of a aligned res in unaligned seq

    # retrieve the interacting residues in the PDBs from PISA, need to make sure it doesn't take into account the interactions between 2 sym GPCRs
    # interacting residues is defined by a bsa > 0
    #https://github.com/PDBe-KB/pdbe-pisa-json/blob/main/PISA-APIs.ipynb

    interacting_residues_list = []
    binders_chain= []
    # for pdb_id, chain_pdb,uniprot_pdb_start,pdb_start in zip(pdb_ids,chain_pdbs,uniprot_pdb_starts,pdb_starts):
    # try: #when its just 1 chain or 1 chain and a ligand PISA doesn't work

    # NOTE(review): always-true placeholder that stands in for the commented-out `try:` above
    if "a"=="a":

        # NOTE(review): neither request has a timeout or a status check; a failed call surfaces as an exception here
        response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/pisa/assembly/{pdb_id.lower()}/1")
        interface_count = response.json()[pdb_id.lower()]["assembly"]["interface_count"]
        for i in range(1,interface_count+1):
            # residues of the current interface only
            interacting_residues = []
            response_single_interface = requests.get(f"https://www.ebi.ac.uk/pdbe/api/pisa/interface/{pdb_id.lower()}/1/{i}/")
            data = response_single_interface.json()
            # a string such as "A/B" becomes ["A", "B"]; once chain_pdb is a list this test
            # checks list membership instead
            if "/" in chain_pdb:
                chain_pdb = chain_pdb.split("/")
            for j in range(len(data["molecules"])):
                if isinstance(chain_pdb,str):
                    if data["molecules"][j]["chain_id"]==chain_pdb:
                        for bsa,position in zip(data["molecules"][j]["buried_surface_areas"],data["molecules"][j]['residue_seq_ids']):
                            if bsa > 0.0:
                                try:
                                    interacting_residues.append(mapping_PDB_uniprot[int(position)])
                                except:
                                    # residue without an entry in the mapping (or non-integer id): ignored
                                    continue
                        if j == 0: #there is supposed to be only 2 molecules, the GPCR and the interacting molecule
                            chain_interacting_molecule = data["molecules"][1]["chain_id"]
                        else:
                            chain_interacting_molecule = data["molecules"][0]["chain_id"]
                        binders_chain.append(extract_name_binders(chain_interacting_molecule,pdb_file_path))
                        interacting_residues_list.append(list(set(interacting_residues)))

                elif isinstance(chain_pdb,list):
                    # skip the interface when both receptor chains face each other
                    # NOTE(review): `j+1` is out of range when j is the last molecule and chain_pdb[0]
                    # matches it (IndexError); the reversed order of the two chains is not tested
                    if chain_pdb[0] in data["molecules"][j]["chain_id"] and chain_pdb[1] in data["molecules"][j+1]["chain_id"]:
                        break
                    else:
                        if chain_pdb[0] in data["molecules"][j]["chain_id"] or chain_pdb[1] in data["molecules"][j]["chain_id"]:
                            for bsa,position in zip(data["molecules"][j]["buried_surface_areas"],data["molecules"][j]['residue_seq_ids']):
                                if bsa >0.0:
                                    try:
                                        interacting_residues.append(mapping_PDB_uniprot[int(position)])
                                    except:
                                        continue
                        # NOTE(review): unlike the single-chain branch, the lines below run for every
                        # molecule j, whether or not it matched one of the receptor chains - confirm intent
                        if j == 0: #there is supposed to be only 2 molecules, the GPCR and the interacting molecule
                            chain_interacting_molecule = data["molecules"][1]["chain_id"]
                        else:
                            chain_interacting_molecule = data["molecules"][0]["chain_id"]
                        binders_chain.append(extract_name_binders(chain_interacting_molecule,pdb_file_path))
                        interacting_residues_list.append(list(set(interacting_residues)))

        return interacting_residues_list,binders_chain
    # except:
    #     return [],""


def extract_name_binders(chain_of_interest,pdb_file_path):
    """
    Return the molecule name that the COMPND records of a PDB file give for one chain.

    Parameters:
    chain_of_interest (str): chain identifier searched in the "CHAIN:" fields.
    pdb_file_path (str): path to a PDB-format file.

    Returns:
    str or None: text of the "MOLECULE:" field (continuation lines joined with a space)
    preceding the matching "CHAIN:" field, or None when the chain is not listed.
    """
    molecule_name = None
    current_molecule = ""
    reading_molecule = False
    found_chain = False

    # the chain id is escaped so that it is always matched literally
    chain_pattern = re.compile(rf"CHAIN:\s*(?:[^,]*,\s*)*{re.escape(chain_of_interest)}\b")
    # continuation lines look like "COMPND   3 <text>": keep what follows the counter
    continuation_pattern = re.compile(r"\d+\s+(.+)")

    # Open and read the PDB file
    with open(pdb_file_path, 'r') as file:
        for line in file:
            if not line.startswith("COMPND"):
                continue
            # Start reading the molecule name if "MOLECULE" is in the line
            if "MOLECULE" in line:
                reading_molecule = True
                current_molecule = line.split(":")[1].strip().rstrip(";")  # Extract initial part of the molecule name
            # If the molecule name is being read and it continues on the next line
            elif reading_molecule and "CHAIN" not in line:
                match = continuation_pattern.search(line)
                # a COMPND line without the expected "<counter> <text>" layout carries nothing to append
                if match is not None:
                    current_molecule += " "+match.group(1).strip().rstrip(";")
            # Once we reach the chain of interest
            if chain_pattern.search(line):
                found_chain = True
            # If molecule and chain have been found, stop reading
            if found_chain and current_molecule and ";" in line:
                molecule_name = current_molecule
                break
    return molecule_name


In [12]:
from Bio import PDB

def extract_resolution_and_method(file_path):
    """
    Extracts resolution and experimental method from a PDB or mmCIF file.

    Parameters:
    file_path (str): Path to the structure file (.pdb or .cif).

    Returns:
    tuple: (resolution, experimental_method), both strings. The resolution is formatted
    as "<value>Å", or "" for NMR structures and when the header reports no resolution.
    The method is abbreviated ("EM", "X-RAY", "NMR") when it is a known one; any other
    method is returned upper-cased as found in the header.

    Raises:
    ValueError: if the file extension is neither .pdb nor .cif.
    """
    method_abb = {"ELECTRON MICROSCOPY":"EM","X-RAY DIFFRACTION":"X-RAY","SOLUTION NMR":"NMR", "SOLID-STATE NMR":"NMR",
                  "ELECTRON CRYSTALLOGRAPHY":"EM"}

    if file_path.endswith(".cif"):
        parser = PDB.MMCIFParser(QUIET=True)
    elif file_path.endswith(".pdb"):
        parser = PDB.PDBParser(QUIET=True)
    else:
        raise ValueError("Unsupported file format. Please provide a .pdb or .cif file.")

    structure = parser.get_structure("structure", file_path)

    # Extracting metadata from the structure header
    header = structure.header

    # an unlisted method (other techniques, several methods, missing value) is reported
    # as is instead of raising a bare KeyError
    reported_method = (header.get("structure_method") or "Not available").upper()
    experimental_method = method_abb.get(reported_method, reported_method)

    # the header entry can be absent or None; str(None)+"Å" would otherwise give "NoneÅ"
    reported_resolution = header.get("resolution")
    if experimental_method == "NMR" or reported_resolution is None:
        resolution = ""
    else:
        resolution = str(reported_resolution)+"Å"

    return resolution,experimental_method

In [None]:
def retrieve_pdb_dsbonds_interactions(uniprot_json,uniprot_id,sequence):
    """
    Gather the experimental structures of a UniProt entry and their interface residues.

    For every 'PDB' cross-reference of `uniprot_json` covering more than 200 residues,
    the PDB file is downloaded to ../data/tmp/, the conformational state is looked up in
    the module-level `structures_chain`, the UniProt<->PDB mapping is loaded from (or
    written to) ../examples/3Dstructures/uniprot_pdb_mapping/, and the interacting
    residues are retrieved with `retrieve_interacting_residues_PDB`. The downloaded file
    is removed afterwards. Any exception raised while handling a cross-reference makes
    the function move on to the next one.

    Parameters:
    uniprot_json (dict): UniProt entry with a 'uniProtKBCrossReferences' list.
    uniprot_id (str): accession passed on to the mapping helper.
    sequence (str): sequence passed on to the mapping helper.

    Returns:
    tuple: (structures, interacting_residues_list, binders_list, pdbs).
    """
    structures = []
    # NOTE(review): ds_bonds, chains, uniprot_pdb_starts and pdb_starts are never filled or returned
    ds_bonds = []
    pdbs = []
    chains = []
    uniprot_pdb_starts = []
    pdb_starts = []
    interacting_residues_list = []
    binders_list= []
    for i in range(len(uniprot_json['uniProtKBCrossReferences'])):
        if uniprot_json['uniProtKBCrossReferences'][i]['database'] == 'PDB':
            try:
                pdb_id = uniprot_json['uniProtKBCrossReferences'][i]['id']
                length_chain = 0
                # the third property is parsed as "<chains>=<first>-<last>" sections separated by ","
                if ',' in uniprot_json['uniProtKBCrossReferences'][i]["properties"][2]["value"]: #when there is a "," it means that there are multiple fragments, let's assume it's longer than 200 residues then
                    sections = uniprot_json['uniProtKBCrossReferences'][i]["properties"][2]["value"].split(",")
                else:
                    sections = [uniprot_json['uniProtKBCrossReferences'][i]["properties"][2]["value"]]
                for j in range(len(sections)):
                    range_chain=sections[j].split("=")[1].split("-")
                    # last - first, summed over all sections
                    length_chain += int(range_chain[1])-int(range_chain[0])
                if length_chain > 200:
                    #download pdb file
                    if not os.path.exists('../data/tmp/'):
                        os.mkdir('../data/tmp/')
                    try:
                        mmcif=False
                        pdb_file_path = f'../data/tmp/{pdb_id}.pdb'
                        urllib.request.urlretrieve(f'https://files.rcsb.org/download/{pdb_id}.pdb', pdb_file_path)
                    except:
                        # no PDB-format file could be fetched: the entry is skipped by the test below,
                        # since the mmCIF download is commented out
                        mmcif=True
                        # pdb_file_path = f'../data/tmp/{pdb_id}.cif'
                        # urllib.request.urlretrieve(f'https://files.rcsb.org/download/{pdb_id}.cif', pdb_file_path)

                    if not mmcif:
                        #get conformational state structure => provided by GPCRdb
                        state = "Undetermined" #default
                        for structure in structures_chain:
                            if structure["pdb_code"] == pdb_id:
                                state = structure["state"]
                                break
                        # first character of the property value is used as chain identifier
                        chain_pdb =  uniprot_json['uniProtKBCrossReferences'][i]["properties"][2]["value"][0]
                        # full_chain_pdb = uniprot_json['uniProtKBCrossReferences'][i]["properties"][2]["value"].split("=")[0]

                        #get uniprot to pdb mapping
                        folder_mapping_json = "../examples/3Dstructures/uniprot_pdb_mapping/"
                        mapping_file = folder_mapping_json+pdb_id+".json"
                        if not os.path.exists(folder_mapping_json+pdb_id+".json"):
                            mapping_uniprot_pdb_dict= mapping_uniprot_pdb.map_PDB_uniprot(pdb_id,pdb_file_path,chain_pdb,uniprot_id,sequence,folder_mapping_json, type_gpcr = "natural")
                        else:
                            mapping_uniprot_pdb_dict = json.load(open(mapping_file))
                            mapping_uniprot_pdb_dict = {int(k): v for k, v in mapping_uniprot_pdb_dict.items()} #keys are strings


                        #this is needed to find the interactions within the pdb file
                        # NOTE(review): appended before the steps below, so a later exception leaves the id
                        # in `pdbs` without a matching entry in `structures` (and the file is not removed)
                        pdbs.append(pdb_id)

                        resolution,method =  extract_resolution_and_method(pdb_file_path)
                        # NOTE(review): mmcif is always False inside this `if not mmcif:` block
                        if mmcif:
                            url = f"https://files.rcsb.org/download/{pdb_id}.cif"
                        else:
                            url = f"https://files.rcsb.org/download/{pdb_id}.pdb"

                        structures.append({"offset":  0, "gaps": [],"value":pdb_id,"chain": chain_pdb, "state":state, "mapping": f"file:///examples/3Dstructures/uniprot_pdb_mapping/{pdb_id}.json", "resolution": resolution, "method": method, "url":url, "reference":f"https://www.rcsb.org/structure/{pdb_id.upper()}", "date":""})

                        #find interacting residues at ligand binding site and G protein binding site
                        interacting_residues,binders = retrieve_interacting_residues_PDB(pdb_id,chain_pdb,mapping_uniprot_pdb_dict,pdb_file_path)
                        if len(interacting_residues)>0:
                            interacting_residues_list.append(interacting_residues)
                            binders_list.append(binders)

                        #remove pdb file
                        os.remove(pdb_file_path)
            except:
                # any failure for this cross-reference is silently ignored
                continue

            else:
                # runs only when the `try` body finished WITHOUT an exception, so the ids logged
                # here are the ones that were handled (or skipped for length / missing PDB file)
                # NOTE(review): the file name suggests failures were meant to be logged - confirm
                with open("to_look_at_pdbs.txt","a") as f:
                    f.write(pdb_id+"\n")
                    f.close()

    return structures, interacting_residues_list, binders_list, pdbs

In [14]:
#GPCRdb finds sodium pockets
#As microswitches are well defined in literature, we can check ourselves whether these well-known microswitches are present in our GPCRs
def _motif(positions, residues, name):
    """Bundle a motif definition: positions, accepted residues per position, display name."""
    return {"positions": positions, "residues": residues, "name": name}


#All known microswitches in literature for class A (positions given as generic residue numbers)
E_DRY_W = _motif(["3.49", "3.50", "3.51"], ["ED", "R", "WY"], "E/DRY/W motif (ionic lock switch)")
CWxP = _motif(["6.47", "6.48", "6.50"], ["C", "W", "P"], "CWxP motif (transmission toggle switch)")
NPxxY = _motif(["7.49", "7.50", "7.53"], ["N", "P", "Y"], "NPxxY motif (tyr toggle switch)")
PIF = _motif(["5.50", "3.40", "6.44"], ["P", "I", "F"], "PIF motif")
hydrophobic_lock = _motif(["3.43", "6.40"], ["LVIM", "LVIM"], "hydrophobic lock")
# ionic_lock = {"positions":["6.30"], "residues":["DE"], "name": "ionic lock"}
#disulfide bond between TM3 and ECL2 is already identified by Uniprot in the "Disulfide bonds" section
#Sodium binding pocket (allosteric action): middle of the 7TMs. Identified by GPCRdb but are the identified ones all of them???

#the positions below are MSA columns in human-readable format (1-based, unlike Python indices which start at 0)
MSA_E_DRY_W = _motif([684, 685, 686], ["ED", "R", "WY"], "E/DRY/W motif (ionic lock switch)")
MSA_CWxP = _motif([1191, 1192, 1194], ["C", "W", "P"], "CWxP motif (transmission toggle switch)")
MSA_NPxxY = _motif([1265, 1266, 1269], ["N", "P", "Y"], "NPxxY motif (tyr toggle switch)")
MSA_PIF = _motif([930, 675, 1187], ["P", "I", "F"], "PIF motif")
MSA_hydrophobic_lock = _motif([678, 1183], ["LVIM", "LVIM"], "Hydrophobic lock")
# MSA_ionic_lock = {"positions":[1190], "residues":["DE"], "name": "Ionic lock"}
# MSA_sodium_pocket = {"positions":[620,684], "residues":["D","S"], "name": "Sodium binding pocket"}

#MSA column and expected residue of the x.50 reference position of each transmembrane helix
TM1x50 = _motif([579], ["N"], "1.50 (BW numbering)")
TM2x50 = _motif([620], ["D"], "2.50 (BW numbering)")
TM3x50 = _motif([685], ["R"], "3.50 (BW numbering)")
TM4x50 = _motif([729], ["W"], "4.50 (BW numbering)")
TM5x50 = _motif([930], ["P"], "5.50 (BW numbering)")
TM6x50 = _motif([1194], ["P"], "6.50 (BW numbering)")
TM7x50 = _motif([1266], ["P"], "7.50 (BW numbering)")

#equivalence between positions in sequence and in MSA
def map_seq_MSA(sequence_aligned):
    """
    Map every residue of an aligned sequence to its column in the alignment.

    Parameters:
    sequence_aligned (str): one row of the MSA, with "-" marking gaps.

    Returns:
    dict: {position in the ungapped sequence: column in the MSA}, both 1-based.
    """
    translate = {}
    residue_number = 0
    # single pass over the columns; no slice of the aligned string is rebuilt per residue
    for column, symbol in enumerate(sequence_aligned, start=1):
        if symbol == "-":
            continue
        residue_number += 1
        translate[residue_number] = column
    return translate

#Microswitches/motifs - identify them based on their defined columns in mammalian MSA
def motifs_microswitches_literature(MSA,uniprot_id):
    """Describe the literature motifs/microswitches of one receptor.

    For every column listed in the motif and x.50 definitions, the residue the
    receptor carries at that MSA column is compared with the residues accepted
    there.

    Parameters
    ----------
    MSA : str
        Path of the alignment in FASTA format.
    uniprot_id : str
        Identifier of the receptor's record in that alignment.

    Returns
    -------
    list of dict
        One dict per motif column with the keys "start" and "end" (position in
        the ungapped sequence, counted from 1), "description", "conserved"
        ("yes"/"no") and "reference". When the receptor has a gap at the motif
        column, the next aligned residue towards the C-terminus is reported and
        the column counts as not conserved. A motif column with no aligned
        residue at or after it is left out of the list.
    """
    # AlignIO.read takes the path itself, so no file handle is left open.
    len_MSA = AlignIO.read(MSA, "fasta").get_alignment_length()
    record_dict = SeqIO.index(MSA, "fasta")
    try:
        aligned_seq_interest = str(record_dict[uniprot_id].seq)
    finally:
        record_dict.close()
    translate_seq_MSA = map_seq_MSA(aligned_seq_interest) #gives position of a unaligned res in msa
    translate_MSA_seq = {v: k for k, v in translate_seq_MSA.items()} #gives position of a aligned res in unaligned seq
    microswitch_types = [MSA_E_DRY_W, MSA_CWxP, MSA_NPxxY, MSA_PIF, MSA_hydrophobic_lock,
                         TM1x50,TM2x50,TM3x50,TM4x50,TM5x50,TM6x50,TM7x50]

    microswitches_residues = []

    for microswitch_type in microswitch_types:
        motif_name = microswitch_type["name"]
        for position, residue in zip(microswitch_type["positions"], microswitch_type["residues"]):
            # A gap ("-") is never among the accepted residues, so it counts as not conserved.
            is_conserved = aligned_seq_interest[position-1] in residue

            # Take the motif column itself or, if the receptor has a gap there, the
            # first later column that holds a residue. Columns are numbered from 1,
            # so the last one is len_MSA and has to be included in the search.
            column = next((col for col in range(position, len_MSA+1) if col in translate_MSA_seq), None)
            if column is None:
                # Nothing is aligned from this column onwards: there is no residue to report.
                continue
            residue_motif = aligned_seq_interest[column-1]

            if not is_conserved:
                if residue_motif == "F" and motif_name == "PIF motif":
                    # NOTE(review): the dangling " from " looks like a leftover; kept so
                    # that the generated text does not change.
                    description = residue_motif + " instead of "+ residue+ " from " + " (part of " + motif_name+ ")"
                elif "(BW numbering)" in motif_name:
                    description = residue_motif+ " instead of "+ residue + " " + motif_name
                else:
                    description = residue_motif + " instead of "+ residue + " (part of " + motif_name+ ")"
            else:
                if residue_motif == "F" and motif_name == "PIF motif":
                    description = residue_motif + " part of " + motif_name +" and hydrophobic lock"
                elif residue_motif == "R" and motif_name == "E/DRY/W motif (ionic lock switch)":
                    description = residue_motif+ " part of " + motif_name +" and ionic lock"
                elif "(BW numbering)" in motif_name:
                    description = residue_motif+ " " + motif_name
                else:
                    description = residue_motif+ " part of " + motif_name

            microswitches_residues.append({
                "start": translate_MSA_seq[column],
                "end": translate_MSA_seq[column],
                "description": description,
                "conserved": "yes" if is_conserved else "no",
                "reference": "Based on alignment",
            })

    return microswitches_residues

def computeBW_numbering(uniprotID,all_regions,MSA):
    """Assign a BW label ("<helix>.<number>") to the residues of every TM region.

    The residue aligned with the x.50 column of a helix gets number 50; numbers
    decrease towards the start of the region and increase towards its end.

    Parameters
    ----------
    uniprotID : str
        Identifier of the receptor's record in the alignment.
    all_regions : list of dict
        Regions with the keys "name", "start" and "end". Only regions whose name
        contains "TM" are numbered, in the order in which they appear.
    MSA : str
        Path of the alignment in FASTA format.

    Returns
    -------
    dict
        ``{position in the ungapped sequence (from 1): label}``; residues
        without a label get "N/A".
    """
    aligned_seq_interest = str(SeqIO.to_dict(SeqIO.parse(MSA,"fasta"))[uniprotID].seq)
    translate_seq_MSA = map_seq_MSA(aligned_seq_interest) #gives position of a unaligned res in msa
    translate_MSA_seq = {v: k for k, v in translate_seq_MSA.items()} #gives position of a aligned res in unaligned seq

    BW_conserved_positions = [TM1x50,TM2x50,TM3x50,TM4x50,TM5x50,TM6x50,TM7x50]

    uniprot_BW_mapping = {position: "" for position in range(1, len(translate_seq_MSA) + 1)}
    counter = 1

    for TMname in all_regions:
        if "TM" not in TMname["name"]:
            continue

        position_50 = BW_conserved_positions[counter-1]["positions"][0]
        numberTM = str(counter)+"."
        counter += 1

        # Use the x.50 column itself, else the column to its right, else the one to its left.
        position_50_seq = None
        for candidate in (position_50, position_50+1, position_50-1):
            if candidate in translate_MSA_seq:
                position_50_seq = translate_MSA_seq[candidate]
                break
        if position_50_seq is None:
            print('This entry has no residue that align with most conserved residue of TM, neither one left or right of it, weird')
            # Without an anchor this helix cannot be numbered; its residues stay "N/A"
            # instead of being labelled relative to another helix's anchor.
            continue

        uniprot_BW_mapping[position_50_seq] += numberTM+"50"

        # Towards the start of the region: 49, 48, ...
        for residue in range(position_50_seq-1, TMname["start"]-1, -1):
            uniprot_BW_mapping[residue] += numberTM+str(50-(position_50_seq-residue))

        # Towards the end of the region: 51, 52, ...
        for residue in range(position_50_seq+1, TMname["end"]+1):
            uniprot_BW_mapping[residue] += numberTM+str(50+(residue-position_50_seq))

    for pos, value in uniprot_BW_mapping.items():
        if len(value)==0:
            uniprot_BW_mapping[pos]="N/A"
    return uniprot_BW_mapping

In [15]:
#features from UniProt: binding sites, PTMs and disulfide bonds
def features_uniprot(uniprot_json,uniprot_id):
    """Collect binding sites, PTMs and disulfide bonds from a UniProt entry.

    Motifs/microswitches are not read from UniProt; they come from the
    alignment-based check of the literature microswitches.

    Parameters
    ----------
    uniprot_json : dict
        Entry whose 'features' list holds dicts with a 'type', a 'location' and,
        depending on the type, a 'ligand' or a 'description'.
    uniprot_id : str
        Accession inserted in the reference URL.

    Returns
    -------
    tuple of three lists
        (binding_sites, PTMs, disulfide_bonds), each a list of dicts with
        'start', 'end', 'description' and 'reference' (binding sites also carry
        'type', the ligand name). For a disulfide bond, 'start' and 'end' are
        the two residues forming the bridge.
    """
    entry_reference = f"https://www.uniprot.org/uniprotkb/{uniprot_id}/entry"

    # Keyword looked for in the lower-cased description of a 'Modified residue',
    # paired with the label reported for it. The first keyword found wins.
    modified_residue_labels = (
        ("phospho", "Phosphorylation"),
        ("methyl", "Methylation"),
        ("acetyl", "Acetylation"),
        ("amid", "Amidation"),
        ("pyrro", "Pyrrolidone carboxylic acid"),
        ("hydroxy", "Hydroxylation"),
        ("l-", "Isomerization"),
        ("d-", "Isomerization"),
        ("sulf", "Sulfation"),
        ("nitro", "Nitrosylation"),
    )

    binding_sites, PTMs, disulfide_bonds = [], [], []

    for feature in uniprot_json['features']:
        kind = feature['type']

        #Binding site (orthosteric & allosteric)
        if kind == 'Binding site':
            binding_sites.append({
                'start': feature['location']['start']['value'],
                'end': feature['location']['end']['value'],
                'type': feature['ligand']['name'],
                'description': "Orthosteric or allosteric extracellular binding site.",
                'reference': entry_reference,
            })

        #PTMs whose feature type is already the label to report
        elif kind == 'Glycosylation' or kind == 'Lipidation':
            PTMs.append({
                'start': feature['location']['start']['value'],
                'end': feature['location']['end']['value'],
                'description': kind,
                'reference': entry_reference,
            })

        #Other PTMs: derive the label from the free-text description
        elif kind == 'Modified residue':
            raw_description = feature['description']
            lowered = raw_description.lower()
            label = raw_description  # unchanged when no keyword matches
            for keyword, full_name in modified_residue_labels:
                if keyword in lowered:
                    label = full_name
                    break
            PTMs.append({
                'start': feature['location']['start']['value'],
                'end': feature['location']['end']['value'],
                'description': label,
                'reference': entry_reference,
            })

        #Disulfide bond
        elif kind == 'Disulfide bond':
            disulfide_bonds.append({
                'start': feature['location']['start']['value'],
                'end': feature['location']['end']['value'],
                'description': "Disulfide bond",
                'reference': "UniProt",
            })

    return binding_sites,PTMs,disulfide_bonds

In [16]:
#retrieve the modification sites listed on Scop3P for one protein
def retrieve_PTM_Scop3P(uniprot_id):
    """Fetch the modification sites Scop3P lists for a protein.

    Parameters
    ----------
    uniprot_id : str
        Accession sent to the Scop3P API and inserted in the reference URL.

    Returns
    -------
    list of dict
        One dict per entry of the response's "modifications" list, with
        'start' and 'end' (both the modified position), 'description'
        (always 'Phosphorylation') and 'reference'. The list is empty when the
        request fails, the answer is not valid JSON, or it holds no
        modifications.
    """
    requestURL = f"https://iomics.ugent.be/scop3p/api/modifications?accession={uniprot_id}"
    try:
        r = requests.get(requestURL)
        r.raise_for_status()
        scop3P_PTM = json.loads(r.text)
    except (requests.RequestException, ValueError):
        # Network/HTTP problem or a body that is not JSON: treated as "nothing found".
        return []

    if not isinstance(scop3P_PTM, dict):
        return []

    # Built once as an f-string so the accession is actually filled into the link.
    reference = f"https://iomics.ugent.be/scop3p/index?protein={uniprot_id}"
    ptms = []
    for ptm in scop3P_PTM.get("modifications") or []:
        position = ptm['position']
        ptms.append({'start': position, 'end': position, 'description': 'Phosphorylation', 'reference': reference})
    return ptms

In [17]:
def run_dssp(uniprotID, type ="natural"):
    """Locate the DSSP file of a receptor, running mkdssp on a model if it is missing.

    Parameters
    ----------
    uniprotID : str
        Identifier used for the DSSP file, the model files and the lookup in
        ``representative_structures_json``.
    type : str
        "natural" looks for a model in AF_gpcrdb_2024 and otherwise downloads
        the AlphaFold DB model; any other value looks in AF_chimera_2024.

    Returns
    -------
    tuple
        (dssp_filename, gpcr_chain, mapping_uniprot_pdb_numbering). The mapping
        is a dict with integer keys loaded from the uniprot_pdb_mapping folder
        when the receptor has a representative structure and the DSSP file
        already exists; otherwise it is None.

    Raises
    ------
    FileNotFoundError
        If a chimera has neither a DSSP file nor a model in AF_chimera_2024.
    """
    dssp_folder = "../examples/3Dstructures/dssp/"
    dssp_filename = dssp_folder+uniprotID+".dssp"
    mapping_uniprot_pdb_numbering = None

    # A complete entry here means the receptor has an experimental representative structure.
    try:
        representative = representative_structures_json[uniprotID]
        pdb_representative = representative["pdb_id"]
        gpcr_chain = representative["gpcr_chain"]
        has_representative = True
    except (KeyError, TypeError):
        has_representative = False

    if os.path.exists(dssp_filename):
        if has_representative:
            try:
                with open("../examples/3Dstructures/uniprot_pdb_mapping/"+pdb_representative+".json") as mapping_file:
                    raw_mapping = json.load(mapping_file)
                mapping_uniprot_pdb_numbering = {int(k): v for k, v in raw_mapping.items()} #keys are strings
            except (OSError, ValueError, AttributeError):
                # No usable mapping file: handled like a model (chain "A", no renumbering).
                gpcr_chain = "A"
        else: #AF model
            gpcr_chain = "A"
        return dssp_filename, gpcr_chain, mapping_uniprot_pdb_numbering

    if has_representative:
        print("Problem, GPCR has exp structure and has no DSSP file", pdb_representative, gpcr_chain)
        #add chimera exp structure
        return dssp_filename, gpcr_chain, mapping_uniprot_pdb_numbering

    # No DSSP file and no experimental structure: run DSSP on a predicted model.
    gpcr_chain = "A"
    remove_tmp = False
    if type == "natural":
        #check if gpcrdb has updated model
        input_file = f"../examples/3Dstructures/AF_gpcrdb_2024/{uniprotID}.pdb"
        if not os.path.exists(input_file):
            #AlphaFold2 DB
            input_file = f'../examples/3Dstructures/tmp/{uniprotID}.pdb'
            os.makedirs(os.path.dirname(input_file), exist_ok=True)
            urllib.request.urlretrieve(f"https://alphafold.ebi.ac.uk/files/AF-{uniprotID}-F1-model_v4.pdb", input_file)
            remove_tmp = True
    else:
        input_file = f"../examples/3Dstructures/AF_chimera_2024/{uniprotID}.pdb"
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"No model to run DSSP on for {uniprotID}: {input_file} does not exist")

    os.makedirs(dssp_folder, exist_ok=True)
    try:
        subprocess.run(
                ["mkdssp", input_file, dssp_filename])
    finally:
        # The downloaded model is only a temporary input; drop it even if mkdssp could not be started.
        if remove_tmp:
            os.remove(input_file)
    return dssp_filename, gpcr_chain, mapping_uniprot_pdb_numbering

def find_closest(target,dict_interest):
    """Return the value stored under ``target`` or, failing that, under the nearest key.

    "Nearest" is the key with the smallest absolute difference to ``target``;
    when two keys are equally near, the one that comes first in the dict wins.
    """
    if target in dict_interest:
        return dict_interest[target]
    nearest_key = min(dict_interest, key=lambda candidate: abs(candidate - target))
    return dict_interest[nearest_key]
    
def remap_min_max_limits_prot_interest(min_max_limits,aligned_seq_interest):
    """Translate TM limits given as MSA columns into positions of one sequence.

    ``min_max_limits`` maps a TM name to ``[[min_start, max_start], [min_end, max_end]]``
    in MSA columns. Each column is replaced by the sequence position aligned to
    it or, when the sequence has a gap there, by the position aligned to the
    nearest column. The result has the same shape as the input.
    """
    # Invert the residue -> column map so it can be queried by column.
    column_to_position = {column: position for position, column in map_seq_MSA(aligned_seq_interest).items()}

    def to_position(column):
        return find_closest(column, column_to_position)

    min_max_limits_translated = {}
    for TMname, limits in min_max_limits.items():
        start_window, end_window = limits[0], limits[1]
        min_max_limits_translated[TMname] = [
            [to_position(start_window[0]), to_position(start_window[1])],
            [to_position(end_window[0]), to_position(end_window[1])],
        ]
    return min_max_limits_translated


def refine_TM_regions(min_max_limits_translated,TM_regions_prot):
    """Fit helices found by DSSP into the start/end windows allowed for each TM.

    Parameters
    ----------
    min_max_limits_translated : dict
        ``{TM name: [[min_start, max_start], [min_end, max_end]]}`` in sequence
        positions. TM names are "TM" followed by a number.
    TM_regions_prot : list
        ``(TM name, (start, end))`` tuples. Names absent from the limits are ignored.

    Returns
    -------
    dict
        ``{TM name: (start, end)}``. A TM named in the limits but missing from
        the input gets the widest allowed range ``(min_start, max_end)``.
    """
    expected_tm_count = 7
    min_tm_length = 10
    min_gap_between_tms = 5

    refined_TMs = {}

    # Step 1: clamp each helix start/end into its allowed window.
    for tm_label, (dssp_start, dssp_end) in TM_regions_prot:
        if tm_label not in min_max_limits_translated:
            continue
        (min_start, max_start), (min_end, max_end) = min_max_limits_translated[tm_label]

        adjusted_start = max(min_start, min(dssp_start, max_start))
        adjusted_end = min(max_end, max(dssp_end, min_end))

        # Step 2: a helix shorter than min_tm_length is widened, first towards its
        # start and then, by whatever is still lacking, towards its end.
        if adjusted_end - adjusted_start + 1 < min_tm_length:
            if adjusted_start > min_start:
                lacking = min_tm_length - (adjusted_end - adjusted_start + 1)
                adjusted_start = max(min_start, adjusted_start - lacking)
            if adjusted_end < max_end:
                lacking = min_tm_length - (adjusted_end - adjusted_start + 1)
                adjusted_end = min(max_end, adjusted_end + lacking)

        refined_TMs[tm_label] = (adjusted_start, adjusted_end)

    # Step 3: fewer helices than expected -> give every missing TM its widest range.
    if len(refined_TMs) < expected_tm_count:
        for tm in [tm for tm in min_max_limits_translated if tm not in refined_TMs]:
            (min_start, max_start), (min_end, max_end) = min_max_limits_translated[tm]
            refined_TMs[tm] = (min_start, max_end)

    # Step 4: more helices than expected -> merge a helix into the previous kept
    # one when it starts too close to it or is too short.
    if len(refined_TMs) > expected_tm_count:
        merged_TMs = {}
        prev_tm = None
        for tm in sorted(refined_TMs, key=lambda label: int(label[2:])):
            curr_start, curr_end = refined_TMs[tm]
            if prev_tm is not None:
                prev_start, prev_end = merged_TMs[prev_tm]
                if curr_start - prev_end < min_gap_between_tms or (curr_end - curr_start + 1 < min_tm_length):
                    merged_TMs[prev_tm] = (prev_start, curr_end)
                    # prev_tm keeps pointing at the helix that absorbed this one: the
                    # absorbed label has no entry of its own to look up next time.
                    continue
            merged_TMs[tm] = (curr_start, curr_end)
            prev_tm = tm
        refined_TMs = merged_TMs

    # Step 5: if merging left fewer than expected, top up from the limits.
    if len(refined_TMs) < expected_tm_count:
        missing_TMs = [tm for tm in min_max_limits_translated if tm not in refined_TMs]
        for tm in missing_TMs[: expected_tm_count - len(refined_TMs)]:
            (min_start, max_start), (min_end, max_end) = min_max_limits_translated[tm]
            refined_TMs[tm] = (min_start, max_end)

    return refined_TMs

def map_min_max_limitsMSA_chimera(min_max_limits_translated_parent,aligned_seq_interest_ref,seq_chimera):
    """Carry the parent's TM limits over to a pairwise alignment with the chimera.

    The ungapped parent sequence is globally aligned to ``seq_chimera`` and the
    parent's limits (positions in the ungapped parent sequence) are rewritten
    as columns of that pairwise alignment.

    Returns
    -------
    tuple
        (limits as columns of the pairwise alignment, chimera row of that alignment)
    """
    parent_sequence = aligned_seq_interest_ref.replace("-","")

    best_alignment = pairwise2.align.globalms(
        parent_sequence, seq_chimera, match=2, mismatch=-1,
        open=-2, extend=-1,
        one_alignment_only=True
    )[0]
    aligned_parent = best_alignment.seqA
    aligned_seq_interest_chimera = best_alignment.seqB

    #map min max limits TMs parent to MSA with chimera
    parent_position_to_column = map_seq_MSA(aligned_parent) #gives position of a unaligned res in msa
    min_max_limits_MSAchimera = {}
    for TMname, limits in min_max_limits_translated_parent.items():
        start_window = [parent_position_to_column[limits[0][0]], parent_position_to_column[limits[0][1]]]
        end_window = [parent_position_to_column[limits[1][0]], parent_position_to_column[limits[1][1]]]
        min_max_limits_MSAchimera[TMname] = [start_window, end_window]
    return min_max_limits_MSAchimera,aligned_seq_interest_chimera

def compute_dssp_TM_regions(uniprotID,MSA,type_gpcr = "natural",ref_id=None, seq_chimera = None):
    """Split a receptor into N-terminus, 7 TMs, loops and C-terminus using DSSP.

    Helical stretches are read from the receptor's DSSP file and then fitted
    into the start/end windows allowed for each TM in the MSA.

    Parameters
    ----------
    uniprotID : str
        Identifier of the receptor (or of the chimera).
    MSA : str
        Path of the alignment in FASTA format.
    type_gpcr : str
        "natural" reads the receptor's own row of the MSA; any other value
        treats the entry as a chimera and goes through ``ref_id``.
    ref_id : str, optional
        Record of the chimera's reference receptor in the MSA (chimeras only).
    seq_chimera : str, optional
        Sequence of the chimera (chimeras only).

    Returns
    -------
    list of dict
        15 regions (Nterm, TM1, ICL1, ..., TM7, H8&Cterm), each with "name",
        "start", "end" and "reference" ("DSSP").

    Raises
    ------
    ValueError
        If the DSSP file holds no helical residue for the receptor's chain.
    """
    record_dict = SeqIO.index(MSA, "fasta")
    try:
        if type_gpcr == "natural":
            aligned_seq_interest = str(record_dict[uniprotID].seq)
            length_prot = len(aligned_seq_interest.replace("-",""))
        else:
            length_prot = len(seq_chimera)
            aligned_seq_interest_ref = str(record_dict[ref_id].seq)
    finally:
        record_dict.close()

    dssp_filename, gpcr_chain, mapping_uniprot_pdb_numbering  = run_dssp(uniprotID,type_gpcr) #run dssp if needed on AF model else retrieve dssp file
    dssp_dic = make_dssp_dict(dssp_filename)[0]

    # Residue numbers of the chain of interest whose DSSP code is a helix (H, I or G).
    conserved_2structure_dssp = {"H", "I", "G"}
    helix_positions = [
        res_info[1]
        for (chain, res_info), value in dssp_dic.items()
        if chain == gpcr_chain and value[1] in conserved_2structure_dssp
    ]

    if mapping_uniprot_pdb_numbering is not None:
        # The mapping goes UniProt -> PDB numbering; invert it and drop residues
        # it does not cover (not part of the GPCR).
        mapping_pdb_uniprot_numbering = {v: k for k, v in mapping_uniprot_pdb_numbering.items()}
        helix_positions = [
            mapping_pdb_uniprot_numbering[int(pos)]
            for pos in helix_positions
            if int(pos) in mapping_pdb_uniprot_numbering
        ]

    if not helix_positions:
        raise ValueError(f"No helical residue found for chain {gpcr_chain} in {dssp_filename}")

    # Cut the helical residues into runs of consecutive numbers. A run is kept as
    # a TM when its last residue lies at least min_tm_length after its first;
    # shorter runs are considered small helices connecting the TMs.
    # NOTE(review): the test is on end - start, so a kept run spans at least
    # min_tm_length + 1 residues - confirm this is intended.
    min_tm_length = 10
    TM_regions = []
    counter = 1
    start = helix_positions[0]
    for previous_pos, current_pos in zip(helix_positions, helix_positions[1:]):
        if current_pos - previous_pos > 1:
            if previous_pos - start >= min_tm_length:
                TM_regions.append(("TM"+str(counter),(start,previous_pos)))
                counter += 1
            start = current_pos
    # The run still open at the end is always kept, whatever its length.
    TM_regions.append(("TM"+str(counter),(start,helix_positions[-1])))

    #adapt TM regions based on 50%-80% limits
    min_max_limits = {'TM1':[[559,563],[588,588]],
                        'TM2':[[606,606],[636,637]],
                        'TM3':[[654,657],[690,690]],
                        'TM4':[[717,717],[741,742]],
                        'TM5':[[913,916],[946,948]],
                        'TM6':[[1169,1174],[1203,1204]],
                        'TM7':[[1246,1248],[1270,1271]],} #based on partial dssp MSA with mapping dssp exp structures representative
    if type_gpcr == "natural":
        min_max_limits_translated_interest = remap_min_max_limits_prot_interest(min_max_limits,aligned_seq_interest)
    else:
        # Chimera: MSA columns -> reference sequence -> pairwise alignment -> chimera sequence.
        min_max_limits_translated_parent = remap_min_max_limits_prot_interest(min_max_limits,aligned_seq_interest_ref)
        min_max_limits_MSAchimera_parent,aligned_seq_interest_chimera = map_min_max_limitsMSA_chimera(min_max_limits_translated_parent,aligned_seq_interest_ref,seq_chimera)
        min_max_limits_translated_interest = remap_min_max_limits_prot_interest(min_max_limits_MSAchimera_parent,aligned_seq_interest_chimera)
    refined_TM_regions = refine_TM_regions(min_max_limits_translated_interest, TM_regions)

    # Everything between two TMs is a loop; what precedes TM1 is the N-terminus
    # and what follows TM7 is helix 8 plus the C-terminus.
    loop_names = ["ICL1", "ECL1", "ICL2", "ECL2", "ICL3", "ECL3"]
    regions = [{"name": "Nterm", "start": 1, "end": refined_TM_regions["TM1"][0] - 1, "reference": "DSSP"}]
    for tm_number in range(1, 8):
        tm_name = "TM"+str(tm_number)
        regions.append({
            "name": tm_name,
            "start": refined_TM_regions[tm_name][0],
            "end": refined_TM_regions[tm_name][1],
            "reference": "DSSP"
        })
        if tm_number < 7:
            regions.append({
                "name": loop_names[tm_number-1],
                "start": refined_TM_regions[tm_name][1] + 1,
                "end": refined_TM_regions["TM"+str(tm_number+1)][0] - 1,
                "reference": "DSSP"
            })
    regions.append({"name": "H8&Cterm", "start": refined_TM_regions["TM7"][1] + 1, "end": length_prot, "reference": "DSSP"})

    return regions


In [18]:
def _ss_region_name(position, all_regions, last_match=False):
    """Return the name of the region of ``all_regions`` that contains ``position``.

    With ``last_match`` the last containing region is returned instead of the
    first. Raises ValueError when no region contains the position.
    """
    found = None
    for ss_region in all_regions:
        if ss_region["start"] <= position <= ss_region["end"]:
            found = ss_region["name"]
            if not last_match:
                break
    if found is None:
        raise ValueError(f"Position {position} does not fall in any of the given regions")
    return found


def cutting_pts_2_ss_region(dict_regions,all_regions=None):
    """Format cutting points as "start-end" strings, optionally naming their regions.

    Parameters
    ----------
    dict_regions : dict or list
        Either ``{key: [lower, upper]}`` or a list of ``[lower, upper]`` pairs.
        Pairs of a list may hold numbers or numeric strings.
    all_regions : list of dict, optional
        Regions with "name", "start" and "end". When given (and not empty) every
        string gets the suffix " (<region of lower>-<region of upper>)".

    Returns
    -------
    list of str
        One string per pair, in input order. Any other type of ``dict_regions``
        gives an empty list.

    Raises
    ------
    ValueError
        If region names are requested and a position lies in none of the regions.
    """
    translated_regions = []
    if isinstance(dict_regions,dict):
        for limits in dict_regions.values():
            lower_lim = limits[0]
            upper_lim = limits[1]
            if all_regions:
                # When regions overlap, the last one containing the position names it.
                ss_lower_lim = _ss_region_name(lower_lim, all_regions, last_match=True)
                ss_upper_lim = _ss_region_name(upper_lim, all_regions, last_match=True)
                translated_regions.append(str(lower_lim)+"-"+str(upper_lim)+f" ({ss_lower_lim}-{ss_upper_lim})")
            else:
                translated_regions.append(str(lower_lim)+"-"+str(upper_lim))
    elif isinstance(dict_regions,list):
        for section in dict_regions:
            lower_lim = section[0]
            upper_lim = section[1]
            if all_regions:
                # Positions are compared as integers but printed as they were given.
                ss_lower_lim = _ss_region_name(int(lower_lim), all_regions)
                ss_upper_lim = _ss_region_name(int(upper_lim), all_regions)
                translated_regions.append(str(lower_lim)+"-"+str(upper_lim)+f" ({ss_lower_lim}-{ss_upper_lim})")
            else:
                translated_regions.append(str(lower_lim)+"-"+str(upper_lim))
    return translated_regions

def often_used_cutting_pts(cutting_points_parent,chimera,sequence,all_regions_cutting_pts,related_chimeras,dict_regions):
    """Record where a chimera cuts its parent and which chimeras share each cut.

    Parameters
    ----------
    cutting_points_parent : str
        Key of ``chimera`` holding the cutting points, as strings of the form
        "<start>-<end> (<region of start>-<region of end>)".
    chimera : dict
        Chimera description; its "name" is stored in ``related_chimeras``.
    sequence : str
        Parent sequence the positions refer to (positions count from 1).
    all_regions_cutting_pts : dict
        Updated in place: ``{region name: [[residue+position, third], ...]}``,
        where third is 0, 1 or 2 for a cut in the first, middle or last third
        of the region.
    related_chimeras : dict
        Updated in place: ``{residue+position: [chimera names]}``.
    dict_regions : list of dict
        Regions of the parent with "name", "start" and "end".

    Returns
    -------
    tuple
        (all_regions_cutting_pts, related_chimeras), the two updated dicts.
        A start equal to 1 and an end equal to the sequence length are the
        termini, not cuts, and are skipped.
    """
    def record_cut(position, name_region):
        # Locate the region and the third of it in which the cut falls.
        region = [d for d in dict_regions if d.get("name") == name_region][0]
        lower_lim = region["start"]
        upper_lim = region["end"]
        a_third = round((upper_lim-lower_lim)/3)
        if position < (lower_lim + a_third):
            idx = 0
        elif position < (lower_lim + 2*a_third):
            idx = 1
        else:
            idx = 2
        label = sequence[position-1]+str(position)
        all_regions_cutting_pts.setdefault(name_region, []).append([label,idx])
        related_chimeras.setdefault(label, []).append(chimera["name"])

    for region in chimera[cutting_points_parent]:
        tokens = region.split(" ")
        positions = tokens[0]
        name = tokens[1][1:-1]  # drop the surrounding parentheses
        if positions.split("-")[0] != "1":
            record_cut(int(positions.split("-")[0]), name.split("-")[0])
        if positions.split("-")[1] != str(len(sequence)):
            record_cut(int(positions.split("-")[1]), name.split("-")[1])
    return all_regions_cutting_pts,related_chimeras


def retrieve_involvement_natural_chimeric_design(uniprot_id,abb_name,sequence,chimeric_design_df):
    """List the chimeric designs a natural receptor takes part in.

    Parameters
    ----------
    uniprot_id : str
        Receptor looked for in the 'Reference_id' and 'Target_id' columns.
    abb_name : str
        Name compared with the first two "_"-separated parts of a chimera name
        to decide whether the receptor is that chimera's reference.
    sequence : str
        Sequence of the receptor, used to label the cutting points.
    chimeric_design_df : pandas.DataFrame
        Table of previous designs.

    Returns
    -------
    tuple
        (involvement, all_regions_cutting_pts_all, related_chimeras_all):
        one dict per chimera, the cutting points per region and, per cutting
        point, the chimeras using it. Duplicates are removed from the last two
        while keeping the order of first appearance.
    """
    involvement = []
    all_regions_cutting_pts_all = {}
    related_chimeras_all = {}

    # The same parents come back on many rows: compute their regions and look up
    # their names only once per call.
    natural_regions_cache = {}
    pharma_name_cache = {}

    def natural_regions(receptor_id):
        if receptor_id not in natural_regions_cache:
            natural_regions_cache[receptor_id] = compute_dssp_TM_regions(receptor_id,MSA,type_gpcr = "natural")
        return natural_regions_cache[receptor_id]

    def pharma_name_of(receptor_id, receptor_name):
        key = (receptor_id, receptor_name)
        if key not in pharma_name_cache:
            pharma_name_cache[key] = html.unescape(get_pharma_name(receptor_id, receptor_name))
        return pharma_name_cache[key]

    def text_or_empty(value):
        # Empty spreadsheet cells are not strings; they are shown as "".
        return value if isinstance(value,str) else ""

    columns = [
        'Chimera_name', 'Chimera_name_ids', 'Chimera_parts',
        'Target_name', 'Target_id', 'Reference_name', 'Reference_id',
        'Reference_cutting_points', 'Target_cutting_points',
        'Function binary', 'Application', 'Chimera Type (1/2/3)',
        'G-protein', 'Ligand', '3D structure PDB', 'DOI',
    ]

    for parent_column_id in ['Reference_id','Target_id']:

        #find rows that have uniprot as ref id or target id
        designs_parent = chimeric_design_df[chimeric_design_df[parent_column_id] == uniprot_id]

        # tolist() hands back plain Python values for every column.
        rows = zip(*(designs_parent[column].tolist() for column in columns))

        for (name, chimera_id, parts_chimera, target_name, target_id, ref_name, ref_id,
             cuts_ref, cuts_target, function_binary, application, type_chimera,
             g_protein, ligand, structure, doi) in rows:

            # NOTE(review): eval() runs whatever the spreadsheet cell contains. If the
            # cells only ever hold list/dict literals, ast.literal_eval is the safe
            # replacement - confirm against the data before switching.
            sequence_chimera = str(chimeras_record_dict[name].seq)
            all_regions = compute_dssp_TM_regions(chimera_id,MSA,type_gpcr = "chimera",ref_id=ref_id, seq_chimera = sequence_chimera)
            cutting_pt_chimera = cutting_pts_2_ss_region(eval(parts_chimera),all_regions)

            all_regions_ref = natural_regions(ref_id)
            cutting_pt_ref = cutting_pts_2_ss_region(eval(cuts_ref),all_regions_ref)

            all_regions_target = natural_regions(target_id)
            cutting_pt_target = cutting_pts_2_ss_region(eval(cuts_target),all_regions_target)

            pharma_name_ref = pharma_name_of(ref_id, ref_name)
            pharma_name_target = pharma_name_of(target_id, target_name)

            # Name of the chimera: both parent names with a single trailing "receptor".
            pharma_name_ref_ = pharma_name_ref
            pharma_name_target_ = pharma_name_target
            if "receptor" in pharma_name_ref.lower():
                pharma_name_ref_ = pharma_name_ref.replace(" receptor","")
            if "receptor" in pharma_name_target.lower():
                pharma_name_target_ = pharma_name_target.replace(" receptor","")
            pharma_name = pharma_name_ref_ + " " + pharma_name_target_ + " receptor"
            if "adrenoceptor" in pharma_name:
                pharma_name = pharma_name.replace(" receptor","")

            chimera={
            "name":name,
            "name_pharma":pharma_name,
            "id":chimera_id,
            "ref": ref_name,
            "ref_pharma_name": pharma_name_ref,
            "target": target_name,
            "target_pharma_name":pharma_name_target,
            "cutting_point_chimera": cutting_pt_chimera,
            "cutting_point_ref": cutting_pt_ref,
            "cutting_point_target": cutting_pt_target,
            "expression_function": function_binary,
            "type":type_chimera,
            "GprotLigand": text_or_empty(g_protein)+" "+text_or_empty(ligand),
            "application": text_or_empty(application)+" "+text_or_empty(structure),
            "reference": doi
            }

            involvement.append(chimera)

            if "_".join(name.split("_")[:2]) == abb_name:
                all_regions_cutting_pts_all,related_chimeras_all=often_used_cutting_pts("cutting_point_ref",chimera,sequence,all_regions_cutting_pts_all,related_chimeras_all,all_regions_ref)
            else:
                all_regions_cutting_pts_all,related_chimeras_all=often_used_cutting_pts("cutting_point_target",chimera,sequence,all_regions_cutting_pts_all,related_chimeras_all,all_regions_target)

    # Remove duplicates; dict.fromkeys keeps the first occurrence of each item, so
    # the result has the same order on every run.
    for key,value in all_regions_cutting_pts_all.items():
        all_regions_cutting_pts_all[key] = [list(item) for item in dict.fromkeys(map(tuple, value))]

    for key,value in related_chimeras_all.items():
        related_chimeras_all[key] = list(dict.fromkeys(value))

    return involvement,all_regions_cutting_pts_all,related_chimeras_all

In [19]:
def get_pharma_name(uniprotID,abb_name):
    """Return the GPCRdb name of a protein with HTML tags stripped.

    Queries ``https://gpcrdb.org/services/protein/accession/<uniprotID>`` and
    returns the ``"name"`` field of the JSON answer after removing anything
    that looks like an HTML tag (``<...>``).

    Parameters
    ----------
    uniprotID : str
        UniProt accession used to build the GPCRdb request URL.
    abb_name : str
        Fallback value, returned unchanged whenever no name can be obtained.

    Returns
    -------
    str
        The cleaned GPCRdb name, or ``abb_name`` when the request fails, times
        out, returns a non-OK status, is not valid JSON, or carries no string
        ``"name"`` field.
    """
    requestURL = f"https://gpcrdb.org/services/protein/accession/{uniprotID}"

    try:
        # A timeout keeps one unresponsive request from blocking the whole batch.
        r = requests.get(requestURL, headers={ "Accept" : "application/json"}, timeout=30)
        info_entry = json.loads(r.text) if r.ok else None
    except (requests.exceptions.RequestException, ValueError):
        # Network problems and undecodable JSON both mean "no GPCRdb info";
        # KeyboardInterrupt/SystemExit are deliberately NOT swallowed.
        info_entry = None

    # The payload may be missing the "name" key (or not be a dict at all):
    # fall back instead of raising.
    raw_name = info_entry.get("name") if isinstance(info_entry, dict) else None
    if not isinstance(raw_name, str):
        return abb_name

    clean_html_tags = re.compile('<.*?>')
    return re.sub(clean_html_tags, '', raw_name)

In [20]:
def remove_duplicates(list_dictionaries,uniprot_id, descriminator1, descriminator2=None):
    """Keep the first dictionary for every distinct identifier.

    The identifier of a dictionary is ``d[descriminator1]`` or, when
    ``descriminator2`` is given, the pair
    ``(d[descriminator1], d[descriminator2])``.

    When an identifier occurs more than once in ``list_dictionaries``, the
    ``"reference"`` of the kept (first) dictionary is overwritten in place:
    with the Scop3P URL built from ``uniprot_id`` when two discriminators are
    used, otherwise with the PDBe PISA URL.

    Returns the list of kept dictionaries in their original order. The
    returned items are the same objects as in the input list.
    """
    two_keys = descriminator2 != None

    def identifier_of(entry):
        # Single-discriminator identifiers are the bare value, not a 1-tuple.
        if two_keys:
            return (entry[descriminator1], entry[descriminator2])
        return entry[descriminator1]

    # Pass 1: how often does every identifier occur?
    occurrences = {}
    for entry in list_dictionaries:
        ident = identifier_of(entry)
        occurrences[ident] = occurrences.get(ident, 0) + 1

    # Reference assigned to the surviving copy of a duplicated identifier.
    if two_keys:
        preferred_reference = f"https://iomics.ugent.be/scop3p/index?protein={uniprot_id}"
    else:
        preferred_reference = "https://www.ebi.ac.uk/pdbe/pisa/"

    # Pass 2: keep the first dictionary per identifier.
    kept = []
    already_kept = set()
    for entry in list_dictionaries:
        ident = identifier_of(entry)
        if ident in already_kept:
            continue
        if occurrences[ident] > 1:
            # Don't keep whichever reference happened to come first.
            entry["reference"] = preferred_reference
        already_kept.add(ident)
        kept.append(entry)
    return kept

In [21]:
#convert scientific name UniProt to common name
def parse_species_file(file_path):
    """Build a ``{scientific name: common name}`` mapping from a text file.

    Every line containing ``N=`` sets the pending scientific name (the text
    after ``N=``, stripped). The next line containing ``C=`` while a name is
    pending stores the text after ``C=`` (stripped) as its common name and
    clears the pending name. Scientific names that are never followed by a
    ``C=`` line before the next ``N=`` line are not stored.
    """
    mapping = {}
    pending_scientific = None

    with open(file_path, 'r') as handle:
        for record in handle:
            if "N=" in record:
                pending_scientific = record.split("N=")[1].strip()
                continue
            if "C=" in record and pending_scientific:
                mapping[pending_scientific] = record.split("C=")[1].strip()
                # Only the first C= line after an N= line is used.
                pending_scientific = None

    return mapping

# Build the scientific-name -> common-name lookup once at cell level;
# species_dict is read by the per-entry loop further down in this notebook.
file_path = '../data/UniProt_names_scientific_common.txt' 
species_dict = parse_species_file(file_path)

In [22]:
def retrieve_predicted_models(structures,uniprotID):
    """Append the predicted (AlphaFold) models found for ``uniprotID``.

    Local model files are looked up under ``../examples/3Dstructures``
    (relative to the current working directory):

    * AlphaFold2: the GPCRdb 2024 model if ``AF_gpcrdb_2024/<id>.pdb`` exists
      (its state is read from ``AF_gpcrdb_2024/AF_gpcrdb_state.json``),
      otherwise an entry pointing at the AlphaFold DB v4 download URL.
    * AlphaFold2-Multistate Active / Inactive: the ``AFms_gpcrdb_2024`` model
      if present, otherwise the ``AFms_2023`` model if present, otherwise no
      entry is added for that state.

    Parameters
    ----------
    structures : list
        List of structure dictionaries; extended in place.
    uniprotID : str
        UniProt accession used as file stem of the model files.

    Returns
    -------
    list
        The same ``structures`` list, with the predicted models appended.

    Raises
    ------
    KeyError
        If the GPCRdb 2024 model exists but ``uniprotID`` is missing from the
        state JSON.
    """
    models_dir = "../examples/3Dstructures"
    url_root = "file:///examples/3Dstructures"

    def _predicted_entry(value, state, url, reference, date):
        # All predicted models share chain, offset, gaps, resolution and method.
        return {"value": value, "chain": "A", "state": state, "offset": 0, "gaps": [], "resolution": "", "method": "Predicted", "url": url, "reference": reference, "date": date}

    #AF
    #check if gpcrdb has updated model
    if os.path.exists(f"{models_dir}/AF_gpcrdb_2024/{uniprotID}.pdb"):
        # Context manager so the state file is closed right after reading.
        with open(f"{models_dir}/AF_gpcrdb_2024/AF_gpcrdb_state.json", "r") as state_file:
            states_AF_json = json.load(state_file)
        structures.append(_predicted_entry("AlphaFold2", states_AF_json[uniprotID], f"{url_root}/AF_gpcrdb_2024/{uniprotID}.pdb", "GPCRdb", "2024"))
    else:
        #AlphaFold2 DB
        structures.append(_predicted_entry("AlphaFold2", "Undetermined", f"https://alphafold.ebi.ac.uk/files/AF-{uniprotID}-F1-model_v4.pdb", "AFDB", "2022"))

    #AF multistate: prefer the GPCRdb 2024 model, fall back to the 2023 release.
    #Not every GPCR has a multistate model, so each file has to be checked.
    for state in ("Active", "Inactive"):
        for release_dir, date in (("AFms_gpcrdb_2024", "2024"), ("AFms_2023", "2023")):
            relative_path = f"{release_dir}/{state}/{uniprotID}.pdb"
            # The path must be an f-string: the original 2023 "Active" check
            # tested the literal text "{uniprotID}.pdb" and never matched.
            if os.path.exists(f"{models_dir}/{relative_path}"):
                structures.append(_predicted_entry(f"AlphaFold2-Multistate {state}", state, f"{url_root}/{relative_path}", "AlphaFold multistate", date))
                break

    return structures


In [None]:
def gather_interacting_residues(interacting_residues_list,binders_list,pdbs):
    """Merge per-structure contact residues into one contact record per residue.

    Parameters
    ----------
    interacting_residues_list : list
        One item per structure; each item is a list of residue groups and
        each group is an iterable of residues.
    binders_list : list
        One item per structure; ``binders_list[i][j]`` is the binder for
        residue group ``j`` of structure ``i``. Observations whose binder is
        ``None`` are ignored.
    pdbs : list
        One identifier per structure, used in the description text.

    Returns
    -------
    list of dict
        One dictionary per residue that has at least one non-``None`` binder,
        with keys ``start``, ``end``, ``type``, ``description`` and
        ``reference``. Binders and structure identifiers are each listed once,
        in the order they were first seen, so the output is identical from
        run to run.
    """
    # residue -> list of (binder, pdb_id) observations, in input order
    interactions = {}

    for i, residue_groups in enumerate(interacting_residues_list):
        binders = binders_list[i]
        pdb_id = pdbs[i]

        for j, residue_group in enumerate(residue_groups):
            for residue in residue_group:
                interactions.setdefault(residue, []).append((binders[j], pdb_id))

    contacts_list = []
    for residue, observations in interactions.items():
        observed = [(binder, pdb_id) for binder, pdb_id in observations if binder is not None]

        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the order depend on string hash randomisation.
        unique_binders = list(dict.fromkeys(binder for binder, _ in observed))
        if not unique_binders:
            continue
        unique_pdbs = list(dict.fromkeys(pdb_id for _, pdb_id in observed))

        contacts_list.append({
            "start": residue,
            "end": residue,
            "type": ", ".join(unique_binders),
            "description": f"Inferred from {', '.join(unique_pdbs)}.",
            "reference": "https://www.ebi.ac.uk/pdbe/pisa/"
        })

    return contacts_list

In [24]:
# Build one JSON entry per receptor of entry_uniprotID_seq (UniProt ID -> aligned sequence).
# An entry whose JSON file already exists is skipped, so the cell can be re-run to resume.
for uniprotID, prot_seq_aligned in entry_uniprotID_seq.items():
    prot_seq = prot_seq_aligned.replace("-","")
    requestURL = f"https://rest.uniprot.org/uniprotkb/{uniprotID}.json"
    r = requests.get(requestURL, headers={ "Accept" : "application/json"})

    # The UniProt record is mandatory: raise_for_status() raises on any non-OK answer.
    r.raise_for_status()

    uniprot_json = json.loads(r.text)

    class_ = 'A' #always class A for now
    abbreviated_name = uniprot_json["uniProtkbId"]
    names = []
    names.append({"value":uniprot_json['proteinDescription']['recommendedName']['fullName']['value'], "reference":"UniProt"})
    try:
        if 'alternativeNames' in uniprot_json['proteinDescription']:
            for i in range(len(uniprot_json['proteinDescription']['alternativeNames'])):
                names.append({"value":uniprot_json['proteinDescription']['alternativeNames'][i]['fullName']['value'],"reference":"UniProt"})
    except Exception:
        # Alternative names are best effort; a malformed record must not stop the entry.
        pass

    entry_json_path = f'../examples/json_entries/new_mammals_3/{abbreviated_name.upper()}.json'

    #if "a"=="a":
    # if uniprotID == "P02699":
    if not os.path.exists(entry_json_path):
        print(uniprotID)
        species_scientific = uniprot_json['organism']['scientificName']
        # Always (re)assign species: fall back to the scientific name when no common
        # name is known, instead of silently reusing the previous entry's organism.
        species = species_dict.get(species_scientific, species_scientific)
        
        #find classification based on human classification in GPCRdb
        #find human ortholog
        family = ""
        subclass_ligand = ""
        subclass_phylo = ""
        try:
            # Default to the entry itself so the ID can never be stale from a previous
            # iteration (this was the case for human entries classified "A-other").
            uniprot_id_human = uniprotID
            if listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprotID]["Phylogenetically-based"].values[0] == "A-other":
                if not abbreviated_name.endswith("HUMAN"):
                    abbreviated_name_human = (abbreviated_name.split('_')[0]+"_"+"HUMAN").lower()
                    uniprot_id_human = listGPCRdb_df[listGPCRdb_df['Name'] == abbreviated_name_human]['Uniprot ID'].values[0] 
            family = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprot_id_human]["Subclass"].values[0].rstrip() #need to change this to interpro API for those not on GPCRdb
            if "Class A" in family:
                family = family.replace("Class A ","")
                family = family[0].upper()+family[1:]
            if "receptors" in family:
                family = family.replace("receptors","").rstrip()
            elif "receptor" in family:
                family = family.replace("receptor","").rstrip()
            subclass_ligand = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprot_id_human]["Ligand-based"].values[0].rstrip() #need to change so it works for all mammals (put same as what we have for humans? What with those not on GPCRdb?)
            if "receptors" in subclass_ligand:
                subclass_ligand = subclass_ligand.replace("receptors","").rstrip()
            elif "receptor" in subclass_ligand:
                subclass_ligand = subclass_ligand.replace("receptor","").rstrip()
            subclass_phylo = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprot_id_human]["Phylogenetically-based"].values[0]
            if "A-" in subclass_phylo:
                subclass_phylo = subclass_phylo.split('-')[1].rstrip()
                subclass_phylo = subclass_phylo[0].upper()+subclass_phylo[1:]
        except Exception:
            if any("olfactory" in entry["value"].lower() for entry in names): #many of the olfactory humans are not in GPCRdb so no classification. We took the olf human ones from GROSS
                family = "Olfactory"
                subclass_ligand = "Olfactory"
                subclass_phylo = "Olfactory"
            else:
                print("Problem with classification (family,subfamily...)")

        #GtoP or gpcrdb_name or pharmacological name
        pharma_name = get_pharma_name(uniprotID,abbreviated_name)
        
        #Gprot and Barr coupling data from GPCRdb
        try:
            Gprot_coupling_data,Barr_coupling_data=coupling_Gprot_Barr(uniprotID)
        except Exception:
            Gprot_coupling_data,Barr_coupling_data=[],[]
        
        #get structures: pdb id, chain, state and offset
        #From the PDB (exp structures)
        structures,interacting_residues_list, binders_list, PDBs = retrieve_pdb_dsbonds_interactions(uniprot_json,uniprotID,prot_seq)
        structures = retrieve_predicted_models(structures,uniprotID)

        #Variations Uniprot
        requestURL = f"https://www.ebi.ac.uk/proteins/api/variation/{uniprotID}"

        r = requests.get(requestURL, headers={ "Accept" : "application/json"})
        if not r.ok:
            variations_uniprot_json = None
        else:
            variations_uniprot_json = json.loads(r.text)

        #Pharmacological info from GPCRdb mutants (API service)
        requestURL = f"https://gpcrdb.org/services/mutants/{abbreviated_name.lower()}"
        try:
            r = requests.get(requestURL, headers={ "Accept" : "application/json"})
            pharmaco_data_json = json.loads(r.text)
        except Exception:
            pharmaco_data_json = None

        #name endogenous ligands
        ligands = []
        for value in endogenous_ligands:
            if value["receptor"] == abbreviated_name.lower():
                # ligands.append({"value":  value["ligand_name"].replace("&", "").replace(";", "").replace("<sub>","_"), "reference": "GPCRdb"})
                ligands.append({"value":  value["ligand_name"].replace("&", "").replace(";", "").replace("<sup>","").replace("</sup>","").replace("<sub>","").replace("</sub>",""), "reference": "GPCRdb"})

        # Drop repeated ligand names, keeping the first occurrence of each.
        seen_values = set()
        ligands = [item for item in ligands if item["value"] not in seen_values and not seen_values.add(item["value"])]
    
        microswitches_literature = motifs_microswitches_literature(MSA,uniprotID)

        #Retrieve the mutagenesis information from Uniprot related to the entry
        if variations_uniprot_json:
            mutations_Uniprot = retrieve_mutagenesis_info_Uniprot(variations_uniprot_json,uniprotID)
        else:
            mutations_Uniprot = []

        #Pharmacological data - mutants info from GPCRdb API
        if pharmaco_data_json:
            pharmaco = retrieve_pharmaco_info_GPCRdb(pharmaco_data_json)
        else:
            pharmaco = []

        #If no conformational biosensor just put confo_biosensor = []
        confo_biosensor = []

        #find chimeras this parent is involved in
        #find the cutting points used for this parent in all designs that we know so we can learn from that
        chimeras,all_regions_cutting_pts,related_chimeras=retrieve_involvement_natural_chimeric_design(uniprotID,abbreviated_name,prot_seq,chimeric_design_df)
        
        cutting_point_values = [] #leave empty if it's a natural
        
        #retrieve all limits of the secondary structure elements
        allregions = compute_dssp_TM_regions(uniprotID,MSA,type_gpcr = "natural")

        #Add BW numbering based on limits TM regions and x.50 position
        BW_numbering=computeBW_numbering(uniprotID,allregions,MSA)

        #retrieve the residues interacting with ligand/Gprot/Nb/Ab in PDB and link it to region
        #Add manually extra IC and EC contacts
        #should follow the following structure: list regrouping all dictionaries with 1 dict per contact
        #in dictionary: {"start":,"end","type","description","reference"}
        #for the EC contacts the types can be "orthosteric","allosteric","VHH EC"
        #for the IC contacts the types can be "G-protein","VHH IC"
        # manual_ICs,manual_ECs = translate_interacting_residues_IC_EC(interacting_residues,binders,PDBs,allregions)
        contacts =gather_interacting_residues(interacting_residues_list,binders_list,PDBs)

        #Features UniProt: PTMs, binding sites, disulfide bonds
        features = {}
        ligand_BS_uniprot,PTMs_uniprot,disulfide_bonds_uniprot = features_uniprot(uniprot_json,uniprotID)

        #Scop3P phosphorylations
        PTMs_scop3P=retrieve_PTM_Scop3P(uniprotID)

        info = {}

        #Abbreviated name
        info["Abbreviated name"] = [{"value": abbreviated_name.upper(), "reference": "UniProt"}]

        #pharma name
        info["Pharmacological name"] = [{"value": pharma_name, "reference": "GPCRdb"}]

        #Name
        info["Name(s)"] = names

        #Uniprot ID
        info["Uniprot ID"] = [{"value": uniprotID, "reference": "UniProt"}]

        #Species
        info["Organism"] =  [{"value":species, "reference": "UniProt"}]

        #Class
        info["Class"] = [{"value":class_, "reference": "GPCRdb"}]

        #Family
        info["Family"] = [{"value": family, "reference": "GPCRdb"}]

        #Subclass
        #Phylogenetically based & Ligand based
        info["Subclass"] = {"Phylogenetically based": [{"value": subclass_phylo, "reference": "10.1124/mol.63.6.1256"}],
                            "Ligand based": [{"value":subclass_ligand, "reference": "GPCRdb"}]}

        #Endogenous ligand 
        info["Endogenous ligand"]=ligands

        #Gprot and Barr coupling data
        info["G-protein coupling"]=Gprot_coupling_data
        info["Beta-arrestin coupling"] = Barr_coupling_data

        #Structures
        info["Structures"] = structures

        #Info related to chimeric design
        info["Conformational biosensor"] = confo_biosensor
        info["Involvement in chimeric design"] = chimeras
        info["Cutting point values"] = cutting_point_values
        info["Known cutting points and designs"] = {"Known cutting points":all_regions_cutting_pts,"Known designs":related_chimeras}

        features['Microswitches'] = microswitches_literature

        #remove duplicate PTMs, keep Scop3P ref
        PTMs_tot = PTMs_uniprot + PTMs_scop3P
        PTMs_tot_unique = remove_duplicates(PTMs_tot,uniprotID,"start",descriminator2="description")

        features['PTMs'] = PTMs_tot_unique
        features['Disulfide bonds'] = disulfide_bonds_uniprot
        features['Mutagenesis'] = mutations_Uniprot
        features['Pharmacological mutagenesis'] = pharmaco

        #remove duplicate contacts, keep uniprot
        # Contacts_EC_tot = manual_ECs + ligand_BS_uniprot
        # Contacts_EC_unique = remove_duplicates(Contacts_EC_tot,uniprotID,"start")

        features["Contacts"] = contacts+ligand_BS_uniprot

        info["Features"] = features

        #Sequence
        info["Sequence"] = [{"value":prot_seq, "reference": "UniProt"}]

        #BW numbering
        info["BWnumbering"] = [{"value":BW_numbering, "reference": "MSA"}] 

        #Secondary structure info
        info["Limits regions"] = allregions

        #Gather info that could be useful for chimeric design
        known_info = []
        if len(confo_biosensor) > 0:
            known_info.append({"value": "Confo biosensor"})
        if len(chimeras) > 0:
            known_info.append({"value": "Parent chimera"})
            info["entryType"] ="parent"
        else:
            info["entryType"] ="natural"

        info["Known info chimeric design"] = known_info

        # Context manager: the file is flushed and closed before the next entry starts.
        with open(entry_json_path, 'w') as entry_file:
            json.dump(info, entry_file, indent=2)

Q8NGE7
A6ND48
A6NF89
A6NHG9
A6NKK0
A6NDH6
A6NM03
O14581
O60431
O76100
O60412
O60403
A0A0X1KG70
O95371
O95222
P0C617
A3KFT3
O95918
O95006
O60404
B2RN74
A6NL26
A4D2G3
P0C626
P0C604
P0C623
P0C645
O43869
P0C7N1
O95221
P0C7N5
P0C629
O76000
A6NJZ3
A6NH00
O76002
A6NMS3
O76099
O76001
A6NND4
O95013
O95047
O95007
A6NIJ9
Q9H346
P0C7T3
Q8NGI1
Q8NGF1
Q8NGJ3
P0C646
Q8NGH5
Q8NGI0
A6NGY5
A6NMU1
Q6IF63
Q8NGF0
Q8NGJ8
Q6IFG1
Q8NGI3
Q8NGF3
Q8NGI2
Q8NGH7
Q8NGH8
Q8NGJ6
Q8NGJ9
Q8NGH6
Q8NGJ7
Q8NGK2
Q8NGJ4
Q8NGK1
Q8NGJ2
Q8NGK3
Q8NGK4
Q8NGJ5
O14842
Q7Z602
Q8NGS0
Q8NGR6
Q8NGQ2
Q8NGZ6
Q8NGN4
Q8NGN3
Q8NGN1
Q8NGN5
Q8NGP0
Q8NGN8
Q8NGQ1
Q8NGR1
Q8NGR9
Q8NGS1
Q8NGQ5
Q8NH72
Q96R84
Q8NH70
Q96R09
Q9NZP0
Q9NZP5
Q8NGZ3
Q96RD1
A6NET4
A6NL08
A6NHA9
A6NM76
P0C628
Q8NGM9
A6NMZ5
A6NDL8
Q8NGY2
Q8NGS2
P0DN82
P47883
P47888
P58180
P47881
Q15612
P47893
P58182
Q15619
Q6IF82
Q8NG99
Q8NGC3
Q6IF36
Q8NGM1
Q8NGI8
O00574
P25025
P32246
P32248
P32302
P41597
P49238
P49682
P51677
P51679
P51681
P51684
P51686
P61073
Q9NPB9
P32249
Q13304
P25106
P4