In [19]:
import pandas as pd
import requests, sys
import json
import os
import urllib.request
from Bio import SeqIO
from Bio import AlignIO
import re
import html
from collections import defaultdict

In [None]:
# Load the design sequences from a FASTA file into a {record id: sequence string} dict.
# `of_interest` is only referenced by the disabled filter inside the loop below.
of_interest=['']

naturals_entry_data = "../data/all_designs.fasta"
entry_uniprotID_seq = {}
for record in SeqIO.parse(naturals_entry_data,"fasta"):
    # Disabled filter (the deeper indentation of the next statement is left over from it):
    # if record.id in of_interest:
        entry_uniprotID_seq[record.id]=str(record.seq)

# Report how many records were read, then dump the whole mapping.
# NOTE(review): the second print writes every sequence in full to the cell output;
# consider displaying only a small sample.
print(len(entry_uniprotID_seq))
print(entry_uniprotID_seq)

170
{'OPSD_BOVIN_AA2AR_HUMAN_1': 'MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVAIRIPLRYNGLVTENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQIFLAARRQLKQMESQPLPGERARSTLQKEVHAAKSLIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMRIREFRQTFRKIIRSHVLRQQEPFKAAGTSARILAAHGSDGEQVSLRLNGHPPGVWANGSAPHPERRPNGYALGLVSGGSAQESQGNTGLPDVELLSHELKGVCPEPPGLDDPLAQDGAGVS', 'OPSD_BOVIN_ADRB2_MESAU_1': 'MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVAITSPFKYQSLLTENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGRVFQVAKRQLQKIDKSEGRFHSPNLGQVEQDGRSGHGLRRSSKFCLKEHKALKTLIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMRSPDFRIAFQELLCLRRSSSKAYGNGYSSNSNGKTDYMGEASGCQLGQEKESERLCEDPPGTESFVNCQGTVPSLSLDSQGRNCSTNDSPL', 'OPSD_BOVIN_ADRB2_MESAU_2': 'MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSM

In [21]:
# Read the GPCRdb classification table from a local Excel file (nothing is downloaded here).
filename_listGPCRdb = "../data/240923_Classification_GPCRdb.xlsx"
listGPCRdb_df = pd.read_excel(filename_listGPCRdb)

# Read the table of previous chimeric designs; functions below access it as a module-level global.
filename_chimeric_designs = "../data/previous_designs.xlsx"
chimeric_design_df = pd.read_excel(filename_chimeric_designs)

In [22]:
def calculate_sq_atom_distance(i, j):
    """Return the squared Euclidean distance between the 3D points ``i`` and ``j``.

    Only the first three components of each point are read and no square
    root is taken, so the result is directly comparable to a squared cutoff.
    """
    dx = i[0] - j[0]
    dy = i[1] - j[1]
    dz = i[2] - j[2]
    return dx * dx + dy * dy + dz * dz

def identify_gaps(pdb_file, chain_pdb, offset, end): #code modified from pdb_gap.py file from pdbtools Copyright 2018 João Pedro Rodrigues
    """Split the resolved part of one chain into contiguous residue segments.

    The PDB file is scanned for CA atoms. A gap is recorded between two
    consecutive CA atoms of the same model and chain when they are more than
    4.0 coordinate units apart (compared as squared distance), or otherwise
    when their residue numbers are not consecutive.

    Args:
        pdb_file: path of the PDB file to read.
        chain_pdb: chain identifier; only gaps whose two flanking residues
            are both on this chain are used.
        offset: residue number at which the first segment starts.
        end: residue number at which the last segment ends.

    Returns:
        A list of ``[start, stop]`` pairs. Each gap closes the current
        segment at the residue before the gap and opens the next one at the
        residue after it; the final pair is ``[start, end]``. A segment is
        dropped when its start or stop is 1000 or larger.
    """
    centroid = ' CA '  # respect spacing. 'CA  ' != ' CA '
    distance_threshold = 4.0 * 4.0
    # Sentinel with the same layout as at_uid; its None model/chain never match a real atom.
    prev_at = (None, None, None, None, None, (None, None, None))
    model = 0
    gap = []
    # Context manager so the handle is closed even if a line fails to parse.
    with open(pdb_file, 'r') as fhandle:
        for line in fhandle:

            if line.startswith('MODEL'):
                model = int(line[10:14])

            elif line.startswith('ATOM'):
                atom_name = line[12:16]
                if atom_name != centroid:
                    continue

                resn = line[17:20]
                resi = int(line[22:26])
                chain = line[21]
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])

                at_uid = (model, chain, resi, resn, atom_name, (x, y, z))
                if prev_at[0] == at_uid[0] and prev_at[1] == at_uid[1]:
                    d = calculate_sq_atom_distance(at_uid[5], prev_at[5])
                    # Spatial break first; otherwise a break in the numbering.
                    if d > distance_threshold or prev_at[2] + 1 != at_uid[2]:
                        gap.append([prev_at[1],prev_at[2],at_uid[1],at_uid[2]])

                prev_at = at_uid

    gaps_cleaned = []
    start = offset
    for section in gap:
        if section[0] == chain_pdb and section[2] == chain_pdb:
            stop = section[1]
            if start < 1000 and stop < 1000:
                gaps_cleaned.append([start,stop])
            start = section[3]
    gaps_cleaned.append([start,end])
    return gaps_cleaned

In [None]:
def merge_duplicates(dicts):
    """Collapse annotation dicts that share the same 'start' value.

    Entries are grouped by ``d['start']`` in first-seen order. A group with a
    single entry is passed through unchanged (same object). A group with
    several entries that all share one 'type' is replaced by a copy of its
    first entry whose 'description' is "PDBePISA" followed, for every entry,
    by the text before the first '.' and the last space-separated word of
    that entry's description, and whose 'reference' is the PISA URL. Groups
    with differing 'type' values are not merged.

    Returns:
        ``(result, conflicts)``: the merged list and the list of entries
        from groups whose 'type' values disagree.
    """
    by_start = defaultdict(list)
    for entry in dicts:
        by_start[entry['start']].append(entry)

    result, conflicts = [], []
    for group in by_start.values():
        if len(group) <= 1:
            result.append(group[0])
            continue

        if len({entry['type'] for entry in group}) != 1:
            # Same position annotated with different types: report, do not merge.
            conflicts.extend(group)
            continue

        pieces = []
        for entry in group:
            text = entry['description']
            pieces.append(text.split('.')[0])
            pieces.append(text.split(' ')[-1])

        merged = group[0].copy()
        merged['description'] = "PDBePISA " + " ".join(pieces)
        merged['reference'] = "https://www.ebi.ac.uk/pdbe/pisa/"
        result.append(merged)

    return result, conflicts

def retrieve_interacting_residues_PDB(pdb_id, chain_pdb,uniprot_pdb_start,pdb_start):
    """Collect receptor residues that are buried in PDBePISA interfaces of assembly 1.

    A residue counts as interacting when its buried surface area reported by
    PISA is greater than 0. Each position is shifted by
    ``pdb_start - uniprot_pdb_start`` before being stored.
    API description: https://github.com/PDBe-KB/pdbe-pisa-json/blob/main/PISA-APIs.ipynb

    Args:
        pdb_id: PDB identifier (lower-cased for the API calls).
        chain_pdb: receptor chain id, or two chain ids joined by "/".
        uniprot_pdb_start: UniProt position used to compute the shift.
        pdb_start: PDB position used to compute the shift.

    Returns:
        ``(interacting_residues, chain_interacting_molecule)``. The lookup is
        best effort: any error (network, unexpected JSON, structures PISA
        cannot handle such as a single chain) stops the scan and whatever was
        collected so far is returned; the chain id defaults to "".
    """
    interacting_residues = []
    chain_interacting_molecule = ""
    try: #when its just 1 chain or 1 chain and a ligand PISA doesn't work
        difference_pdb_uniprot_start = pdb_start-uniprot_pdb_start
        pdb_key = pdb_id.lower()
        response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/pisa/assembly/{pdb_key}/1")
        interface_count = response.json()[pdb_key]["assembly"]["interface_count"]
        # "A/B" describes a receptor made of two chains; handle it as a list from here on.
        if "/" in chain_pdb:
            chain_pdb = chain_pdb.split("/")
        for i in range(1,interface_count+1):
            response_single_interface = requests.get(f"https://www.ebi.ac.uk/pdbe/api/pisa/interface/{pdb_key}/1/{i}/")
            molecules = response_single_interface.json()["molecules"]
            for j in range(len(molecules)):
                molecule = molecules[j]
                if isinstance(chain_pdb,str):
                    if molecule["chain_id"]==chain_pdb:
                        for bsa,position in zip(molecule["buried_surface_areas"],molecule['residue_seq_ids']):
                            if bsa >0.0:
                                interacting_residues.append(int(position)-difference_pdb_uniprot_start)
                        if j == 0: #there is supposed to be only 2 molecules, the GPCR and the interacting molecule
                            chain_interacting_molecule = molecules[1]["chain_id"]
                        else:
                            chain_interacting_molecule = molecules[0]["chain_id"]
                elif isinstance(chain_pdb,list):
                    # Interface between the two receptor chains themselves: skip it.
                    # The bounds check keeps the last molecule from raising IndexError,
                    # which previously ended the whole scan silently.
                    if (j+1 < len(molecules)
                            and chain_pdb[0] in molecule["chain_id"]
                            and chain_pdb[1] in molecules[j+1]["chain_id"]):
                        break
                    else:
                        if chain_pdb[0] in molecule["chain_id"] or chain_pdb[1] in molecule["chain_id"]:
                            for bsa,position in zip(molecule["buried_surface_areas"],molecule['residue_seq_ids']):
                                if bsa >0.0:
                                    interacting_residues.append(int(position)-difference_pdb_uniprot_start)
                        # NOTE(review): the partner chain is updated for every molecule here,
                        # not only for a matching one (unlike the single-chain branch) - confirm intent.
                        if j == 0: #there is supposed to be only 2 molecules, the GPCR and the interacting molecule
                            chain_interacting_molecule = molecules[1]["chain_id"]
                        else:
                            chain_interacting_molecule = molecules[0]["chain_id"]
    except Exception:
        # Deliberately best effort, but let KeyboardInterrupt/SystemExit propagate.
        pass

    return interacting_residues,chain_interacting_molecule

def extract_name_binders(chain_of_interest,pdb_file_path):
    """Return the COMPND molecule name of the entity that contains a chain.

    COMPND records are read in order. A ``MOLECULE:`` record starts a new
    name, and numbered continuation lines are appended to it until the
    entity's ``CHAIN:`` record is reached. The chain list of that record
    (including its own continuation lines) is split on commas and compared
    exactly, so a chain is found wherever it appears in ``CHAIN: A, B;``.

    Args:
        chain_of_interest: chain identifier to look for. An empty string
            matches the first CHAIN record (kept for backward compatibility:
            the first molecule name is returned).
        pdb_file_path: path of the PDB file to read.

    Returns:
        The molecule name without its trailing ';', or None when the chain
        is not listed or no molecule name precedes it.
    """
    continuation_pattern = re.compile(r"\d+\s+(.+)")
    current_molecule = ""
    reading_molecule = False
    reading_chains = False

    with open(pdb_file_path, 'r') as file:
        for line in file:
            if not line.startswith("COMPND"):
                continue

            chain_text = None
            if "MOLECULE:" in line:
                reading_molecule = True
                reading_chains = False
                current_molecule = line.split("MOLECULE:", 1)[1].strip().rstrip(";")
            elif "CHAIN:" in line:
                # The name is complete once the CHAIN record starts.
                reading_molecule = False
                chain_text = line.split("CHAIN:", 1)[1]
            else:
                match = continuation_pattern.search(line)
                if match and reading_molecule:
                    current_molecule += " " + match.group(1).strip().rstrip(";")
                elif match and reading_chains:
                    chain_text = match.group(1)

            if chain_text is not None:
                # A chain list without ';' continues on the next COMPND line.
                reading_chains = ";" not in chain_text
                chain_ids = [chain.strip() for chain in chain_text.split(";")[0].split(",")]
                if chain_of_interest == "" or chain_of_interest in chain_ids:
                    return current_molecule or None

    return None
 
    
def translate_interacting_residues_IC_EC(interacting_residues,binders,pdb_list,all_regions):
    """Turn interacting residue numbers into intra-/extracellular annotations.

    Every residue is looked up in ``all_regions`` (dicts with "name",
    "start", "end"). Nterm and ECL regions count as extracellular, Cterm and
    ICL regions as intracellular. Within a TM helix the side depends on which
    half of the helix the residue falls in: the first half of TM1/3/5/7 and
    the second half of TM2/4/6 are extracellular, the other halves
    intracellular. One annotation dict is produced per (residue, containing
    region) and the two resulting lists are passed through merge_duplicates.

    Args:
        interacting_residues: one iterable of residue numbers per structure.
        binders: binder name per structure (zipped with the residues).
        pdb_list: PDB id per structure (zipped with the residues).
        all_regions: region dicts of the receptor.

    Returns:
        ``(intracellular_annotations, extracellular_annotations)``.
    """
    extracellular_names = ["Nterm","ECL1","ECL2","ECL3"]
    intracellular_names = ["Cterm","ICL1","ICL2","ICL3"]
    odd_helices = ["TM1","TM3","TM5","TM7"]
    even_helices = ["TM2","TM4","TM6"]

    ec_hits = []
    ic_hits = []
    for residues,binder,pdb_id in zip(interacting_residues,binders,pdb_list):
        for residue in residues:
            for region in all_regions:
                if not (residue >= region["start"] and residue <= region["end"]):
                    continue

                region_name = region["name"]
                if region_name in extracellular_names:
                    side = "EC"
                elif region_name in intracellular_names:
                    side = "IC"
                elif region_name in odd_helices:
                    midpoint = region["start"] + round(((region["end"]-region["start"])/2))
                    side = "EC" if residue <= midpoint else "IC"
                elif region_name in even_helices:
                    midpoint = region["start"] + round(((region["end"]-region["start"])/2))
                    side = "IC" if residue <= midpoint else "EC"
                # A region name outside the four lists leaves `side` as it was.

                hit = {
                    "start": int(residue),
                    "end": int(residue),
                    "type": ("Intracellular binding pocket residue" if side == "IC"
                             else "Extracellular binding pocket residue"),
                    "description": f"{binder}. Inferred from {pdb_id}",
                    "reference": "https://www.ebi.ac.uk/pdbe/pisa/",
                }
                if side == "IC":
                    ic_hits.append(hit)
                else:
                    ec_hits.append(hit)

    #merge duplicates
    ic_merged,_ = merge_duplicates(ic_hits)
    ec_merged,_ = merge_duplicates(ec_hits)

    return ic_merged, ec_merged

In [24]:
def pad_line(line):
    """Return ``line`` cut to at most 81 characters (80 columns plus newline).

    Lines shorter than 80 characters have their newline characters stripped
    from both ends, are filled with spaces and get a newline appended before
    the cut is applied.
    """
    width = 80
    length = len(line)
    if length < width:
        line = "{}{}\n".format(line.strip('\n'), ' ' * (width - length + 1))
    return line[:width + 1]

def check_first_residue_pdb_uniprot(pdb_file,chain_interest,sequence):
    """Check whether the first resolved residue of a chain agrees with ``sequence``.

    SIFTS maps the first PDB residue to UniProt, but that residue is not
    necessarily the first one resolved in the structure. This reads the first
    ATOM record of ``chain_interest`` and compares its residue type with the
    letter found at the same (1-based) position of ``sequence``; when they
    agree no renumbering is needed.

    Args:
        pdb_file: path of the PDB file.
        chain_interest: one-character chain identifier.
        sequence: one-letter amino-acid sequence to compare against.

    Returns:
        True when the residue matches. False when it does not, when the chain
        has no ATOM record, when the residue name is not one of the 20
        standard amino acids, or when the residue number is unreadable or
        outside ``1..len(sequence)``.
    """
    Three_to_One_AA = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
     'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
     'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
     'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

    with open(pdb_file, 'r') as fhandle:
        for line in fhandle:
            # Slices never raise on short lines, so no padding is required.
            if line.startswith("ATOM") and line[21:22]==chain_interest:
                residue = Three_to_One_AA.get(line[17:20].upper())
                try:
                    # resSeq occupies columns 23-26 of an ATOM record.
                    pos = int(line[22:26])
                except ValueError:
                    return False
                if residue is None or not 1 <= pos <= len(sequence):
                    return False
                return sequence[pos-1] == residue
    return False

In [25]:
def SIFT_mapping(pdb_id,uniprot_id):
    """Look up where a PDB entry starts on its UniProt sequence (SIFTS).

    Queries the PDBe SIFTS mappings endpoint
    (https://www.ebi.ac.uk/pdbe/api/doc/sifts.html) and reads the first
    mapping listed for ``uniprot_id``.

    Args:
        pdb_id: PDB identifier; the response is indexed by its lower-case form.
        uniprot_id: UniProt accession expected in the mapping.

    Returns:
        ``(uniprot_pdb_start, pdb_start)``: the mapping's ``unp_start`` and
        the author residue number of its start, or the start's
        ``residue_number`` when no author number is given.

    Raises:
        KeyError/IndexError when the response lacks the expected entries;
        network errors from ``requests`` propagate.
    """
    response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}")
    data = response.json()
    first_mapping = data[pdb_id.lower()]['UniProt'][uniprot_id]['mappings'][0]
    uniprot_pdb_start = first_mapping['unp_start']
    pdb_start = first_mapping['start']['author_residue_number']
    # Only a missing value triggers the fallback; an author number of 0 is a real position.
    if pdb_start is None:
        pdb_start = first_mapping['start']['residue_number']
    return uniprot_pdb_start, pdb_start

def renumber_pdb_uniprot_start(pdb_file,pdb_id,chain,uniprot_id,sequence):
    """Renumber a PDB file with pdb_reres.py when its numbering disagrees with UniProt.

    The SIFTS start positions are fetched with SIFT_mapping. Renumbering is
    done only when the first resolved residue does not match ``sequence``
    (check_first_residue_pdb_uniprot) AND the two SIFTS start positions differ.

    Returns:
        ``(new_pdb_file_path, uniprot_pdb_start, pdb_start)``. The path is
        returned whether or not the file was written by this call; when no
        renumbering is done both start positions are returned as 0.
    """
    uniprot_pdb_start, pdb_start=SIFT_mapping(pdb_id,uniprot_id)
    new_pdb_file_path = f"../examples/3Dstructures/pdb_renumbered/{pdb_id}.pdb"
    modify = False
    if not check_first_residue_pdb_uniprot(pdb_file,chain,sequence) and uniprot_pdb_start != pdb_start :
    # If neither the SIFTS mapping nor the manual match against the sequence works, renumber.
        modify = True
    if modify:
        # IPython shell escape: this line only runs inside a notebook/IPython session.
        # The $names are interpolated from the Python variables above and stdout is
        # redirected into new_pdb_file_path.
        !python pdb_reres.py -$uniprot_pdb_start -$chain $pdb_file > $new_pdb_file_path
    else:
        uniprot_pdb_start, pdb_start = 0, 0
    return new_pdb_file_path, uniprot_pdb_start, pdb_start

In [None]:
def retrieve_pdb_dsbonds_chimera_info(structures_pdb,uniprot_id_ref,sequence):
    """Gather structure, disulfide-bond and binder information for a set of PDB entries.

    For every entry the PDB file is downloaded from RCSB to ``../data/tmp``,
    renumbered if needed (renumber_pdb_uniprot_start), scanned for the first
    and last residue of the chain, its gaps (identify_gaps) and its SSBOND
    records, and the PISA interface residues and binder name are looked up.
    The downloaded file is deleted afterwards.

    Args:
        structures_pdb: list of dicts with keys "PDB id", "chain", "state".
        uniprot_id_ref: UniProt accession used for the SIFTS mapping.
        sequence: one-letter sequence used to check the PDB numbering.

    Returns:
        ``(structures, ds_bonds, interacting_residues_list, binders_list, pdbs)``
        where ``ds_bonds`` only contains the disulfide bonds recorded once in
        every structure.
    """
    structures = []
    ds_bonds = []
    pdbs = []
    binders_list= []
    interacting_residues_list = []

    for pdb_info in structures_pdb:
        pdb_id = pdb_info["PDB id"]
        chain_pdb = pdb_info["chain"]
        state = pdb_info["state"]
        pdb_file_path = f'../data/tmp/{pdb_id}.pdb'
        urllib.request.urlretrieve(f'https://files.rcsb.org/download/{pdb_id}.pdb', pdb_file_path)

        #get SIFTS mapping PDB seq > UniProt mapping and renumber pdb. New pdb being written
        new_pdb_file_path, uniprot_pdb_start, pdb_start=renumber_pdb_uniprot_start(pdb_file_path,pdb_id,chain_pdb,uniprot_id_ref,sequence)
        if os.path.exists(new_pdb_file_path):
            path_DB_pdb = f"file:///examples/3Dstructures/pdb_renumbered/{pdb_id}.pdb"
        else:
            path_DB_pdb = f"https://files.rcsb.org/download/{pdb_id}.pdb"
            new_pdb_file_path = pdb_file_path

        # #this is needed to find the interactions within the pdb file
        pdbs.append(pdb_id)

        #get offset and gaps in structure
        # NOTE(review): whitespace splitting assumes the chain id and residue number are
        # separate tokens; this breaks when PDB columns run together (e.g. "A1000").
        offset = 0
        start_found = False
        with open(new_pdb_file_path, 'r') as f:
            for line in f:
                line_list = line.split()
                # Blank lines have no tokens to inspect.
                if not line_list or line_list[0]!="ATOM" or line_list[4]!=chain_pdb:
                    continue
                if not start_found:
                    if int(line_list[5]) >= 1:
                        offset = int(line_list[5]) - 1 #line_list[5] gives the 1st position so the offset is line_list[5] -1
                        start_found = True
                        # Keeps `end` defined even if this is the chain's only ATOM line.
                        end = int(line_list[5])
                else:
                    end = int(line_list[5])

        gaps = identify_gaps (new_pdb_file_path, chain_pdb, offset+1, end)
        structures.append({"value":pdb_id,"chain": chain_pdb, "state":state, "offset":  offset, "gaps": gaps, "url":path_DB_pdb, "reference":f"https://www.rcsb.org/structure/{pdb_id.upper()}"})

        #Look for disulfide bridges in PDB files
        with open(new_pdb_file_path, 'r') as f:
            for line in f:
                line_list = line.split()
                if line_list and line_list[0]=="SSBOND":
                    if line_list[3] == chain_pdb and line_list[6] == chain_pdb:
                        ssbond = {}
                        ssbond["start"] = int(line_list[4])
                        ssbond["end"]= int(line_list[7])
                        ssbond["description"] = "Disulfide bond"
                        ssbond["reference"] = "Extracted from PDB files"
                        ds_bonds.append(ssbond)

        #find interacting residues at ligand binding site and G protein binding site
        interacting_residues,binder_chain = retrieve_interacting_residues_PDB(pdb_id,chain_pdb,uniprot_pdb_start,pdb_start)

        interacting_residues_list.append(list(set(interacting_residues)))
        binder = extract_name_binders(binder_chain,pdb_file_path)
        # NOTE(review): a structure without a binder name is skipped here, so binders_list
        # can be shorter than pdbs/interacting_residues_list - confirm callers that zip them.
        if binder != None:
            binders_list.append(binder)

        #remove pdb file
        os.remove(pdb_file_path)

    #Keep only the DS bonds that are in every PDB file (sometimes add DS bond to stabilize structure to be crystalized)
    # Build a new list instead of removing items from the list being iterated,
    # which skipped the element following each removal.
    unique_ds_bonds = [dict(t) for t in {tuple(d.items()) for d in ds_bonds}]
    ds_bonds_noduplicates = [single for single in unique_ds_bonds
                             if ds_bonds.count(single) == len(structures)]

    return structures, ds_bonds_noduplicates, interacting_residues_list, binders_list, pdbs

In [None]:
# Derive the transmembrane (TM), loop and terminal regions from TMbed predictions

def TMbed_retrieve_allregions(TM_regions_file,abbreviated_name):
    """Read the region layout of one protein from a TMbed prediction file.

    The file is read as records whose first line is a ">" header; the third
    line of a record is taken as the per-residue prediction. The record name
    is the second '|'-separated field of the header's first token, or the
    token without its first character when there is no '|'. In the
    prediction, runs of "H"/"h" are helices; the stretches before, between
    and after them are named in the order Nterm, TM1, ICL1, TM2, ECL1, ...,
    TM7, Cterm.

    Returns:
        A list of dicts with "name", 1-based inclusive "start"/"end" and
        "reference" set to "TMbed"; empty when ``abbreviated_name`` is not
        found.
    """
    region_names = ['Nterm','TM1', 'ICL1', 'TM2', 'ECL1', 'TM3', 'ICL2', 'TM4', 'ECL2', 'TM5', 'ICL3', 'TM6', 'ECL3', 'TM7','Cterm']
    regions = []

    def add_region(label_index, first, last):
        # Region labels are consumed strictly in order of appearance.
        regions.append({"name": region_names[label_index], "start": first,
                        "end": last, "reference": "TMbed"})

    with open(TM_regions_file, 'r') as f:
        line_in_record = 0
        current_name = ""
        for raw_line in f:
            tokens = raw_line.replace('\n',"").replace('\t'," ").split()
            line_in_record += 1
            if ">" in tokens[0]:
                header_fields = tokens[0].split('|')
                current_name = header_fields[1] if len(header_fields) > 1 else tokens[0][1:]
                line_in_record = 1
            if current_name != abbreviated_name or line_in_record % 3 != 0:
                continue

            prediction = tokens[0]
            label_index = 0
            in_helix = False
            segment_start = 1
            for idx, state in enumerate(prediction):
                is_helix = state in ("H", "h")
                if not in_helix and is_helix:
                    # Helix begins: close the preceding non-helical stretch.
                    in_helix = True
                    helix_start = idx + 1  # 1-based position
                    add_region(label_index, segment_start, helix_start - 1)
                    label_index += 1
                elif in_helix and not is_helix:
                    # Helix ended at the previous character (1-based position idx).
                    in_helix = False
                    add_region(label_index, helix_start, idx)
                    label_index += 1
                    segment_start = idx + 1
            add_region(label_index, segment_start, len(prediction))
            break

    return regions

In [None]:
def cutting_pts_2_ss_region(dict_regions,all_regions=None):
    """Format cutting points as "lower-upper (RegionOfLower-RegionOfUpper)" strings.

    Args:
        dict_regions: either a dict whose values are (lower, upper) pairs, or
            a list of sections whose first two entries are the lower and
            upper positions.
        all_regions: region dicts ("name", "start", "end") used to name the
            region containing each position. Required for dict input; for
            list input without it only "lower-upper" is produced.

    Returns:
        One string per section; an empty list for any other input type.
    """
    translated_regions = []
    if isinstance(dict_regions,dict):
        for limits in dict_regions.values():
            lower_lim, upper_lim = limits[0], limits[1]
            # No early exit: the last region containing a position wins.
            for ss_region in all_regions:
                if ss_region["start"] <= lower_lim <= ss_region["end"]:
                    lower_name = ss_region["name"]
                if ss_region["start"] <= upper_lim <= ss_region["end"]:
                    upper_name = ss_region["name"]
            translated_regions.append(f"{lower_lim}-{upper_lim} ({lower_name}-{upper_name})")

    elif isinstance(dict_regions,list):
        if all_regions:
            for section in dict_regions:
                positions = []
                names = []
                for pos in section:
                    # First region containing the position wins.
                    for ss_region in all_regions:
                        if ss_region["start"] <= int(pos) <= ss_region["end"]:
                            pos_name = ss_region["name"]
                            break
                    positions.append(pos)
                    names.append(pos_name)
                translated_regions.append(f"{positions[0]}-{positions[1]} ({names[0]}-{names[1]})")
        else:
            for section in dict_regions:
                translated_regions.append(f"{section[0]}-{section[1]}")
    return translated_regions


def sections_coloring_chimera(info_specific_chimera,allregions):
    """Assign "EC"/"IC" labels to fifths of every receptor section of a chimera.

    Each of the 15 sections (Nterm, TM1, TM1-Loop-TM2, ..., TM7, Cterm) is
    divided into five slots. For every entry of
    ``info_specific_chimera["cutting_point_chimera"]`` (strings such as
    "12-80 (Nterm-TM2)") the lower position is located in a fifth of its
    region; that slot, or the next one when it is already taken, receives
    "EC" for even-indexed entries and "IC" for odd-indexed ones. Remaining
    empty slots then inherit the most recent label met when walking all
    sections in order, starting from "EC".

    Returns:
        Dict mapping section name to a ``{slot index: label}`` dict.
    """
    section_names = ["Nterm", "TM1", "TM1-Loop-TM2", "TM2", "TM2-Loop-TM3", "TM3",
                     "TM3-Loop-TM4", "TM4", "TM4-Loop-TM5", "TM5", "TM5-Loop-TM6",
                     "TM6", "TM6-Loop-TM7", "TM7", "Cterm"]
    coloring = {section: {slot: "" for slot in range(5)} for section in section_names}

    #make names match
    name_loops = {"ICL1":"TM1-Loop-TM2","ECL1":"TM2-Loop-TM3",
                    "ICL2":"TM3-Loop-TM4","ECL2":"TM4-Loop-TM5",
                    "ICL3":"TM5-Loop-TM6","ECL3":"TM6-Loop-TM7"}

    for parent, region in enumerate(info_specific_chimera["cutting_point_chimera"]):
        fields = region.split(" ")
        positions = fields[0]
        name = fields[1][1:-1]

        # Only the lower bound of each cutting-point entry is placed.
        position = int(positions.split("-")[0])
        name_region = name.split("-")[0]
        matching_region = [d for d in allregions if d.get("name") == name_region][0]
        lower_lim = matching_region["start"]
        upper_lim = matching_region["end"]
        a_fifth = round((upper_lim-lower_lim)/5)

        idx = 4
        for k in range(1, 5):
            if position < (lower_lim + k*a_fifth):
                idx = k - 1
                break

        if "IC" in name_region or "EC" in name_region:
            name_region = name_loops[name_region]

        parent_name = "EC" if (parent % 2)==0 else "IC"

        # Move to the following slot when the computed one is already labelled.
        slot = idx if len(coloring[name_region][idx]) == 0 else idx+1
        coloring[name_region][slot] = parent_name

    content = "EC"
    for slots in coloring.values():
        for idx in slots.keys():
            if len(slots[idx]) == 0:
                slots[idx] = content
            else:
                content = slots[idx]

    return coloring

def retrieve_involvement_natural_chimeric_design(uniprot_id,all_regions):
    """List the chimeric designs in which a natural receptor is a parent.

    Rows of the module-level ``chimeric_design_df`` are selected twice: once
    where ``uniprot_id`` is the 'Reference_id' and once where it is the
    'Target_id'. Every selected row becomes one summary dict.

    Args:
        uniprot_id: identifier compared with the 'Reference_id' and
            'Target_id' columns.
        all_regions: region dicts used to name the chimera's cutting points.

    Returns:
        A list of dicts with the keys built below ("name", "name_pharma",
        "id", "ref", ..., "reference"). A row that matches in both columns
        would be listed twice.
    """

    involvement = []

    for parent_column_id in ['Reference_id','Target_id']:

        #find rows that have uniprot as ref id or target id
        designs_parent = chimeric_design_df[chimeric_design_df[parent_column_id] == uniprot_id]

        #Info from rows
        names_chimeras = designs_parent['Chimera_name'].tolist()
        ids_chimeras = designs_parent['Chimera_name_ids'].tolist()
        regions_chimera = designs_parent['Chimera_parts'].tolist()
        name_target_chimeras = designs_parent['Target_name'].tolist()
        id_target_chimeras = designs_parent['Target_id'].tolist()

        name_ref_chimeras = designs_parent['Reference_name'].tolist()
        id_ref_chimeras = designs_parent['Reference_id'].tolist()

        regions_ref_chimeras = designs_parent['Reference_cutting_points'].tolist()
        regions_target_chimeras = designs_parent['Target_cutting_points'].tolist()

        # 'Expression binary' is read but not used below.
        expression = designs_parent['Expression binary'].tolist()
        fct = designs_parent['Function binary'].tolist()

        application = designs_parent['Application'].tolist()
        type_chimera = designs_parent['Chimera Type (1/2/3)'].tolist()
        Gprot = designs_parent['G-protein'].tolist()
        Ligand =designs_parent['Ligand'].tolist()
        structures = designs_parent['3D structure PDB'].tolist()
        biblio = designs_parent['DOI'].tolist()

        for i,name in enumerate(names_chimeras):
            
            # NOTE(review): eval executes the spreadsheet cell as Python code; if the cells
            # only hold list/dict literals, ast.literal_eval would be a safer parser.
            cutting_pt_chimera = cutting_pts_2_ss_region(eval(regions_chimera[i]),all_regions) 

            # The TMbed file is re-read for each parent of each design.
            all_regions_ref = TMbed_retrieve_allregions("../data/Uniprot_TMbed.txt",id_ref_chimeras[i])
            cutting_pt_ref = cutting_pts_2_ss_region(eval(regions_ref_chimeras[i]),all_regions_ref)

            all_regions_target = TMbed_retrieve_allregions("../data/Uniprot_TMbed.txt",id_target_chimeras[i])
            cutting_pt_target = cutting_pts_2_ss_region(eval(regions_target_chimeras[i]),all_regions_target)

            # get_pharma_name is not defined in this part of the notebook; its result is HTML-unescaped here.
            pharma_name_ref = html.unescape(get_pharma_name(id_ref_chimeras[i],name_ref_chimeras[i]))
            pharma_name_target = html.unescape(get_pharma_name(id_target_chimeras[i], name_target_chimeras[i]))

            # Combined name: both parent names without " receptor", followed by a single
            # " receptor" suffix, which is removed again when "adrenoceptor" is present.
            pharma_name_ref_ = pharma_name_ref
            pharma_name_target_ = pharma_name_target
            if "receptor" in pharma_name_ref.lower():
                pharma_name_ref_ = pharma_name_ref.replace(" receptor","")
            if "receptor" in pharma_name_target.lower():
                pharma_name_target_ = pharma_name_target.replace(" receptor","")
            pharma_name = pharma_name_ref_ + " " + pharma_name_target_ + " receptor"
            if "adrenoceptor" in pharma_name:
                pharma_name = pharma_name.replace(" receptor","")

            # Non-string cells (presumably NaN for empty spreadsheet cells - TODO confirm)
            # are replaced by empty strings.
            if isinstance(structures[i],str):
                pdb = structures[i]
            else:
                pdb = ""

            if isinstance(Gprot[i],str):
                gprot=Gprot[i]
            else:
                gprot=""

            if isinstance(Ligand[i],str):
                ligand=Ligand[i]
            else:
                ligand=""

            # NOTE(review): application[i] is concatenated without the string check applied
            # to the other optional columns; a non-string cell would raise TypeError here.
            chimera={
            "name":name,
            "name_pharma":pharma_name,
            "id":ids_chimeras[i],
            "ref": name_ref_chimeras[i],
            "ref_pharma_name": pharma_name_ref,
            "target": name_target_chimeras[i],
            "target_pharma_name":pharma_name_target,
            "cutting_point_chimera": cutting_pt_chimera,
            "cutting_point_ref": cutting_pt_ref,
            "cutting_point_target": cutting_pt_target,
            "expression_function": fct[i],
            "type": type_chimera[i],
            "GprotLigand": gprot+" "+ligand,
            "application": application[i]+" "+pdb,
            "reference": biblio[i]
            }
            involvement.append(chimera)

    return involvement


In [29]:
def retrieve_3D_structure_info(info_3Dstructure):
    """Parse a "PDBID(chain state), PDBID(chain state)" cell into dicts.

    Entries are separated by commas; surrounding whitespace and empty entries
    are ignored. Inside the parentheses the first word is the chain and the
    remainder (which may contain spaces) is the state.

    Args:
        info_3Dstructure: string such as "6X18(R active), 7DHI(A inactive)".

    Returns:
        A list of ``{"PDB id": ..., "chain": ..., "state": ...}`` dicts.

    Raises:
        ValueError: when a non-empty entry has no "(" section.
        AttributeError: when the input is not a string (e.g. an empty cell).
    """
    structures_pdb = []
    for pdb_info in info_3Dstructure.split(","):
        pdb_info = pdb_info.strip()
        if not pdb_info:
            continue
        if "(" not in pdb_info:
            raise ValueError(f"Cannot parse structure entry: {pdb_info!r}")

        pdb_id, details = pdb_info.split("(", 1)
        details = details.strip()
        if details.endswith(")"):
            details = details[:-1]
        chain, _, state = details.strip().partition(" ")

        pdb_dict={"PDB id":pdb_id.strip(),"chain":chain,"state":state.strip()}
        structures_pdb.append(pdb_dict)
    return structures_pdb

def split_string(input_string):
    """Split a mutation-like token into its non-empty parts.

    The start of ``input_string`` is matched against: optional letter,
    optional '-', digits, optional '-', optional letter. The parts that
    matched something are returned in that order (e.g. "A123B" ->
    ["A", "123", "B"]); anything after the matched prefix is ignored.
    """
    pattern = r"([a-zA-Z]?)(-?)(\d*)(-?)([a-zA-Z]?)"
    found = re.match(pattern, input_string)
    if not found:
        return []
    return [part for part in found.groups() if part]

def retrieve_mutations_vs_parents_info(mutations,biblio):
    """Parse a comma-separated mutation string into annotation dicts.

    Each mutation (e.g. "A123B") is split with split_string into the parent
    residue, the position and the chimera residue. Whitespace around each
    mutation is ignored, and tokens of three characters or fewer are skipped.

    Args:
        mutations: string such as "A123B, C245-".
        biblio: reference stored with every mutation.

    Returns:
        A list of dicts with "start", "end", "original residue",
        "alternative residue" and "reference". The list is empty when the
        whole string is shorter than four characters.

    Raises:
        TypeError: when ``mutations`` has no length (e.g. an empty cell read
            as a float).
        IndexError/ValueError: when a token does not split into residue,
            integer position and residue.
    """
    mutations_list = []
    if len(mutations) < 4:
        # Always return a list; the exception class used to be returned here by mistake.
        return mutations_list

    for mut in mutations.split(","):
        mut = mut.strip()
        if len(mut) <= 3:
            continue
        split_mut = split_string(mut)
        position = split_mut[1]
        parent_res = split_mut[0]
        chimera_res = split_mut[2]
        mut_dict = {"start":int(position),"end":int(position),"original residue":parent_res, "alternative residue":chimera_res,"reference":biblio}
        mutations_list.append(mut_dict)
    return mutations_list

def retrieve_chimera_design_info(abb_name):
    """Collect the design-table information for one chimera.

    The first row of the module-level ``chimeric_design_df`` whose
    'Chimera_name' equals ``abb_name`` is used.

    Returns:
        A tuple ``(uniprot_id, given_name, cutting_points, ref_parent_name,
        ref_parent_id, target_parent_name, target_parent_id, structures_pdb,
        mutations_vs_parents, gprot, biblio)``. ``structures_pdb`` and
        ``mutations_vs_parents`` are empty lists when their cell cannot be
        parsed; ``gprot`` is "" when the 'G-protein' cell is not a string.

    Raises:
        IndexError: when no row matches ``abb_name``.
    """

    all_info = chimeric_design_df[chimeric_design_df["Chimera_name"] == abb_name]

    uniprot_id = all_info["Chimera_name_ids"].values[0]
    given_name = all_info["Given name"].values[0]
    # NOTE(review): eval executes the spreadsheet cell as Python code; if the cells only
    # hold list/dict literals, ast.literal_eval would be a safer parser.
    cutting_points = eval(all_info["Chimera_parts"].values[0])
    ref_parent_name = all_info["Reference_name"].values[0]
    ref_parent_id = all_info["Reference_id"].values[0]
    target_parent_name = all_info["Target_name"].values[0]
    target_parent_id = all_info["Target_id"].values[0]
    Gprot = all_info['G-protein'].values[0]

    gprot = Gprot if isinstance(Gprot,str) else ""

    biblio = all_info['DOI'].values[0]

    # Parsing is best effort, but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        structures_pdb =  retrieve_3D_structure_info(all_info["3D structure PDB"].values[0])
    except Exception:
        structures_pdb = []
    try:
        mutations_vs_parents = retrieve_mutations_vs_parents_info(all_info["Mutations"].values[0],biblio)
    except Exception:
        mutations_vs_parents = []
    # Guard against a helper handing back something other than a list of mutations.
    if not isinstance(mutations_vs_parents, list):
        mutations_vs_parents = []

    return uniprot_id,given_name,cutting_points,ref_parent_name,ref_parent_id,target_parent_name,target_parent_id,structures_pdb,mutations_vs_parents,gprot,biblio

In [None]:
def familly_subclass_parents(ref_parent_name,ref_parent_id,target_parent_name,target_parent_id):
    """Derive family and subclass labels of a chimera from its two parents.

    Each parent is looked up in ``listGPCRdb_df``. A non-human parent whose
    phylogenetic class is ``"A-other"`` is replaced by its human ortholog
    (``<gene>_human`` in the ``Name`` column) before the labels are read.

    Returns
    -------
    tuple of str
        ``(family, ligand-based subclass, phylogenetic subclass)``, each
        formatted as ``"<reference parent> & <target parent>"``.
    """
    #find classfication based on human classification in GPCRdb
    #find human ortholog

    def strip_receptor_word(label):
        # Drop the word "receptor(s)" and any whitespace left at the end.
        if "receptors" in label:
            return label.replace("receptors","").rstrip()
        if "receptor" in label:
            return label.replace("receptor","").rstrip()
        return label

    family_chimera = []
    subclass_ligand_chimera = []
    subclass_phylo_chimera = []

    for abbreviated_name,uniprotID in zip([ref_parent_name,target_parent_name],[ref_parent_id,target_parent_id]):
        try:
            # Start from the parent's own accession on every iteration.
            # Previously a human "A-other" parent left this variable unset,
            # so the second parent silently reused the ID of the first one.
            uniprot_id_human = uniprotID
            phylo_parent = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprotID]["Phylogenetically-based"].values[0]
            if phylo_parent == "A-other" and not abbreviated_name.endswith("HUMAN"):
                abbreviated_name_human = (abbreviated_name.split('_')[0]+"_"+"HUMAN").lower()
                uniprot_id_human = listGPCRdb_df[listGPCRdb_df['Name'] == abbreviated_name_human]['Uniprot ID'].values[0]

            # Filter the table once instead of once per column.
            entry = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprot_id_human]

            family = entry["Subclass"].values[0].rstrip() #need to change this to interpro API for those not on GPCRdb
            if "Class A" in family:
                family = family.replace("Class A ","")
                family = family[0].upper()+family[1:]
            family = strip_receptor_word(family)

            #need to change so it works for all mammals (put same as what we have for humans? What with those not on GPCRdb?)
            subclass_ligand = strip_receptor_word(entry["Ligand-based"].values[0].rstrip())

            subclass_phylo = entry["Phylogenetically-based"].values[0]
            if "A-" in subclass_phylo:
                subclass_phylo = subclass_phylo.split('-')[1].rstrip()
                subclass_phylo = subclass_phylo[0].upper()+subclass_phylo[1:]

        except Exception:
            # NOTE(review): every lookup failure is labelled "Olfactory";
            # presumably because those receptors are absent from the GPCRdb
            # table - confirm.
            family = "Olfactory"
            subclass_ligand = "Olfactory"
            subclass_phylo = "Olfactory"

        family_chimera.append(family)
        subclass_ligand_chimera.append(subclass_ligand)
        subclass_phylo_chimera.append(subclass_phylo)

    family_chimera_str = " & ".join(family_chimera)
    subclass_ligand_chimera_str = " & ".join(subclass_ligand_chimera)
    subclass_phylo_chimera_str = " & ".join(subclass_phylo_chimera)

    return family_chimera_str,subclass_ligand_chimera_str,subclass_phylo_chimera_str

In [31]:
def write_cutting_point_region(cutting_points,ref_parent_name,target_parent_name,biblio):
    """Label each chimera segment with the parent it was taken from.

    Segments alternate between the two parents: even-indexed regions are
    attributed to ``ref_parent_name`` ("EC side parent"), odd-indexed ones to
    ``target_parent_name`` ("IC side parent").

    Returns a list of dicts with the keys ``name``, ``start``, ``end`` and
    ``reference``, one per entry of ``cutting_points``.
    """
    # Indexed by the parity of the region index.
    origin_by_parity = (
        (ref_parent_name, "EC side parent"),
        (target_parent_name, "IC side parent"),
    )

    def describe(index, bounds):
        parent, parent_type = origin_by_parity[index % 2]
        return {
            "name": f"{parent} ({parent_type})",
            "start": bounds[0],
            "end": bounds[1],
            "reference": biblio,
        }

    return [describe(index, bounds) for index, bounds in enumerate(cutting_points)]

In [32]:
#convert scientific name UniProt to common name
def parse_species_file(file_path):
    """Build a scientific-name -> common-name mapping from a species list.

    A line containing ``N=`` starts a new entry (the text after ``N=`` is the
    scientific name). The first following line containing ``C=`` supplies the
    common name and completes the entry. Entries without a ``C=`` line are
    not included in the result.
    """
    with open(file_path, 'r') as handle:
        content = handle.readlines()

    mapping = {}
    pending = None  # scientific name still waiting for its common name
    for entry in content:
        if "N=" in entry:
            pending = entry.split("N=")[1].strip()
            continue
        if pending and "C=" in entry:
            mapping[pending] = entry.split("C=")[1].strip()
            pending = None  # entry complete; ignore further C= lines

    return mapping

def species_uniprot(ref_parent_id,target_parent_id):
    """Return the organisms of both parents as ``"<ref> & <target>"``.

    The scientific name of each accession is fetched from the UniProt REST
    API and translated to a common name with the local species list. When
    the list has no common name for an organism, the scientific name is used
    instead of failing.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with an error status.
    """
    species_parents = []
    for prot_id in [ref_parent_id,target_parent_id]:
        requestURL = f"https://rest.uniprot.org/uniprotkb/{prot_id}.json"
        # A timeout keeps a stalled connection from blocking the whole batch.
        r = requests.get(requestURL, headers={ "Accept" : "application/json"}, timeout=30)
        # No-op for successful responses; the former sys.exit() after it was
        # unreachable because this call already raises on every error status.
        r.raise_for_status()
        uniprot_json = json.loads(r.text)
        species_parents.append(uniprot_json['organism']['scientificName'])

    #convert scientific names to common names
    uniprot_names = "../data/UniProt_names_scientific_common.txt"
    species_dict = parse_species_file(uniprot_names)
    # Entries without a common name are missing from the mapping.
    return " & ".join(species_dict.get(species, species) for species in species_parents)

In [33]:
def retrieve_seq_parents(ref_parent_id,target_parent_id,fasta_path="/Users/charlottecrauwels/Library/CloudStorage/OneDrive-VrijeUniversiteitBrussel/PhD/WP1/Data/Sequence/all_mammals_classA/240923_cleaned_all_mammals_classA.fasta"):
    """Fetch the sequences of both parent receptors from a FASTA file.

    Parameters
    ----------
    ref_parent_id, target_parent_id : str
        Record identifiers to look up in the FASTA index.
    fasta_path : str, optional
        FASTA file to read. The default is the original machine-specific
        location; pass a path explicitly to run the notebook elsewhere.

    Returns
    -------
    tuple of str
        ``(reference sequence, target sequence)``.
    """
    proteins_mammalians = SeqIO.index(fasta_path, "fasta")
    try:
        ref_seq = str(proteins_mammalians[ref_parent_id].seq)
        target_seq = str(proteins_mammalians[target_parent_id].seq)
    finally:
        # The index keeps the FASTA file open; release it after every call.
        proteins_mammalians.close()
    return ref_seq,target_seq

In [34]:
#equivalence between positions in sequence and in MSA
#returns a dictionary: key = 1-based position in the ungapped sequence, value = 1-based column of that residue in the MSA
def map_seq_MSA(sequence_aligned):
    """Map ungapped sequence positions to alignment columns.

    ``sequence_aligned`` is one row of an alignment with ``-`` as the gap
    character. The result maps the 1-based index of every residue in the
    ungapped sequence to its 1-based column in the aligned row.
    """
    translate = {}
    residue_count = 0
    for column, symbol in enumerate(sequence_aligned, start=1):
        if symbol == "-":
            continue  # gap columns have no counterpart in the sequence
        residue_count += 1
        translate[residue_count] = column
    return translate

#Microswitches/motifs - identify them based on their defined columns in mammalian MSA
def motifs_microswitches_literature(MSA,uniprot_id):
    """Report the class A microswitch / motif residues of one aligned receptor.

    Each motif is defined by fixed, 1-based columns of the alignment and the
    residues accepted at each column. For every column the function returns
    the matching position in the ungapped sequence and whether the expected
    residue is present. When the receptor has a gap at a motif column, the
    next residue after the gap is reported instead (flagged as not conserved).

    Parameters
    ----------
    MSA : str
        Path to the alignment in FASTA format.
    uniprot_id : str
        Identifier of the aligned record to analyse.

    Returns
    -------
    list of dict
        One dict per motif column with the keys ``start``, ``end``,
        ``description``, ``conserved`` ("yes"/"no") and ``reference``.
        A column is left out when the receptor has no residue at or after it.
    """
    #GPCRdb finds sodium pockets
    #As microswitches are well defined in literature we can check ourselves if these well known microswitches are present in our gpcrs
    #disulfide bond between TM3 and ECL2 is already identified by Uniprot in the "Disulfide bonds" section
    #Sodium binding pocket (allosteric action): middle of the 7TMs. Identified by GPCRdb but are the identified ones all of them???
    #
    #Ballesteros-Weinstein positions behind the alignment columns used below:
    #   E/DRY/W motif (ionic lock switch): 3.49, 3.50, 3.51
    #   CWxP motif (transmission toggle switch): 6.47, 6.48, 6.50
    #   NPxxY motif (tyr toggle switch): 7.49, 7.50, 7.53
    #   PIF motif: 5.50, 3.40, 6.44
    #   hydrophobic lock: 3.43, 6.40
    #   ionic lock: 6.30
    #Sodium binding pocket columns (619 -> D, 684 -> S) are known but were never part of the reported list.
    #
    #the positions are in human readable format (not python - starts at 0)
    pif_name = "PIF motif"
    e_dry_w_name = "E/DRY/W motif (ionic lock switch)"
    microswitch_types = [
        {"positions":[694,695,696],"residues":["ED", "R", "WY"], "name": e_dry_w_name},
        {"positions":[1211,1212,1214], "residues":["C", "W", "P"], "name": "CWxP motif (transmission toggle switch)"},
        {"positions":[1290,1291,1294], "residues":["N","P","Y"], "name": "NPxxY motif (tyr toggle switch)"},
        {"positions": [947,685,1204], "residues":["P","I","F"], "name": pif_name},
        {"positions":[688,1200], "residues":["LVIM", "LVIM"], "name": "Hydrophobic lock"},
        {"positions":[1190], "residues":["DE"], "name": "Ionic lock"},
        {"positions":[579],"residues":["N"], "name": "1.50 (BW numbering)"},
        {"positions":[619],"residues":["D"], "name": "2.50 (BW numbering)"},
        {"positions":[695],"residues":["R"], "name": "3.50 (BW numbering)"},
        {"positions":[751],"residues":["P"], "name": "4.50 (BW numbering)"},
        {"positions":[947],"residues":["P"], "name": "5.50 (BW numbering)"},
        {"positions":[1214],"residues":["P"], "name": "6.50 (BW numbering)"},
        {"positions":[1291],"residues":["P"], "name": "7.50 (BW numbering)"},
    ]

    record_dict = SeqIO.index(MSA, "fasta")
    try:
        aligned_seq_interest = str(record_dict[uniprot_id].seq)
    finally:
        # The index keeps the alignment file open; release it explicitly.
        record_dict.close()
    # Every row of an alignment has the alignment length, so the row itself
    # gives it without parsing the whole file a second time.
    len_MSA = len(aligned_seq_interest)

    translate_seq_MSA = map_seq_MSA(aligned_seq_interest) #gives position of a unaligned res in msa
    translate_MSA_seq = {v: k for k, v in translate_seq_MSA.items()} #gives position of a aligned res in unaligned seq

    microswitches_residues = []
    for microswitch_type in microswitch_types:
        name = microswitch_type["name"]
        for position, residue in zip(microswitch_type["positions"], microswitch_type["residues"]):
            # Conservation is judged on the motif column itself (a gap never matches).
            conserved = aligned_seq_interest[position-1] in residue

            #take into account the possibility that there is a gap at that position in the MSA:
            #walk forward to the first column that holds a residue (last column included)
            msa_position = position
            while msa_position <= len_MSA and msa_position not in translate_MSA_seq:
                msa_position += 1
            if msa_position > len_MSA:
                # Only gaps from this column to the end: nothing to annotate.
                # Previously this produced an entry without start/end and a
                # residue carried over from the preceding column.
                continue
            residue_motif = aligned_seq_interest[msa_position-1]

            microswitch_residue = {}
            microswitch_residue["start"] = translate_MSA_seq[msa_position]
            microswitch_residue["end"] = translate_MSA_seq[msa_position]

            if not conserved:
                if "(BW numbering)" in name:
                    microswitch_residue["description"] = residue_motif + " instead of " + residue + " " + name
                else:
                    # A stray " from " used to be inserted here for an F found in the PIF motif.
                    microswitch_residue["description"] = residue_motif + " instead of " + residue + " (part of " + name + ")"
            elif residue_motif == "F" and name == pif_name:
                microswitch_residue["description"] = residue_motif + " part of " + name + " and hydrophobic lock"
            elif residue_motif == "R" and name == e_dry_w_name:
                microswitch_residue["description"] = residue_motif + " part of " + name + " and ionic lock"
            elif "(BW numbering)" in name:
                microswitch_residue["description"] = residue_motif + " " + name
            else:
                microswitch_residue["description"] = residue_motif + " part of " + name

            microswitch_residue["conserved"] = "yes" if conserved else "no"
            microswitch_residue["reference"] = "Based on alignment"
            microswitches_residues.append(microswitch_residue)

    return microswitches_residues

def find_motifs_parent_chimera(cutting_points_parent,microswitches_to_look_at,cutting_points_chimera,counter):
    """Transfer a parent's motif annotations onto chimera numbering.

    Regions are strings that start with ``"<start>-<stop>"``, optionally
    followed by a space and more text. The i-th parent region is paired with
    the chimera region at index ``counter + 2*i``. Every motif whose
    ``start`` lies inside the parent region is copied and its ``start`` and
    ``end`` are shifted by the offset between the two region starts.

    Returns the list of shifted copies; the input motifs are not modified.
    """
    def region_bounds(region):
        # "<start>-<stop> ..." -> (start, stop)
        span = region.split(' ')[0].split('-')
        return int(span[0]), int(span[1])

    collected = []
    chimera_index = counter
    for region_parent in cutting_points_parent:
        chimera_start, _chimera_stop = region_bounds(cutting_points_chimera[chimera_index])
        parent_start, parent_stop = region_bounds(region_parent)
        # Signed offset from parent numbering to chimera numbering.
        shift = chimera_start - parent_start

        for position in range(parent_start, parent_stop + 1):
            for motif in microswitches_to_look_at:
                if motif["start"] != position:
                    continue
                relocated = motif.copy()
                relocated["start"] = position + shift
                relocated["end"] = position + shift
                collected.append(relocated)

        # Chimera regions alternate between the two parents.
        chimera_index += 2
    return collected

def motifs_microswitches_chimera(ref_parent_id,target_parent_id,info_specific_chimera):
    """Derive the chimera's microswitches from those of its two parents.

    For every region inherited from a parent, the motifs that parent has in
    that region are carried over to the chimera. Reference-parent regions are
    paired with the even-indexed chimera regions, target-parent regions with
    the odd-indexed ones. Reference motifs come first in the returned list.
    """
    chimera_regions = info_specific_chimera["cutting_point_chimera"]
    ref_regions = info_specific_chimera["cutting_point_ref"]
    target_regions = info_specific_chimera["cutting_point_target"]

    MSA = "../data/MSA_all_mammalian.fasta"
    # Motifs of each parent, expressed in that parent's own numbering.
    ref_motifs, target_motifs = (
        motifs_microswitches_literature(MSA, parent_id)
        for parent_id in (ref_parent_id, target_parent_id)
    )

    from_ref = find_motifs_parent_chimera(ref_regions, ref_motifs, chimera_regions, 0)
    from_target = find_motifs_parent_chimera(target_regions, target_motifs, chimera_regions, 1)
    return from_ref + from_target

In [35]:
def get_pharma_name(uniprotID,abb_name):
    """Return the GPCRdb name of a receptor, or ``abb_name`` as a fallback.

    The GPCRdb protein service is queried by accession. HTML tags are removed
    from the ``name`` field of the answer. ``abb_name`` is returned when the
    request fails, the answer is not valid JSON, or it carries no name.
    """
    #GtoP or gpcrdb_name or pharmacological name
    requestURL = f"https://gpcrdb.org/services/protein/accession/{uniprotID}"

    info_entry = None
    try:
        # A timeout keeps a stalled connection from blocking the whole batch.
        r = requests.get(requestURL, headers={ "Accept" : "application/json"}, timeout=30)
        if r.ok:
            info_entry = json.loads(r.text)
    except (requests.RequestException, ValueError):
        # Network failure or malformed JSON (JSONDecodeError is a ValueError):
        # fall back to the abbreviated name. Other errors are no longer hidden.
        info_entry = None

    if isinstance(info_entry, dict) and info_entry.get("name"):
        # NOTE(review): only tags are stripped; HTML entities in the name are
        # left untouched.
        return re.sub('<.*?>', '', info_entry["name"])
    return abb_name

def retrieve_pharma_name_parents(ref_parent_name,ref_parent_id,target_parent_name,target_parent_id):
    """Compose the pharmacological name of a chimera from its parents.

    Both parent names are fetched with ``get_pharma_name`` and stripped of
    the substring ``" receptor"``. The chimera name is
    ``"<ref> <target> receptor"``; if that contains ``"adrenoceptor"``, every
    ``" receptor"`` is removed from it.

    Returns ``(reference name, target name, chimera name)``.
    """
    parent_names = [
        get_pharma_name(ref_parent_id,ref_parent_name),
        get_pharma_name(target_parent_id,target_parent_name),
    ]
    # Removing an absent substring is a no-op, so no membership test is needed.
    pharma_name_ref, pharma_name_target = (name.replace(" receptor","") for name in parent_names)

    combined = f"{pharma_name_ref} {pharma_name_target} receptor"
    pharma_name = combined.replace(" receptor","") if "adrenoceptor" in combined else combined

    return pharma_name_ref,pharma_name_target,pharma_name

In [36]:
# Build one JSON entry per chimeric design. Designs whose JSON file already
# exists are skipped, so the cell can be re-run to fill in missing entries.
for abbreviated_name, prot_seq in entry_uniprotID_seq.items():
    print(abbreviated_name)
    json_path = f'../examples/json_entries/updated_chimeras/{abbreviated_name.upper()}.json'
    if os.path.exists(json_path):
        continue

    class_ = 'A & A' #always class A for now
    uniprotID,given_name,cutting_points,ref_parent_name,ref_parent_id,target_parent_name,target_parent_id,structures_pdb,mutations_vs_parents,gprot,biblio = retrieve_chimera_design_info(abbreviated_name)

    ref_parent_seq,target_parent_seq = retrieve_seq_parents(ref_parent_id,target_parent_id)

    #pharma name based on pharma name parents
    pharma_name_ref,pharma_name_target,pharma_name = retrieve_pharma_name_parents(ref_parent_name,ref_parent_id,target_parent_name,target_parent_id)

    species = species_uniprot(ref_parent_id,target_parent_id)

    family,subclass_ligand,subclass_phylo=familly_subclass_parents(ref_parent_name,ref_parent_id,target_parent_name,target_parent_id)

    #Secondary structure info
    TM_regions_file = "../data/TM_regions_chimeras_TMbed.txt"
    allregions = TMbed_retrieve_allregions(TM_regions_file,abbreviated_name)

    #structures
    structures, ds_bonds_noduplicates, interacting_residues_list, binders, pdbs = retrieve_pdb_dsbonds_chimera_info(structures_pdb,ref_parent_id,prot_seq)
    #AlphaFold2
    af_model_path = f"file:///examples/3Dstructures/AF_chimera/{uniprotID}.pdb"
    structures.append({"value":"AlphaFold2","chain": "A",  "state":"Undetermined", "offset":  0, "gaps": [], "url": af_model_path, "reference":"AlphaFold2"})

    #AlphaFold multistate / ESMFold. Don't have a model for every GPCR (only humans). Need to check if file exists:
    #(sub-directory of 3Dstructures, displayed value, state, reference)
    optional_models = [
        ("AFms/Active", "AlphaFold2 Active", "Active", "AlphaFold multistate"),
        ("AFms/Inactive", "AlphaFold2 Inactive", "Inactive", "AlphaFold multistate"),
        ("ESMF_chimera", "ESMFold", "Undetermined", "ESMFold"),
    ]
    for model_dir, model_value, model_state, model_reference in optional_models:
        if os.path.exists(f"../examples/3Dstructures/{model_dir}/{uniprotID}.pdb"):
            model_url = f"file:///examples/3Dstructures/{model_dir}/{uniprotID}.pdb"
            structures.append({"value":model_value,"chain": "A",  "state":model_state, "offset":  0, "gaps": [], "url": model_url, "reference":model_reference})

    #retrieve the residues interacting with ligand/Gprot/Nb/Ab in PDB and link it to region
    #Add manually extra IC and EC contacts
    #should follow the following structure: list regrouping all dictionaries with 1 dict per contact
    #in dictionary: {"start":,"end","type","description","reference"}
    #for the EC contacts the types can be "orthosteric","allosteric","VHH EC"
    #for the IC contacts the types can be "G-protein","VHH IC"
    manual_ICs,manual_ECs = translate_interacting_residues_IC_EC(interacting_residues_list,binders,pdbs,allregions)

    #chimeric info
    chimeras_ref_involved = retrieve_involvement_natural_chimeric_design(ref_parent_id,allregions)
    chimeras_target_involved = retrieve_involvement_natural_chimeric_design(target_parent_id,allregions)
    chimeras = chimeras_ref_involved + chimeras_target_involved
    #one entry per design name; the last occurrence wins
    chimeras_no_duplicates = list({d['name']: d for d in chimeras}.values())

    confo_biosensor = []

    #chimeric info specific design being studied
    #reset on every iteration: otherwise a design without a matching entry
    #would silently reuse (and modify) the dictionary of the previous design
    info_specific_chimera = None
    for d in chimeras_no_duplicates:
        if d.get("name") == abbreviated_name:
            info_specific_chimera = d.copy()
    if info_specific_chimera is None:
        raise ValueError(f"No chimeric design entry named {abbreviated_name!r} was found for its parents")
    info_specific_chimera["mutations vs parents"] = mutations_vs_parents
    info_specific_chimera["ref parent seq"] = ref_parent_seq
    info_specific_chimera["target parent seq"] = target_parent_seq
    info_specific_chimera["coloring"] =sections_coloring_chimera(info_specific_chimera,allregions)

    #microswitches based on microswitches/motifs in parents
    microswitches = motifs_microswitches_chimera(ref_parent_id,target_parent_id,info_specific_chimera)
    info = {}
    #standard needed in json chimera for DB
    info["chimera"] = True
    info["ancestors"] = [ref_parent_name,target_parent_name]

    info["info specific chimera"]=info_specific_chimera

    #Abbreviated name
    info["Abbreviated name"] = [{"value": abbreviated_name.upper(), "reference": "GPCRchimeraDB"}]

    #pharma name
    info["Pharmacological name"] = [{"value": pharma_name, "reference": "GPCRdb"}]

    #Name
    info["Name(s)"] = [{"value": given_name, "reference": "GPCRchimeraDB"}]

    #Uniprot ID
    info["Uniprot ID"] = [{"value": uniprotID, "reference": "GPCRchimeraDB"}]

    #Species
    info["Organism"] =  [{"value":species, "reference": "UniProt"}]

    #Class
    info["Class"] = [{"value":class_, "reference": "GPCRdb"}]

    #Family
    info["Family"] = [{"value": family, "reference": "GPCRdb"}]

    #Subclass
    #Phylogenetically based & Ligand based
    info["Subclass"] = {"Phylogenetically based": [{"value": subclass_phylo, "reference": "10.1124/mol.63.6.1256"}],
                        "Ligand based": [{"value":subclass_ligand, "reference": "GPCRdb"}]}

    #endogenous ligand
    info["Endogenous ligand"]= []

    #G-protein and beta-arrestin coupling data
    if len(gprot)>0:
        info["G-protein coupling"]= [{"value":gprot, "reference": biblio}]
    else:
        info["G-protein coupling"]= []
    info["Beta-arrestin coupling"] = []

    #Structures
    info["Structures"] = structures

    #Info related to chimeric design
    info["Conformational biosensor"] = confo_biosensor
    info["Involvement in chimeric design"] = chimeras_no_duplicates
    info["Cutting point values"] = write_cutting_point_region(cutting_points,ref_parent_name,target_parent_name,biblio)

    features = {}
    features['Microswitches'] = microswitches
    features['PTMs'] = []
    features['Disulfide bonds'] = ds_bonds_noduplicates
    features['Mutagenesis'] = []
    features['Pharmacological mutagenesis'] = []
    features["Contacts"] = manual_ECs +  manual_ICs
    info["Features"] = features

    #Sequence
    info["Sequence"] = [{"value":prot_seq, "reference": biblio}]

    #Secondary structure info
    info["Limits regions"] = allregions

    #Gather info that could be useful for chimeric design: for chimera put just yes chimera so that filter can be used on DB?
    known_info = [{"value": "Yes chimera"}]
    info["Known info chimeric design"] = known_info

    #serialise first: a failure here must not leave a partial file behind,
    #because the existence check above would then skip this design forever
    json_text = json.dumps(info, indent=2)
    with open(json_path, 'w') as json_file:
        json_file.write(json_text)

OPSD_BOVIN_AA2AR_HUMAN_1
OPSD_BOVIN_ADRB2_MESAU_1
OPSD_BOVIN_ADRB2_MESAU_2
OPSD_BOVIN_ACM2_HUMAN_1
OPSD_BOVIN_ACM3_HUMAN_1
OPSD_BOVIN_ACM1_HUMAN_1
OPSD_BOVIN_DRD2_HUMAN_1
OPSD_BOVIN_DRD1_HUMAN_1
OPSD_BOVIN_AA2AR_HUMAN_2
OPSD_BOVIN_FFAR3_HUMAN_1
OPSD_BOVIN_ADA1A_RAT_1
OPSD_BOVIN_ADRB2_HUMAN_1
OPSD_BOVIN_GP183_HUMAN_1
OPSD_BOVIN_GP182_HUMAN_1
OPSD_BOVIN_GP176_HUMAN_1
OPSD_BOVIN_GP174_HUMAN_1
OPSD_BOVIN_GP173_HUMAN_1
OPSD_BOVIN_GP171_HUMAN_1
OPSD_BOVIN_GP162_HUMAN_1
OPSD_BOVIN_GP161_HUMAN_1
OPSD_BOVIN_GP153_HUMAN_1
OPSD_BOVIN_GP152_HUMAN_1
OPSD_BOVIN_GP151_HUMAN_1
OPSD_BOVIN_GP150_HUMAN_1
OPSD_BOVIN_GP149_HUMAN_1
OPSD_BOVIN_GP148_HUMAN_1
OPSD_BOVIN_GP146_HUMAN_1
OPSD_BOVIN_GP142_HUMAN_1
OPSD_BOVIN_GP141_HUMAN_1
OPSD_BOVIN_GP139_HUMAN_1
OPSD_BOVIN_GP135_HUMAN_1
OPSD_BOVIN_GP132_HUMAN_1
OPSD_BOVIN_FFAR4_HUMAN_1
OPSD_BOVIN_GP119_HUMAN_1
OPSD_BOVIN_GPR88_HUMAN_1
OPSD_BOVIN_GPR87_HUMAN_1
OPSD_BOVIN_GPR85_HUMAN_1
OPSD_BOVIN_GPR84_HUMAN_1
OPSD_BOVIN_GPR83_HUMAN_1
OPSD_BOVIN_GPR82_HUMAN_1
OPSD_BO