In [None]:
import pandas as pd
import requests, sys
import json
import os
import urllib.request
import numpy as np
from collections import defaultdict
from Bio import AlignIO, SeqIO
import re
import html
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

In [2]:
# Load the local input files used throughout the notebook.

# GPCRdb classification table: UniProt ID of all proteins on GPCRdb and their classification
filename_listGPCRdb = "../data/240923_Classification_GPCRdb.xlsx"
listGPCRdb_df = pd.read_excel(filename_listGPCRdb)

# Variant table exported from https://gpcrdb.org/mutational_landscape/ (TODO: check for updates)
filename_mutagenesis = "../data/GPCRdb_variants.xlsx"
mutagenesis_GPCRdb_raw_data = pd.read_excel(filename_mutagenesis)

# G protein and beta-arrestin coupling data from https://gproteindb.org/signprot/statistics_venn
# (filtered with own code). This info is also on GtoP but not as complete.
filename_Gprot = "../data/GproteinDB_table.xlsx"
Gprot_GPCRdb = pd.read_excel(filename_Gprot)
filename_Barr = "../data/BarrDB_table.xlsx"
Barr_GPCRdb = pd.read_excel(filename_Barr)

# Chimeric design info
filename_chimeric_designs = "../data/previous_designs.xlsx"
chimeric_design_df = pd.read_excel(filename_chimeric_designs)

# All structures on GPCRdb (used to look up the GPCR conformational state).
# The cached JSON was produced once with the request below; uncomment to refresh it.
# requestURL = "https://gpcrdb.org/services/structure/"
# r = requests.get(requestURL)
#
# if not r.ok:
#   r.raise_for_status()
#   sys.exit()
#
# structures_chain = json.loads(r.text)
# json.dump(structures_chain, open("../data/structures_data.json", "w"), indent=2)
# A context manager is used so the file handle is closed deterministically.
with open("../data/structures_data.json") as structures_file:
    structures_chain = json.load(structures_file)

# All endogenous ligands on GPCRdb.
# The cached JSON was produced once with the request below; uncomment to refresh it.
# requestURL = "https://gpcrdb.org/services/ligands/endogenousligands/"
# r = requests.get(requestURL)
#
# if not r.ok:
#   r.raise_for_status()
#   sys.exit()
#
# endogenous_ligands = json.loads(r.text)
# json.dump(endogenous_ligands, open("../data/endogenous_ligands.json", "w"), indent=2)
with open("../data/endogenous_ligands.json") as ligands_file:
    endogenous_ligands = json.load(ligands_file)

In [None]:
# Selection placeholder (demonstration purposes). The filter on `of_interest`
# is currently disabled, so every record of the FASTA file is kept.
of_interest = [""]

naturals_entry_data = "../data/240923_cleaned_all_mammals_classA.fasta"

# Map every FASTA record id to its sequence stored as a plain string.
entry_uniprotID_seq = {}
for record in SeqIO.parse(naturals_entry_data, "fasta"):
    entry_uniprotID_seq.update({record.id: str(record.seq)})

print(len(entry_uniprotID_seq))
print(entry_uniprotID_seq)

8
{'Q8IYL9': 'MNSTCIEEQHDLDHYLFPIVYIFVIIVSIPANIGSLCVSFLQAKKESELGIYLFSLSLSDLLYALTLPLWIDYTWNKDNWTFSPALCKGSAFLMYMNFYSSTAFLTCIAVDRYLAVVYPLKFFFLRTRRFALMVSLSIWILETIFNAVMLWEDETVVEYCDAEKSNFTLCYDKYPLEKWQINLNLFRTCTGYAIPLVTILICNRKVYQAVRHNKATENKEKKRIIKLLVSITVTFVLCFTPFHVMLLIRCILEHAVNFEDHSNSGKRTYTMYRITVALTSLNCVADPILYCFVTETGRYDMWNILKFCTGRCNTSQRQRKRILSVSTKDTMELEVLE', 'Q8TDS4': 'MNRHHLQDHFLEIDKKNCCVFRDDFIVKVLPPVLGLEFIFGLLGNGLALWIFCFHLKSWKSSRIFLFNLAVADFLLIICLPFLMDNYVRRWDWKFGDIPCRLMLFMLAMNRQGSIIFLTVVAVDRYFRVVHPHHALNKISNRTAAIISCLLWGITIGLTVHLLKKKMPIQNGGANLCSSFSICHTFQWHEAMFLLEFFLPLGIILFCSARIIWSLRQRQMDRHAKIKRAITFIMVVAIVFVICFLPSVVVRIRIFWLLHTSGTQNCEVYRSVDLAFFITLSFTYMNSMLDPVVYYFSSPSFPNFFSTLINRCLQRKMTGEPDNNRSTSVELTGDPNKTRGAPEALMANSGEPWSPSYLGPTSP', 'P02699': 'MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMV

In [5]:
def coupling_Gprot_Barr(uniprot_id):
    """Collect the G protein and beta-arrestin coupling annotations of one receptor.

    Looks up `uniprot_id` in the module-level tables `Gprot_GPCRdb` and
    `Barr_GPCRdb` (column 'Uniprot ID'), takes the first matching row and joins
    every non-missing value found after the first column with ", ".

    Parameters:
        uniprot_id: accession compared against the 'Uniprot ID' column.

    Returns:
        (Gprot_coupling_data, Barr_coupling_data): two one-element lists, each
        holding a dict with the keys "value" (joined labels, "" when the row has
        no annotation) and "reference" (source URL).

    Raises:
        IndexError: when `uniprot_id` has no row in one of the tables.
    """
    def _coupling_entry(table, reference):
        # First row for this accession; everything after the first column is a coupling label.
        row = table[table['Uniprot ID'] == uniprot_id].iloc[0]
        # pd.isna is used instead of `is np.NaN`: the np.NaN alias no longer exists in
        # NumPy 2.0, and an identity test misses NaN values that are not the singleton.
        labels = [str(value) for value in row.iloc[1:] if not pd.isna(value)]
        return [{"value": ", ".join(labels), "reference": reference}]

    Gprot_coupling_data = _coupling_entry(Gprot_GPCRdb, "https://gproteindb.org/signprot/statistics_venn")
    Barr_coupling_data = _coupling_entry(Barr_GPCRdb, "https://arrestindb.org/signprot/arrestin_venn")

    return Gprot_coupling_data,Barr_coupling_data

In [7]:
# Consequence types whose "alternative residue" is a fixed label rather than a residue.
_FIXED_ALTERNATIVE_RESIDUE = {
    "stop gained": "termination",
    "inframe deletion": "missing",
    "frameshift": "",
}


def _clinical_significance_prediction(feature, consensus):
    """Build a predictor-less effect entry from the first clinical significance of `feature`.

    Appends "+" (tolerated) or "-" (deleterious) to `consensus` when the
    significance text is conclusive; any missing or malformed field yields
    "not indicated" and leaves `consensus` untouched.
    """
    prediction = {"predictor": "", "value": ""}
    try:
        effect = feature["clinicalSignificances"][0]["type"].lower()
        if "benign" in effect or "tolerated" in effect:
            prediction["prediction"] = "tolerated"
            consensus.append("+")
        elif "deleterious" in effect or "damaging" in effect:
            prediction["prediction"] = "deleterious"
            consensus.append("-")
        else:
            prediction["prediction"] = "not indicated"
    except Exception:
        prediction["prediction"] = "not indicated"
    return prediction


def _consensus_effect(consensus):
    """Summarise the "+"/"-" votes: unanimous votes give a verdict, anything else none."""
    votes = set(consensus)
    if votes == {"+"}:
        return "tolerated"
    if votes == {"-"}:
        return "deleterious"
    return "no consensus"


def retrieve_mutagenesis_info_Uniprot(variations_uniprot_json,uniprot_id):
    """Convert the features of a UniProt variation JSON into a list of mutation records.

    Only missense, stop gained, inframe deletion, frameshift and stop lost
    variants are kept; a feature is also skipped when its wild-type residue is
    not found in the 20 standard amino-acid letters.

    Parameters:
        variations_uniprot_json: dict with a "features" list; each feature must
            provide "begin", "end", "consequenceType" and "wildType"
            ("mutatedType" is required for missense and stop lost variants).
        uniprot_id: accession used to build the reference URL of every record.

    Returns:
        list of dicts with the keys "start", "end", "type", "original residue",
        "alternative residue", "effect(s)", "consensus effect" and "reference".

    Raises:
        KeyError / ValueError: when one of the mandatory feature fields is
        missing or "begin"/"end" cannot be converted to int.
    """
    typical_AA = "AVLIPMCFYWSTQNHKRDEG"
    mutations_Uniprot = []
    for feature in variations_uniprot_json["features"]:
        mutation = {}
        mutation["start"] = int(feature["begin"])
        mutation["end"] = int(feature["end"])
        mutation_type = feature["consequenceType"]
        mutation["type"] = mutation_type

        mutation["original residue"] = feature["wildType"]
        # NOTE(review): this is a substring test, so "" and multi-letter runs such as
        # "AV" are accepted as well - confirm whether single residues only are intended.
        if not mutation["original residue"] in typical_AA:
            continue

        predictions = []
        consensus = []
        kind = mutation_type.lower()

        if kind == "missense":
            mutation["alternative residue"] = feature["mutatedType"]
            try:
                for raw_prediction in feature["predictions"]:
                    algorithm = raw_prediction["predAlgorithmNameType"].lower()
                    score = raw_prediction["score"]
                    # Value unused, but the lookup is kept on purpose: an entry lacking
                    # this field switches the variant to the clinical-significance fallback.
                    raw_prediction["predictionValType"]
                    if algorithm == "polyphen":
                        # NOTE(review): 0.2 / 0.1 cut-offs kept as found - confirm against
                        # the PolyPhen thresholds the project intends to use.
                        if score >= 0.2:
                            label, vote = "probably damaging", "-"
                        elif score >= 0.1:
                            label, vote = "possibly damaging", "-"
                        else:
                            label, vote = "benign", "+"
                        consensus.append(vote)
                        predictions.append({"predictor": "polyphen", "value": str(score), "prediction": label})
                    elif algorithm == "sift":
                        if score <= 0.05:
                            label, vote = "deleterious", "-"
                        else:
                            label, vote = "tolerated", "+"
                        consensus.append(vote)
                        predictions.append({"predictor": "SIFT", "value": str(score), "prediction": label})
            except Exception:
                # Missing or malformed predictor data: keep whatever was already
                # collected and add one entry derived from the clinical significance.
                predictions.append(_clinical_significance_prediction(feature, consensus))

        elif kind in _FIXED_ALTERNATIVE_RESIDUE:
            mutation["alternative residue"] = _FIXED_ALTERNATIVE_RESIDUE[kind]
            predictions.append(_clinical_significance_prediction(feature, consensus))

        elif kind == "stop lost":
            mutation["alternative residue"] = feature["mutatedType"]
            predictions.append(_clinical_significance_prediction(feature, consensus))

        else: #skip if it is not a missense, frameshift, stop gained, stop lost, inframe deletion
            continue

        mutation["effect(s)"] = predictions
        mutation["consensus effect"] = _consensus_effect(consensus)
        mutation["reference"] = f"https://www.uniprot.org/uniprotkb/{uniprot_id}/variant-viewer"
        mutations_Uniprot.append(mutation)

    return mutations_Uniprot

In [8]:
def retrieve_pharmaco_info_GPCRdb(pharmaco_data_json):
    """Reshape GPCRdb mutant pharmacology records into the notebook's annotation format.

    Each input dict must provide "mutation_pos", "mutation_from", "mutation_to",
    "exp_type", "ligand_id", "ligand_name", "exp_fold_change" and "reference".
    The fold change is rounded to one decimal; a negative value is reported as
    an "increase" (with its sign dropped), any other value as a "decrease", and
    a value rounding to zero as "None".

    Returns:
        list of dicts, one per input record, in input order.
    """
    records = []
    for entry in pharmaco_data_json:
        record = {
            "start": entry["mutation_pos"],
            "end": entry["mutation_pos"],
            "original residue": entry["mutation_from"],
            "alternative residue": entry["mutation_to"],
            "studied parameter": entry["exp_type"],
        }

        ligand_id = entry["ligand_id"]
        record["ligand"] = entry["ligand_name"]
        # The shape of the identifier decides which database the link points to.
        if "CHEMBL" in ligand_id:
            link = f"https://www.ebi.ac.uk/chembl/compound_report_card/{ligand_id}/"
        elif ligand_id.isnumeric():
            link = f"https://pubchem.ncbi.nlm.nih.gov/compound/{ligand_id}"
        else:
            link = ""
        record["link ligand"] = link
        record["ligand type"] = ""

        fold = round(entry["exp_fold_change"], 1)
        direction = "decrease"
        if fold < 0:
            direction = "increase"
            fold = fold * -1

        record["effect"] = "None" if fold == 0.0 else str(fold) + " fold " + direction
        record["reference"] = entry["reference"]

        records.append(record)
    return records

In [None]:
def calculate_sq_atom_distance(i, j):
    """Return the squared Euclidean distance between the 3D points `i` and `j`.

    Only the first three components of each point are read.
    """
    delta_x = i[0] - j[0]
    delta_y = i[1] - j[1]
    delta_z = i[2] - j[2]
    return delta_x * delta_x + delta_y * delta_y + delta_z * delta_z

def identify_gaps(pdb_file, chain_pdb, offset, end): #code modified from pdb_gap.py file from pdbtools Copyright 2018 João Pedro Rodrigues
    """List the continuous residue stretches of one chain of a PDB file.

    A break is detected between two consecutive CA atoms of the same model and
    chain when they are more than 4 Angstrom apart or when their residue
    numbers are not consecutive. Despite the function name, the returned
    ranges are the stretches located between those breaks.

    Parameters:
        pdb_file: path of the PDB file to scan.
        chain_pdb: chain identifier whose breaks are kept.
        offset: residue number used as start of the first stretch.
        end: residue number used as end of the last stretch.

    Returns:
        list of [start, stop] pairs; the last pair always ends at `end`.
        Stretches with a start or stop of 1000 or more are dropped.
    """
    centroid = ' CA '  # respect spacing. 'CA  ' != ' CA '
    distance_threshold = 4.0 * 4.0  # compared against squared distances
    prev_at = (None, None, None, None, (None, None, None))
    model = 0
    gap = []
    # Context manager so the file handle is released once the scan is over.
    with open(pdb_file, 'r') as fhandle:
        for line in fhandle:

            if line.startswith('MODEL'):
                model = int(line[10:14])

            elif line.startswith('ATOM'):
                atom_name = line[12:16]
                if atom_name != centroid:
                    continue

                resn = line[17:20]
                resi = int(line[22:26])
                chain = line[21]
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])

                at_uid = (model, chain, resi, resn, atom_name, (x, y, z))
                # Only compare atoms that belong to the same model and chain.
                if prev_at[0] == at_uid[0] and prev_at[1] == at_uid[1]:
                    d = calculate_sq_atom_distance(at_uid[5], prev_at[5])
                    if d > distance_threshold:
                        gap.append([prev_at[1],prev_at[2],at_uid[1],at_uid[2]])
                    elif prev_at[2] + 1 != at_uid[2]:
                        gap.append([prev_at[1],prev_at[2],at_uid[1],at_uid[2]])

                prev_at = at_uid

    # Turn the breaks of the requested chain into [start, stop] stretches.
    gaps_cleaned = []
    start = offset
    for section in gap:
        if section[0] == chain_pdb and section[2] == chain_pdb:
            stop = section[1]
            if start < 1000 and stop < 1000:
                gaps_cleaned.append([start,stop])
            start = section[3]
    gaps_cleaned.append([start,end])
    return gaps_cleaned

In [None]:
def merge_duplicates(dicts):
    """Collapse annotation dicts that share the same 'start' value.

    Groups of one are passed through unchanged. For a larger group whose
    members all have the same 'type', a copy of the first member is kept with
    a 'description' made of "PDBePISA" followed, for every member, by the text
    before the first '.' and by the last space-separated word of its
    description; its 'reference' is set to the PISA URL. Groups with differing
    'type' values are returned separately.

    Returns:
        (result, conflicts): merged/unchanged dicts, and the dicts of groups
        whose types disagree.
    """
    by_start = defaultdict(list)
    for entry in dicts:
        by_start[entry['start']].append(entry)

    merged = []
    mismatched = []

    for group in by_start.values():
        if len(group) <= 1:
            merged.append(group[0])
            continue

        if len({entry['type'] for entry in group}) != 1:
            # Same position but different types: hand every member back untouched.
            mismatched.extend(group)
            continue

        pieces = ["PDBePISA"]
        for entry in group:
            text = entry['description']
            pieces.append(text.split('.')[0])
            pieces.append(text.split(' ')[-1])

        combined = group[0].copy()
        combined['description'] = " ".join(pieces)
        combined['reference'] = "https://www.ebi.ac.uk/pdbe/pisa/"
        merged.append(combined)

    return merged, mismatched

def retrieve_interacting_residues_PDB(pdb_id, chain_pdb,uniprot_pdb_start,pdb_start):
    """Fetch from PDBePISA the residues of a chain that are buried in an interface.

    Queries assembly 1 of `pdb_id`, then every interface of that assembly, and
    keeps the residues of `chain_pdb` whose buried surface area is > 0. Residue
    numbers are shifted by `pdb_start - uniprot_pdb_start`.
    API description: https://github.com/PDBe-KB/pdbe-pisa-json/blob/main/PISA-APIs.ipynb

    Parameters:
        pdb_id: PDB code (lower-cased for the requests).
        chain_pdb: chain identifier; a value containing "/" is split into a
            list of chains.
        uniprot_pdb_start, pdb_start: numbers whose difference is subtracted
            from every PISA residue number.

    Returns:
        (interacting_residues, chain_interacting_molecule): list of shifted
        residue numbers (possibly with repeats) and the chain id of the partner
        molecule of the last processed interface. When the lookup fails or no
        interface involves the chain, whatever was gathered so far is returned,
        i.e. possibly ([], None).
    """
    # Need to make sure it doesn't take into account the interactions between 2 sym GPCRs.
    # Both results are initialised up front: the request or the parsing can fail
    # before they are assigned, which previously ended in an UnboundLocalError.
    interacting_residues = []
    chain_interacting_molecule = None
    try: #when its just 1 chain or 1 chain and a ligand PISA doesn't work
        difference_pdb_uniprot_start = pdb_start-uniprot_pdb_start
        response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/pisa/assembly/{pdb_id.lower()}/1")
        interface_count = response.json()[pdb_id.lower()]["assembly"]["interface_count"]
        for i in range(1,interface_count+1):
            response_single_interface = requests.get(f"https://www.ebi.ac.uk/pdbe/api/pisa/interface/{pdb_id.lower()}/1/{i}/")
            data = response_single_interface.json()
            if "/" in chain_pdb:
                chain_pdb = chain_pdb.split("/")
            for j in range(len(data["molecules"])):
                if isinstance(chain_pdb,str):
                    if data["molecules"][j]["chain_id"]==chain_pdb:
                        for bsa,position in zip(data["molecules"][j]["buried_surface_areas"],data["molecules"][j]['residue_seq_ids']):
                            if bsa >0.0:
                                interacting_residues.append(int(position)-difference_pdb_uniprot_start)
                        if j == 0: #there is supposed to be only 2 molecules, the GPCR and the interacting molecule
                            chain_interacting_molecule = data["molecules"][1]["chain_id"]
                        else:
                            chain_interacting_molecule = data["molecules"][0]["chain_id"]
                elif isinstance(chain_pdb,list):
                    # Interface between the two listed chains themselves: ignore it.
                    if chain_pdb[0] in data["molecules"][j]["chain_id"] and chain_pdb[1] in data["molecules"][j+1]["chain_id"]:
                        break
                    else:
                        if chain_pdb[0] in data["molecules"][j]["chain_id"] or chain_pdb[1] in data["molecules"][j]["chain_id"]:
                            for bsa,position in zip(data["molecules"][j]["buried_surface_areas"],data["molecules"][j]['residue_seq_ids']):
                                if bsa >0.0:
                                    interacting_residues.append(int(position)-difference_pdb_uniprot_start)
                        if j == 0: #there is supposed to be only 2 molecules, the GPCR and the interacting molecule
                            chain_interacting_molecule = data["molecules"][1]["chain_id"]
                        else:
                            chain_interacting_molecule = data["molecules"][0]["chain_id"]
    except Exception:
        # Best effort: PISA has no usable answer for some entries, keep what was collected.
        pass

    return interacting_residues,chain_interacting_molecule

def extract_name_binders(chain_of_interest,pdb_file_path):
    """Return the COMPND molecule name associated with a chain of a PDB file.

    The COMPND records are read in order: the name starts on the MOLECULE line,
    is extended by following COMPND lines that do not mention CHAIN, and is
    returned as soon as a CHAIN line listing `chain_of_interest` is reached.

    Parameters:
        chain_of_interest: chain identifier to look for in the CHAIN lists.
        pdb_file_path: path of the PDB file.

    Returns:
        the molecule name, or None when no CHAIN line lists the chain.
    """
    molecule_name = None
    current_molecule = ""
    reading_molecule = False
    found_chain = False

    with open(pdb_file_path, 'r') as file:
        for line in file:
            if not line.startswith("COMPND"):
                continue

            # Start reading the molecule name if "MOLECULE" is in the line
            if "MOLECULE" in line:
                reading_molecule = True
                current_molecule = line.split(":")[1].strip().rstrip(";")  # Extract initial part of the molecule name
            # If the molecule name is being read and it continues on the next line
            elif reading_molecule and "CHAIN" not in line:
                match = re.search(r"\d+\s+(.+)", line)
                # A continuation line without the expected "<number> <text>" layout is ignored.
                if match:
                    current_molecule += " "+match.group(1).strip().rstrip(";")

            # A CHAIN line can list several chains ("CHAIN: A, B;"): compare against
            # every listed identifier instead of only the text right after "CHAIN: ".
            chain_match = re.search(r"CHAIN:\s*([^;\n]*)", line)
            if chain_match:
                listed_chains = [chain.strip() for chain in chain_match.group(1).split(",")]
                if chain_of_interest in listed_chains:
                    found_chain = True

            # If molecule and chain have been found, stop reading
            if found_chain and current_molecule and ";" in line:
                molecule_name = current_molecule
                break
    return molecule_name
 
    
def translate_interacting_residues_IC_EC(interacting_residues,binders,pdb_list,all_regions):
    """Label interacting residues as intra- or extracellular binding-pocket residues.

    Every residue is matched against the regions containing it. Loops and
    termini have a fixed side; transmembrane helices are split at their
    midpoint (TM1/3/5/7: first half extracellular, TM2/4/6: first half
    intracellular). A region whose name is not one of the known ones is
    skipped, so that a residue never inherits the side computed for a
    previous region.

    Parameters:
        interacting_residues: one list of residue numbers per structure.
        binders: binder name per structure, iterated in parallel.
        pdb_list: PDB code per structure, iterated in parallel.
        all_regions: dicts with the keys "name", "start" and "end".

    Returns:
        (intracellular, extracellular): two lists of annotation dicts, each
        passed through merge_duplicates.
    """
    extracellular_regions = ["Nterm","ECL1","ECL2","ECL3"]
    intracellular_regions = ["Cterm","ICL1","ICL2","ICL3"]
    helices_starting_extracellular = ["TM1","TM3","TM5","TM7"]
    helices_starting_intracellular = ["TM2","TM4","TM6"]

    interactions_pdb_list_EC = []
    interactions_pdb_list_IC = []
    for residues,binder,pdb_id in zip(interacting_residues,binders,pdb_list):
        for residue in residues:
            for region in all_regions:
                if not (residue >= region["start"] and residue <= region["end"]):
                    continue

                name = region["name"]
                if name in extracellular_regions:
                    region_residue = "EC"
                elif name in intracellular_regions:
                    region_residue = "IC"
                elif name in helices_starting_extracellular or name in helices_starting_intracellular:
                    # Helices are cut in two halves at their (rounded) midpoint.
                    midpoint = region["start"] + round(((region["end"]-region["start"])/2))
                    in_first_half = residue <= midpoint
                    if name in helices_starting_extracellular:
                        region_residue = "EC" if in_first_half else "IC"
                    else:
                        region_residue = "IC" if in_first_half else "EC"
                else:
                    # Unknown region name: no side can be derived for it.
                    continue

                interactions_pdb = {}
                interactions_pdb["start"]=int(residue)
                interactions_pdb["end"]=int(residue)
                if region_residue == "IC":
                    interactions_pdb["type"]="Intracellular binding pocket residue"
                else:
                    interactions_pdb["type"]="Extracellular binding pocket residue"

                interactions_pdb["description"]=f"{binder}. Inferred from {pdb_id}"
                interactions_pdb["reference"]="https://www.ebi.ac.uk/pdbe/pisa/"

                if region_residue == "IC":
                    interactions_pdb_list_IC.append(interactions_pdb)
                else:
                    interactions_pdb_list_EC.append(interactions_pdb)

    #merge duplicates
    interactions_pdb_list_IC_no_duplicates,_ = merge_duplicates(interactions_pdb_list_IC)
    interactions_pdb_list_EC_no_duplicates,_ = merge_duplicates(interactions_pdb_list_EC)

    return interactions_pdb_list_IC_no_duplicates, interactions_pdb_list_EC_no_duplicates

In [11]:
def pad_line(line):
    """Return `line` as at most 81 characters (80 columns plus the newline).

    A line shorter than 80 characters has its newlines stripped from both
    ends, is extended with spaces and terminated by a newline before being
    cut to 81 characters.
    """
    current_length = len(line)
    if current_length < 80:
        filler = ' ' * (81 - current_length)
        line = f"{line.strip(chr(10))}{filler}\n"
    return line[0:81]  # 80 + newline character

def check_first_residue_pdb_uniprot(pdb_file,chain_interest,sequence):
    """Tell whether the first resolved residue of a chain matches `sequence` at the same position.

    SIFTS identifies the first residue of the PDB entry in UniProt, but that
    residue is not necessarily the first one that is resolved. The first ATOM
    record of `chain_interest` is read and its residue compared with
    `sequence[residue number - 1]`; when they agree no renumbering is needed.

    Parameters:
        pdb_file: path of the PDB file.
        chain_interest: chain identifier (column 22 of the ATOM records).
        sequence: reference sequence, indexed with 1-based residue numbers.

    Returns:
        True when the residue matches, False otherwise (including when the
        residue number falls outside of `sequence`).

    Raises:
        ValueError: when the file has no ATOM record for `chain_interest`.
        KeyError: when the first residue is not one of the 20 standard residues.
    """
    Three_to_One_AA = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
     'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
     'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
     'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

    residue = None
    pos = None
    with open(pdb_file, 'r') as fhandle:
        for line in fhandle:
            # Padding guarantees that the fixed columns below exist on short lines.
            line = pad_line(line)
            if line.startswith("ATOM") and line[21]==chain_interest:
                residue = Three_to_One_AA[line[17:20].upper()]
                # Residue sequence number: PDB columns 23-26. The former slice [24:27]
                # dropped the leading digits of numbers with three or more digits.
                pos = int(line[22:26])
                break

    if pos is None:
        raise ValueError(f"No ATOM record found for chain {chain_interest} in {pdb_file}")

    # A number outside of the sequence cannot match (and a value <= 0 must not wrap around).
    if not 1 <= pos <= len(sequence):
        return False
    return sequence[pos-1] == residue
        
def SIFT_mapping(pdb_id,uniprot_id):
    """Look up where a PDB entry starts on its UniProt sequence (SIFTS mapping).

    Uses the first mapping returned by the PDBe API for `uniprot_id`
    (https://www.ebi.ac.uk/pdbe/api/doc/sifts.html).

    Returns:
        (uniprot_pdb_start, pdb_start): the 'unp_start' of the mapping and its
        author residue number, replaced by the plain residue number when the
        author number is empty.
    """
    payload = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}").json()
    first_mapping = payload[pdb_id.lower()]['UniProt'][uniprot_id]['mappings'][0]

    uniprot_pdb_start = first_mapping['unp_start']
    # Fall back on the sequential residue number when no author numbering is given.
    pdb_start = first_mapping['start']['author_residue_number'] or first_mapping['start']['residue_number']
    return uniprot_pdb_start, pdb_start

def renumber_pdb_uniprot_start(pdb_file,pdb_id,chain,uniprot_id,sequence):
    """Renumber a PDB file with pdb_reres.py when its numbering does not follow UniProt.

    The SIFTS start positions are fetched with SIFT_mapping. Renumbering only
    happens when the first resolved residue does not match `sequence`
    (check_first_residue_pdb_uniprot) AND the two SIFTS start positions differ.

    Returns:
        (new_pdb_file_path, uniprot_pdb_start, pdb_start). The path is always
        "../examples/3Dstructures/pdb_renumbered/<pdb_id>.pdb"; this function
        only writes to it when renumbering took place. Without renumbering
        both start positions are returned as 0.
    """
    uniprot_pdb_start, pdb_start=SIFT_mapping(pdb_id,uniprot_id)
    new_pdb_file_path = f"../examples/3Dstructures/pdb_renumbered/{pdb_id}.pdb"
    modify = False
    if not check_first_residue_pdb_uniprot(pdb_file,chain,sequence) and uniprot_pdb_start != pdb_start :
    #if both SIFT and the manually matching with the sequence doesn't work, renumber
        modify = True
    if modify:
        # IPython shell escape (notebook only): the $name tokens are replaced by the
        # Python variables and the command output is redirected to the new file.
        # NOTE(review): the values are interpolated into a shell command unescaped,
        # and the meaning of the "-$chain" argument for pdb_reres.py should be confirmed.
        !python pdb_reres.py -$uniprot_pdb_start -$chain $pdb_file > $new_pdb_file_path
    else:
        # No renumbering: report a zero shift to the caller.
        uniprot_pdb_start, pdb_start = 0, 0
    return new_pdb_file_path, uniprot_pdb_start, pdb_start

In [None]:
def retrieve_pdb_dsbonds_interactions(uniprot_json,uniprot_id,sequence):
    """Gather structures, disulfide bonds and interface residues for the PDB entries of a protein.

    For every PDB cross-reference of `uniprot_json` covering more than 200
    residues, the PDB file is downloaded, renumbered when needed
    (renumber_pdb_uniprot_start), scanned for its offset, continuous stretches
    (identify_gaps) and SSBOND records, and PDBePISA is queried for the
    residues in contact with a partner molecule. Shorter entries are logged in
    "to_look_at_pdbs.txt". Any failure while processing an entry skips the
    rest of the work for that entry.

    Parameters:
        uniprot_json: UniProt entry with a 'uniProtKBCrossReferences' list.
        uniprot_id: accession used for the SIFTS lookup.
        sequence: protein sequence used to validate the PDB numbering.

    Returns:
        (structures, ds_bonds, interacting_residues_list, binders_list, pdbs):
        structure records, the disulfide bonds present once in every collected
        structure, one list of interacting residues per processed entry, the
        binder names that could be resolved, and the processed PDB codes.
    """
    structures = []
    ds_bonds = []
    pdbs = []
    chains = []
    uniprot_pdb_starts = []
    pdb_starts = []
    interacting_residues_list = []
    binders_list= []
    for cross_reference in uniprot_json['uniProtKBCrossReferences']:
        if cross_reference['database'] != 'PDB':
            continue
        pdb_id = cross_reference['id']
        chains_value = cross_reference["properties"][2]["value"]

        # A "," separates several fragments; their lengths are added up.
        length_chain = 0
        for section in chains_value.split(","):
            range_chain=section.split("=")[1].split("-")
            length_chain += int(range_chain[1])-int(range_chain[0])

        if length_chain > 200:
            #download pdb file
            try:
                pdb_file_path = f'../data/tmp/{pdb_id}.pdb'
                urllib.request.urlretrieve(f'https://files.rcsb.org/download/{pdb_id}.pdb', pdb_file_path)

                #get conformational state structure => provided by GPCRdb
                state = "Undetermined" #default
                for structure in structures_chain:
                    if structure["pdb_code"] == pdb_id:
                        state = structure["state"]
                        break
                chain_pdb =  chains_value[0]
                full_chain_pdb = chains_value.split("=")[0]

                #get SIFTS mapping PDB seq > UniProt mapping and renumber pdb. New pdb being written
                new_pdb_file_path, uniprot_pdb_start, pdb_start=renumber_pdb_uniprot_start(pdb_file_path,pdb_id,chain_pdb,uniprot_id,sequence)
                if os.path.exists(new_pdb_file_path):
                    path_DB_pdb = f"file:///examples/3Dstructures/pdb_renumbered/{pdb_id}.pdb"
                else:
                    path_DB_pdb = f"https://files.rcsb.org/download/{pdb_id}.pdb"
                    new_pdb_file_path = pdb_file_path
                #this is needed to find the interactions within the pdb file
                pdbs.append(pdb_id)
                chains.append(full_chain_pdb)
                uniprot_pdb_starts.append(uniprot_pdb_start)
                pdb_starts.append(pdb_start)

                #get offset and gaps in structure
                offset = 0
                start_found = False
                with open(new_pdb_file_path, 'r') as f:
                    for line in f:
                        line_list = line.split()
                        if not start_found:
                            if line_list[0]=="ATOM" and line_list[4]==chain_pdb:
                                if int(line_list[5]) >= 1:
                                    offset = int(line_list[5]) - 1 #line_list[5] gives the 1st position so the offset is line_list[5] -1
                                    start_found = True
                        else:
                            if line_list[0]=="ATOM" and line_list[4]==chain_pdb:
                                end = int(line_list[5])

                gaps = identify_gaps (new_pdb_file_path, chain_pdb, offset+1, end)
                structures.append({"value":pdb_id,"chain": chain_pdb, "state":state, "offset":  offset, "gaps": gaps, "url":path_DB_pdb, "reference":f"https://www.rcsb.org/structure/{pdb_id.upper()}"})

                #Look for disulfide bridges in PDB files
                with open(new_pdb_file_path, 'r') as f:
                    for line in f:
                        line_list = line.split()
                        if line_list[0]=="SSBOND":
                            if line_list[3] == chain_pdb and line_list[6] == chain_pdb:
                                ssbond = {}
                                ssbond["start"] = int(line_list[4])
                                ssbond["end"]= int(line_list[7])
                                ssbond["description"] = "Disulfide bond"
                                ssbond["reference"] = "Extracted from PDB files"
                                ds_bonds.append(ssbond)

                #find interacting residues at ligand binding site and G protein binding site
                interacting_residues,binder_chain = retrieve_interacting_residues_PDB(pdb_id,chain_pdb,uniprot_pdb_start,pdb_start)
                interacting_residues_list.append(list(set(interacting_residues)))
                binder = extract_name_binders(binder_chain,pdb_file_path)
                if binder is not None:
                    binders_list.append(binder)

                #remove pdb file
                os.remove(pdb_file_path)

            except Exception:
                # Best effort: an entry that cannot be processed is skipped.
                continue
        else:
            # The with block closes the file; no explicit close() is needed.
            with open("to_look_at_pdbs.txt","a") as f:
                f.write(pdb_id+"\n")

    #Keep only the DS bonds that are in every PDB file (sometimes add DS bond to stabilize structure to be crystalized)
    # The filtered list is built in one pass: removing items from a list while
    # looping over it skips the element that follows each removal.
    unique_bonds = [dict(items) for items in dict.fromkeys(tuple(d.items()) for d in ds_bonds)]
    ds_bonds_noduplicates = [bond for bond in unique_bonds if ds_bonds.count(bond) == len(structures)]

    return structures, ds_bonds_noduplicates, interacting_residues_list, binders_list, pdbs

In [13]:
#Equivalence between positions in the ungapped sequence and columns of the MSA.
#Returns a dictionary: key = 1-based position in the ungapped sequence, value = 1-based column of that residue in the MSA.
def map_seq_MSA(sequence_aligned):
    """Map residue numbers of a sequence onto the columns of its aligned form.

    Args:
        sequence_aligned: aligned sequence string in which "-" marks a gap.

    Returns:
        dict whose keys are 1-based positions in the ungapped sequence and
        whose values are the 1-based alignment columns holding those residues.
    """
    mapping = {}
    residue_number = 0
    for column, symbol in enumerate(sequence_aligned, start=1):
        if symbol == "-":
            # Gap columns do not consume a residue number.
            continue
        residue_number += 1
        mapping[residue_number] = column
    return mapping

#Microswitches/motifs - identify them based on their defined columns in mammalian MSA
def motifs_microswitches_literature(MSA,uniprot_id):
    """Check literature-defined class A motifs/microswitches for one receptor.

    Every motif is described by fixed 1-based columns of the MSA and the
    residues accepted at each column. For each column the residue of the
    receptor of interest is looked up and reported as conserved or not.

    Args:
        MSA: path to the multiple sequence alignment in FASTA format.
        uniprot_id: record id of the receptor of interest inside the MSA.

    Returns:
        list of dicts with keys "start", "end" (1-based position in the
        ungapped sequence), "description", "conserved" ("yes"/"no") and
        "reference". When the receptor has a gap at a motif column, the next
        residue after the gap is reported instead; if no residue follows the
        gap, that column is skipped.

    Raises:
        KeyError: if uniprot_id is not a record of the MSA.
        IndexError: if the alignment is shorter than a motif column.
    """
    e_dry_w_name = "E/DRY/W motif (ionic lock switch)"
    pif_name = "PIF motif"

    # Columns are in human-readable (1-based) numbering; the trailing comments
    # give the corresponding generic positions noted by the original author.
    # A sodium binding pocket (columns 619 and 684, residues D and S) was also
    # defined originally but was never part of the checked list.
    microswitch_types = [
        {"positions":[694,695,696],"residues":["ED", "R", "WY"], "name": e_dry_w_name},  # 3.49, 3.50, 3.51
        {"positions":[1211,1212,1214], "residues":["C", "W", "P"], "name": "CWxP motif (transmission toggle switch)"},  # 6.47, 6.48, 6.50
        {"positions":[1290,1291,1294], "residues":["N","P","Y"], "name": "NPxxY motif (tyr toggle switch)"},  # 7.49, 7.50, 7.53
        {"positions": [947,685,1204], "residues":["P","I","F"], "name": pif_name},  # 5.50, 3.40, 6.44
        {"positions":[688,1200], "residues":["LVIM", "LVIM"], "name": "Hydrophobic lock"},  # 3.43, 6.40
        {"positions":[1190], "residues":["DE"], "name": "Ionic lock"},  # 6.30
        {"positions":[579],"residues":["N"], "name": "1.50 (BW numbering)"},
        {"positions":[619],"residues":["D"], "name": "2.50 (BW numbering)"},
        {"positions":[695],"residues":["R"], "name": "3.50 (BW numbering)"},
        {"positions":[751],"residues":["P"], "name": "4.50 (BW numbering)"},
        {"positions":[947],"residues":["P"], "name": "5.50 (BW numbering)"},
        {"positions":[1214],"residues":["P"], "name": "6.50 (BW numbering)"},
        {"positions":[1291],"residues":["P"], "name": "7.50 (BW numbering)"},
    ]

    # Close the alignment file once its length has been read.
    with open(MSA) as handle:
        len_MSA = AlignIO.read(handle, "fasta").get_alignment_length()

    # SeqIO.index keeps the file open; release it as soon as the record is read.
    record_dict = SeqIO.index(MSA, "fasta")
    try:
        aligned_seq_interest = str(record_dict[uniprot_id].seq)
    finally:
        record_dict.close()

    translate_seq_MSA = map_seq_MSA(aligned_seq_interest) #gives position of a unaligned res in msa
    translate_MSA_seq = {v: k for k, v in translate_seq_MSA.items()} #gives position of a aligned res in unaligned seq

    microswitches_residues = []

    for microswitch_type in microswitch_types:
        name = microswitch_type["name"]
        for position, residue in zip(microswitch_type["positions"], microswitch_type["residues"]):
            # Conservation is judged on the motif column itself, so a gap there
            # always counts as "not conserved".
            is_conserved = aligned_seq_interest[position-1] in residue

            # Take into account a possible gap at that column: fall back to the
            # first residue found after it. Columns are 1-based, so the scan
            # has to include column len_MSA.
            msa_column = None
            for candidate in range(position, len_MSA+1):
                if candidate in translate_MSA_seq:
                    msa_column = candidate
                    break
            if msa_column is None:
                # No residue at or after this column: nothing can be reported.
                continue

            residue_motif = aligned_seq_interest[msa_column-1]
            microswitch_residue = {
                "start": translate_MSA_seq[msa_column],
                "end": translate_MSA_seq[msa_column],
            }

            if not is_conserved:
                if residue_motif == "F" and name == pif_name:
                    # NOTE(review): the stray " from " looks like a leftover; it is
                    # kept so the emitted text is unchanged - confirm and clean up.
                    microswitch_residue["description"] = residue_motif + " instead of "+ residue+ " from " + " (part of " + name+ ")"
                elif "(BW numbering)" in name and not (residue_motif == "R" and name == e_dry_w_name):
                    microswitch_residue["description"] = residue_motif+ " instead of "+ residue + " " + name
                else:
                    microswitch_residue["description"] = residue_motif + " instead of "+ residue + " (part of " + name+ ")"
            else:
                if residue_motif == "F" and name == pif_name:
                    microswitch_residue["description"] = residue_motif + " part of " + name +" and hydrophobic lock"
                elif residue_motif == "R" and name == e_dry_w_name:
                    microswitch_residue["description"] = residue_motif+ " part of " + name +" and ionic lock"
                elif "(BW numbering)" in name:
                    microswitch_residue["description"] = residue_motif+ " " + name
                else:
                    microswitch_residue["description"] = residue_motif+ " part of " + name

            microswitch_residue["conserved"] = "yes" if is_conserved else "no"
            microswitch_residue["reference"] = "Based on alignment"
            microswitches_residues.append(microswitch_residue)

    return microswitches_residues

In [None]:
#features from Uniprot binding site, PTM, natural variants
def features_uniprot(uniprot_json,uniprot_id):
    """Extract ligand binding sites and PTMs from a UniProt entry.

    Args:
        uniprot_json: parsed UniProt JSON entry; its 'features' list is read.
        uniprot_id: accession used to build the reference URL.

    Returns:
        (binding_sites, PTMs): two lists of dicts. Binding sites carry
        'start', 'end', 'type', 'description' (ligand name) and 'reference';
        PTMs carry 'start', 'end', 'description' and 'reference'.
    """
    entry_url = f"https://www.uniprot.org/uniprotkb/{uniprot_id}/entry"

    # Keyword found in a 'Modified residue' description -> reported PTM name.
    # The order matters: the first keyword that matches wins.
    modified_residue_keywords = (
        ("phospho", "Phosphorylation"),
        ("methyl", "Methylation"),
        ("acetyl", "Acetylation"),
        ("amid", "Amidation"),
        ("pyrro", "Pyrrolidone carboxylic acid"),
        ("hydroxy", "Hydroxylation"),
        ("l-", "Isomerization"),
        ("d-", "Isomerization"),
        ("sulf", "Sulfation"),
        ("nitro", "Nitrosylation"),
    )

    ligand_sites = []
    modifications = []

    for feature in uniprot_json['features']:
        kind = feature['type']

        #Binding site (orthosteric & allosteric)
        if kind == 'Binding site':
            ligand_sites.append({
                'start': feature['location']['start']['value'],
                'end': feature['location']['end']['value'],
                'type': "Ligand binding residue",
                'description': feature['ligand']['name'],
                'reference': entry_url,
            })
            continue

        ##PTMs
        if kind == 'Glycosylation' or kind == 'Lipidation':
            # These two feature types are reported under their own name.
            label = kind
        elif kind == 'Modified residue':
            # Fall back to the raw UniProt text when no keyword matches.
            label = feature['description']
            lowered = label.lower()
            for keyword, full_name in modified_residue_keywords:
                if keyword in lowered:
                    label = full_name
                    break
        else:
            continue

        modifications.append({
            'start': feature['location']['start']['value'],
            'end': feature['location']['end']['value'],
            'description': label,
            'reference': entry_url,
        })

    return ligand_sites, modifications

In [None]:
#retrieve PTMs on Scop3P and compare with PTMs we have already found
def retrieve_PTM_Scop3P(uniprot_id):
    """Fetch the modification sites listed by Scop3P for one accession.

    This is a best-effort lookup: any request or parsing failure, a non-OK
    response, or a payload without a "modifications" list yields an empty
    list instead of an exception.

    Args:
        uniprot_id: UniProt accession to query.

    Returns:
        list of dicts with keys 'start', 'end' (both the reported position),
        'description' (always 'Phosphorylation') and 'reference' (the Scop3P
        page of this accession).
    """
    requestURL = f"https://iomics.ugent.be/scop3p/api/modifications?accession={uniprot_id}"
    try:
        r = requests.get(requestURL)
        if not r.ok:
            return []
        scop3P_PTM = json.loads(r.text)
    except Exception:
        # Network errors and malformed JSON are both treated as "no data".
        return []

    if not isinstance(scop3P_PTM, dict):
        return []

    # The accession must be interpolated into the link (this was a plain
    # string before, so the literal "{uniprot_id}" ended up in the output).
    reference = f"https://iomics.ugent.be/scop3p/index?protein={uniprot_id}"

    ptms = []
    for ptm in scop3P_PTM.get("modifications") or []:
        position = ptm['position']
        ptms.append({'start': position , 'end': position, 'description': 'Phosphorylation', 'reference': reference})
    return ptms

In [None]:
#find TM regions based on TMbed predictions
#find the TM regions
def TMbed_retrieve_allregions(TM_regions_file,uniprotID):
    """Split one protein into N-term, TM helices, loops and C-term from a TMbed file.

    The file is read as consecutive three-line records: a header starting
    with ">", then a second line, then the per-residue prediction string.
    Only the third line of the matching record is parsed; "H" and "h" mark
    helix residues. Blank lines are ignored.

    Args:
        TM_regions_file: path to the TMbed prediction file.
        uniprotID: record to extract. It is compared with the second
            "|"-separated field of the header, or with the whole header
            (without ">") when the header contains no "|".

    Returns:
        list of dicts with keys "name", "start", "end" (1-based, inclusive)
        and "reference", in sequence order. Empty if the record is not found.

    Raises:
        IndexError: if more helix/loop segments are found than the 15 region
            names available.
    """
    region_names = ['Nterm','TM1', 'ICL1', 'TM2', 'ECL1', 'TM3', 'ICL2', 'TM4', 'ECL2', 'TM5', 'ICL3', 'TM6', 'ECL3', 'TM7','Cterm']
    tmbed_paper = "https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-022-04873-x"
    regions = []

    with open(TM_regions_file, 'r') as f:
        count = 0
        name = ""
        for raw_line in f:
            line = raw_line.replace('\n',"").replace('\t'," ").split()
            if not line:
                # A blank line carries no record data (it used to raise IndexError).
                continue
            count += 1
            if ">" in line[0]:
                try:
                    name = line[0].split('|')[1]
                except IndexError:
                    # Header without "|" separators: use it whole, minus the ">".
                    name = line[0][1:]
                count = 1
            if name == uniprotID and (count%3==0):
                counter_region = 0
                in_helix = False
                previous = 1  # first residue of the segment currently open
                for idx, element in enumerate(line[0]):
                    is_helix = element == "H" or element == "h"
                    if not in_helix and is_helix:
                        # A helix starts: close the non-helical segment before it.
                        in_helix = True
                        lower_lim_TM = idx+1 #human readable limits
                        regions.append({
                            "name": region_names[counter_region],
                            "start": previous,
                            "end": lower_lim_TM-1,
                            "reference": "TMbed",
                        })
                        counter_region += 1
                    elif in_helix and not is_helix:
                        # The helix ended on the previous residue.
                        upper_lim_TM = idx #human readable limits
                        in_helix = False
                        regions.append({
                            "name": region_names[counter_region],
                            "start": lower_lim_TM,
                            "end": upper_lim_TM,
                            "reference": tmbed_paper,
                        })
                        counter_region += 1
                        previous = upper_lim_TM+1
                # Whatever follows the last helix is the final region.
                regions.append({
                    "name": region_names[counter_region],
                    "start": previous,
                    "end": len(line[0]),
                    "reference": tmbed_paper,
                })
                break

    return regions

In [None]:
def cutting_pts_2_ss_region(dict_regions,all_regions=None):
    """Render cutting points as "start-end" labels, optionally with region names.

    Args:
        dict_regions: either a dict whose values are [start, end] pairs, or a
            list of [start, end] pairs.
        all_regions: list of region dicts with "name", "start" and "end".
            Required for the dict form; optional for the list form.

    Returns:
        list of strings such as "1-40 (Nterm-TM1)", or plain "1-40" when a
        list is given without regions. Anything that is neither a dict nor a
        list produces an empty list.
    """
    labels = []

    if isinstance(dict_regions, dict):
        for bounds in dict_regions.values():
            first = bounds[0]
            last = bounds[1]
            # No early exit here: when several regions contain a limit, the
            # last one listed wins.
            for candidate in all_regions:
                if candidate["start"] <= first <= candidate["end"]:
                    first_name = candidate["name"]
                if candidate["start"] <= last <= candidate["end"]:
                    last_name = candidate["name"]
            labels.append("-".join((str(first), str(last))) + f" ({first_name}-{last_name})")
        return labels

    if not isinstance(dict_regions, list):
        return labels

    if not all_regions:
        # Without region information only the numeric span is reported.
        for bounds in dict_regions:
            labels.append("-".join((str(bounds[0]), str(bounds[1]))))
        return labels

    for section in dict_regions:
        # Flat sequence: position, region name, position, region name, ...
        annotated = []
        for pos in section:
            for candidate in all_regions:
                if candidate["start"] <= int(pos) <= candidate["end"]:
                    region_name = candidate["name"]
                    break
            annotated.extend((pos, region_name))
        labels.append("-".join((str(annotated[0]), str(annotated[2]))) + f" ({annotated[1]}-{annotated[3]})")
    return labels

def _third_of_region(position, lower_lim, upper_lim):
    """Return 0, 1 or 2 for the first, middle or last third of [lower_lim, upper_lim]."""
    a_third = round((upper_lim-lower_lim)/3)
    if position < (lower_lim + a_third):
        return 0
    if position < (lower_lim + 2*a_third):
        return 1
    return 2


def often_used_cutting_pts(cutting_points_parent,chimera,sequence,all_regions_cutting_pts,related_chimeras,dict_regions):
    """Record where a chimera cuts its parent and which third of the region is hit.

    Each entry of chimera[cutting_points_parent] looks like
    "start-end (RegionA-RegionB)". A start other than 1 and an end other than
    len(sequence) are treated as cutting points. Both accumulators are
    updated in place and returned.

    Args:
        cutting_points_parent: key of the chimera dict holding the labels.
        chimera: design dict; chimera["name"] is also read.
        sequence: parent sequence, used for the residue letter and its length.
        all_regions_cutting_pts: dict region name -> list of [label, third],
            where label is residue letter + position (e.g. "F25") and third
            is 0, 1 or 2.
        related_chimeras: dict label -> list of chimera names.
        dict_regions: list of region dicts with "name", "start" and "end".

    Returns:
        (all_regions_cutting_pts, related_chimeras)

    Raises:
        IndexError: if a region name is missing from dict_regions or a
            position lies outside the sequence.
    """
    for region in chimera[cutting_points_parent]:
        positions = region.split(" ")[0]
        name = region.split(" ")[1][1:-1]
        start_text = positions.split("-")[0]
        end_text = positions.split("-")[1]

        # Sequence ends are not cutting points, only internal boundaries are.
        boundaries = []
        if start_text != "1":
            boundaries.append((int(start_text), name.split("-")[0]))
        if end_text != str(len(sequence)):
            boundaries.append((int(end_text), name.split("-")[1]))

        for position, name_region in boundaries:
            ss_region = [d for d in dict_regions if d.get("name") == name_region][0]
            # Both boundaries now use the same thirds; the end boundary used
            # to add the region start twice, which shifted its thresholds.
            idx = _third_of_region(position, ss_region["start"], ss_region["end"])
            label = sequence[position-1]+str(position)
            all_regions_cutting_pts.setdefault(name_region, []).append([label, idx])
            related_chimeras.setdefault(label, []).append(chimera["name"])
    return all_regions_cutting_pts,related_chimeras


def retrieve_involvement_natural_chimeric_design(uniprot_id,abb_name,sequence,chimeric_design_df):
    """Collect the chimeric designs that list a receptor as reference or target parent.

    Rows of chimeric_design_df whose 'Reference_id' or 'Target_id' equals
    uniprot_id are turned into one summary dict each, and the cutting points
    used on this receptor are accumulated per region.

    Args:
        uniprot_id: accession matched against 'Reference_id' and 'Target_id'.
        abb_name: compared with the first two "_"-separated fields of each
            chimera name to decide whether the reference or the target
            cutting points are accumulated.
        sequence: sequence passed on to often_used_cutting_pts.
        chimeric_design_df: DataFrame with the columns read below
            ('Chimera_name', 'Chimera_parts', 'Reference_cutting_points', ...).

    Returns:
        (involvement, all_regions_cutting_pts_all, related_chimeras_all):
        the list of design dicts, a dict region name -> unique [label, third]
        pairs, and a dict label -> unique chimera names.
    """

    involvement = []

    all_regions_cutting_pts_all = {}
    related_chimeras_all = {}
    # The same extraction runs twice: once for designs using the receptor as
    # reference and once as target. A row matching both columns is therefore
    # appended twice.
    for parent_column_id in ['Reference_id','Target_id']:

        #find rows that have uniprot as ref id or target id
        designs_parent = chimeric_design_df[chimeric_design_df[parent_column_id] == uniprot_id]

        #Info from rows
        names_chimeras = designs_parent['Chimera_name'].tolist()
        ids_chimeras = designs_parent['Chimera_name_ids'].tolist()
        regions_chimera = designs_parent['Chimera_parts'].tolist()
        name_target_chimeras = designs_parent['Target_name'].tolist()
        id_target_chimeras = designs_parent['Target_id'].tolist()

        name_ref_chimeras = designs_parent['Reference_name'].tolist()
        id_ref_chimeras = designs_parent['Reference_id'].tolist()

        regions_ref_chimeras = designs_parent['Reference_cutting_points'].tolist()
        regions_target_chimeras = designs_parent['Target_cutting_points'].tolist()

        # NOTE(review): `expression` is read but not used below.
        expression = designs_parent['Expression binary'].tolist()
        fct = designs_parent['Function binary'].tolist()

        application = designs_parent['Application'].tolist()
        type_chimera = designs_parent['Chimera Type (1/2/3)'].tolist()
        Gprot = designs_parent['G-protein'].tolist()
        Ligand = designs_parent['Ligand'].tolist()
        structures = designs_parent['3D structure PDB'].tolist()
        biblio = designs_parent['DOI'].tolist()

        for i,name in enumerate(names_chimeras):

            # SECURITY NOTE(review): eval() executes whatever text is stored in
            # the spreadsheet cells. If these cells only hold Python literals
            # (presumably dicts/lists of positions - confirm), ast.literal_eval
            # would be the safe equivalent.
            all_regions = TMbed_retrieve_allregions("../data/TM_regions_chimeras_TMbed.txt",name)
            cutting_pt_chimera = cutting_pts_2_ss_region(eval(regions_chimera[i]),all_regions) 

            all_regions_ref = TMbed_retrieve_allregions("../data/Uniprot_TMbed.txt",id_ref_chimeras[i])
            cutting_pt_ref = cutting_pts_2_ss_region(eval(regions_ref_chimeras[i]),all_regions_ref)

            all_regions_target = TMbed_retrieve_allregions("../data/Uniprot_TMbed.txt",id_target_chimeras[i])
            cutting_pt_target = cutting_pts_2_ss_region(eval(regions_target_chimeras[i]),all_regions_target)

            # One web request per parent and per design (see get_pharma_name).
            pharma_name_ref = html.unescape(get_pharma_name(id_ref_chimeras[i],name_ref_chimeras[i]))
            pharma_name_target = html.unescape(get_pharma_name(id_target_chimeras[i], name_target_chimeras[i]))

            # Build "<ref> <target> receptor" without repeating "receptor".
            # The membership test is case-insensitive but the replacement only
            # removes the lower-case " receptor".
            pharma_name_ref_ = pharma_name_ref
            pharma_name_target_ = pharma_name_target
            if "receptor" in pharma_name_ref.lower():
                pharma_name_ref_ = pharma_name_ref.replace(" receptor","")
            if "receptor" in pharma_name_target.lower():
                pharma_name_target_ = pharma_name_target.replace(" receptor","")
            pharma_name = pharma_name_ref_ + " " + pharma_name_target_ + " receptor"
            if "adrenoceptor" in pharma_name:
                pharma_name = pharma_name.replace(" receptor","")

            # Non-string cells (presumably empty cells read as NaN - confirm)
            # are replaced by an empty string.
            if isinstance(structures[i],str):
                pdb = structures[i]
            else:
                pdb = ""

            if isinstance(Gprot[i],str):
                gprot=Gprot[i]
            else:
                gprot=""

            if isinstance(Ligand[i],str):
                ligand=Ligand[i]
            else:
                ligand=""

            # NOTE(review): unlike the three fields above, application[i] is
            # concatenated without a string check; a non-string cell would
            # raise TypeError here.
            chimera={
            "name":name,
            "name_pharma":pharma_name,
            "id":ids_chimeras[i],
            "ref": name_ref_chimeras[i],
            "ref_pharma_name": pharma_name_ref,
            "target": name_target_chimeras[i],
            "target_pharma_name":pharma_name_target,
            "cutting_point_chimera": cutting_pt_chimera,
            "cutting_point_ref": cutting_pt_ref,
            "cutting_point_target": cutting_pt_target,
            "expression_function": fct[i],
            "type":type_chimera[i],
            "GprotLigand": gprot+" "+ligand,
            "application": application[i]+" "+pdb,
            "reference": biblio[i]
            }

            involvement.append(chimera)

            # The chimera name is expected to start with the reference parent's
            # abbreviated name; if it matches abb_name the receptor is the
            # reference parent, otherwise its target cutting points are used.
            if "_".join(name.split("_")[:2]) == abb_name:
                all_regions_cutting_pts_all,related_chimeras_all=often_used_cutting_pts("cutting_point_ref",chimera,sequence,all_regions_cutting_pts_all,related_chimeras_all,all_regions_ref)
            else: 
                all_regions_cutting_pts_all,related_chimeras_all=often_used_cutting_pts("cutting_point_target",chimera,sequence,all_regions_cutting_pts_all,related_chimeras_all,all_regions_target)                
    
    # Drop repeated [label, third] pairs (order is not preserved by set()).
    for key,value in all_regions_cutting_pts_all.items():
        unique_items = list(map(list, set(map(tuple, value))))
        all_regions_cutting_pts_all[key] = unique_items

    # Drop repeated chimera names per cutting point.
    for key,value in related_chimeras_all.items():
        unique_items = list(set(value))
        related_chimeras_all[key] = unique_items

    return involvement,all_regions_cutting_pts_all,related_chimeras_all

In [19]:
def get_pharma_name(uniprotID,abb_name):
    """Return the GPCRdb name of a protein with HTML tags stripped.

    This is a best-effort lookup: when the request fails, the response is
    not OK, the body is not valid JSON, or the entry has no textual "name",
    abb_name is returned unchanged.

    Args:
        uniprotID: accession queried on the GPCRdb protein service.
        abb_name: fallback name.

    Returns:
        str: the tag-free GPCRdb name (HTML entities are left as they are),
        or abb_name.
    """
    requestURL = f"https://gpcrdb.org/services/protein/accession/{uniprotID}"
    try:
        r = requests.get(requestURL, headers={ "Accept" : "application/json"})
        info_entry = json.loads(r.text) if r.ok else None
    except Exception:
        # Keep the lookup best-effort, but no longer swallow
        # KeyboardInterrupt/SystemExit as the bare except did.
        info_entry = None

    if not isinstance(info_entry, dict) or not isinstance(info_entry.get("name"), str):
        return abb_name

    # Non-greedy match so each tag is removed on its own.
    return re.sub('<.*?>', '', info_entry["name"])

In [20]:
def remove_duplicates(list_dictionaries,uniprot_id, descriminator1, descriminator2=None):
    """Keep the first dict for every distinct key and relabel repeated ones.

    Two dicts are duplicates when they agree on descriminator1 (and on
    descriminator2 when it is given). For a key seen more than once, the
    kept dict gets its "reference" overwritten in place: the Scop3P page of
    uniprot_id when two discriminators are used, the PISA URL otherwise.

    Args:
        list_dictionaries: list of dicts to filter.
        uniprot_id: accession inserted into the Scop3P reference.
        descriminator1: first key used to identify an entry.
        descriminator2: optional second key.

    Returns:
        list with the first occurrence of each entry, in input order.
    """
    two_keys = descriminator2 is not None

    def entry_key(entry):
        # With one discriminator the bare value is the key, not a 1-tuple.
        if two_keys:
            return (entry[descriminator1], entry[descriminator2])
        return entry[descriminator1]

    # Pass 1: how many times does each key occur?
    occurrences = {}
    for entry in list_dictionaries:
        key = entry_key(entry)
        occurrences[key] = occurrences.get(key, 0) + 1

    # Pass 2: keep first occurrences, fixing the reference of repeated keys so
    # that it does not depend on which source happened to come first.
    already_kept = set()
    kept = []
    for entry in list_dictionaries:
        key = entry_key(entry)
        if key in already_kept:
            continue
        if occurrences[key] > 1:
            if two_keys:
                entry["reference"] = f"https://iomics.ugent.be/scop3p/index?protein={uniprot_id}"
            else:
                entry["reference"] = "https://www.ebi.ac.uk/pdbe/pisa/"
        already_kept.add(key)
        kept.append(entry)
    return kept

In [None]:
def _cutting_point_counts(region, chimeras, parent_name, parent_pts, abbreviated_name, functionality):
    """Count, per residue of `region`, how often the selected designs cut there.

    Returns (counts, nb_designs): one counter per residue of the region and
    the number of designs that passed the functionality/parent filter.
    """
    counts = [0] * (region["end"]-region["start"]+1)
    nb_designs = 0
    for design in chimeras:
        status = design["expression_function"].lower()
        is_functional = status == "yes" or status == "not assessed"
        if is_functional != functionality:
            continue
        if design[parent_name] != abbreviated_name:
            continue
        for cutting_pt in design[parent_pts]:
            # Labels look like "start-end (RegionA-RegionB)".
            span = cutting_pt.split(" ")[0]
            names = cutting_pt.split(" ")[1]
            if names.split("-")[0][1:] == region["name"]:
                counts[int(span.split("-")[0]) - region["start"]] += 1
            if names.split("-")[1][:-1] == region["name"]:
                counts[int(span.split("-")[1]) - region["start"]] += 1
        nb_designs += 1
    return counts, nb_designs


def heatmap_cutting_pts(pharma_name,abbreviated_name,chimeras,allregions):
    """Save heatmaps of cutting-point frequency per region of one receptor.

    Four PNG files are written to ../examples/heatmap_cutting_pts/ (created
    if missing): functional and non-functional designs, each with the
    receptor as reference ("EC") and as target ("IC") parent. A design counts
    as functional when its "expression_function" is "yes" or "not assessed"
    (case-insensitive). Each figure has one column per region; a cell shows
    the fraction of the selected designs that cut at that residue.

    Args:
        pharma_name: display name used in the figure title (HTML entities
            are unescaped).
        abbreviated_name: value matched against design["ref"] or
            design["target"]; also used in the file names.
        chimeras: list of design dicts as built by
            retrieve_involvement_natural_chimeric_design.
        allregions: list of region dicts with "name", "start" and "end".

    Returns:
        None. Nothing is drawn when allregions is empty.
    """
    # Number of heatmaps
    num_heatmaps = len(allregions)
    if num_heatmaps == 0:
        # No region means no axis to draw on.
        return

    # Create a custom colormap starting with white and transitioning to red
    colors = ["white", "#ffcccc", "#ff9999", "#ff6666", "#ff3333", "#ff0000", "#cc0000", "#990000", "#660000", "#330000"]
    cmap_reds_white = LinearSegmentedColormap.from_list("RedsWhite", colors)

    output_dir = "../examples/heatmap_cutting_pts"
    os.makedirs(output_dir, exist_ok=True)

    for functionality in [True,False]:
        for parent_nameDB,parent_name,parent_pts in zip(["EC","IC"],["ref","target"],["cutting_point_ref","cutting_point_target"]):
            heatmaps_data = []
            nb_designs = 0
            for region in allregions:
                counts, nb_designs = _cutting_point_counts(region, chimeras, parent_name, parent_pts, abbreviated_name, functionality)
                frequencies = np.array(counts, dtype=float)
                if nb_designs:
                    frequencies = frequencies / nb_designs
                # With no matching design the column stays at zero (white)
                # instead of becoming NaN through a division by zero.
                heatmaps_data.append(frequencies.reshape(-1, 1))

            # Create a figure with a grid of subplots arranged horizontally.
            # squeeze=False keeps `axes` two-dimensional even for one region.
            fig, axes = plt.subplots(1, num_heatmaps, figsize=(10,5), squeeze=False)
            if functionality:
                fig.suptitle(f"Functional Designs ({nb_designs}) with {html.unescape(pharma_name)} ({abbreviated_name}) as {parent_nameDB} side parent",fontsize=12)
            else:
                fig.suptitle(f"Non-functional Designs ({nb_designs}) with {html.unescape(pharma_name)} ({abbreviated_name}) as {parent_nameDB} side parent",fontsize=12)

            # Plot each heatmap
            for i, ax in enumerate(axes[0]):
                cax = ax.matshow(heatmaps_data[i], aspect='auto', cmap=cmap_reds_white, vmin=0, vmax=1)  # Ensure vmin and vmax are set to 0 and 1
                ax.yaxis.set_ticks_position('left')
                ax.yaxis.set_tick_params(labelleft=True)

                # Major ticks every 5 elements, labelled with the residue number
                major_tick_positions = range(0, heatmaps_data[i].shape[0], 5)
                major_tick_labels = np.arange(allregions[i]["start"], allregions[i]["end"] + 1)[::5]
                ax.set_yticks(major_tick_positions)
                ax.set_yticklabels(major_tick_labels, fontsize=6)

                # Minor ticks at every element
                minor_tick_positions = range(heatmaps_data[i].shape[0])
                ax.set_yticks(minor_tick_positions, minor=True)
                ax.set_title(allregions[i]["name"],fontsize=10)
                ax.set_xticks([])  # Hide x-axis ticks

            # Adjust layout to make room for colorbar
            fig.subplots_adjust(right=0.85, wspace=1.2)

            # Add color bar completely to the right (all panels share vmin/vmax,
            # so the last image can drive it)
            cbar_ax = fig.add_axes([0.87, 0.15, 0.02, 0.7])
            cbar = fig.colorbar(cax, cax=cbar_ax, orientation='vertical')

            # Add label to the color bar
            cbar.set_label('Frequency of designs with these cutting points', fontsize=10)

            if functionality:
                fig.savefig(f"{output_dir}/{abbreviated_name}_{parent_nameDB}_functional.png",dpi=200)
            else:
                fig.savefig(f"{output_dir}/{abbreviated_name}_{parent_nameDB}_non_functional.png",dpi=200)

            plt.close(fig)
    return

In [22]:
#convert scientific name UniProt to common name
def parse_species_file(file_path):
    """Build a scientific-name -> common-name lookup from a species listing.

    The file is scanned line by line. A line containing ``N=`` supplies a
    scientific name (the text after the marker, stripped). The next line
    containing ``C=`` supplies the matching common name. A scientific name
    that is followed by another ``N=`` line before any ``C=`` line is
    dropped, and ``C=`` lines with no pending scientific name are ignored.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    dict
        Maps each scientific name to its common name.
    """
    name_lookup = {}
    pending_scientific = None

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            if "N=" in raw_line:
                # New entry: remember its scientific name until a common name shows up.
                pending_scientific = raw_line.split("N=")[1].strip()
                continue
            if pending_scientific and "C=" in raw_line:
                name_lookup[pending_scientific] = raw_line.split("C=")[1].strip()
                # Only the first common name after a scientific name is recorded.
                pending_scientific = None

    return name_lookup

# Text file that parse_species_file scans for "N=" (scientific name) and
# "C=" (common name) entries.
file_path = '../data/UniProt_names_scientific_common.txt' 
# Lookup table scientific name -> common name; read by the entry-building cell
# below to fill the "Organism" field.
species_dict = parse_species_file(file_path)

In [None]:
# Build one JSON entry per receptor in `entry_uniprotID_seq` (iterated as
# UniProt accession -> protein sequence). Each entry combines the UniProt
# record, GPCRdb / EBI web services and the local data files loaded earlier.
# Receptors that already have a JSON file under all_mammals_latest/ are skipped.
for uniprotID, prot_seq in entry_uniprotID_seq.items():
    requestURL = f"https://rest.uniprot.org/uniprotkb/{uniprotID}.json"
    r = requests.get(requestURL, headers={ "Accept" : "application/json"})

    # The UniProt record is mandatory for everything below. raise_for_status()
    # raises for exactly the responses where `r.ok` is False, so no separate
    # check (or unreachable sys.exit()) is needed.
    r.raise_for_status()

    uniprot_json = json.loads(r.text)

    class_ = 'A' #always class A for now
    abbreviated_name = uniprot_json["uniProtkbId"]
    names = []
    names.append({"value":uniprot_json['proteinDescription']['recommendedName']['fullName']['value'], "reference":"UniProt"})
    for alt_name in uniprot_json['proteinDescription'].get('alternativeNames', []):
        try:
            names.append({"value":alt_name['fullName']['value'],"reference":"UniProt"})
        except (KeyError, TypeError):
            # Malformed alternative-name record: skip it but keep the others.
            continue

    # NOTE(review): the existence check looks in all_mammals_latest/ while the
    # entry is written to updated_naturals/ at the end of this block -- confirm
    # that this asymmetry is intended.
    # To force regeneration of every entry, replace the condition below with `if True:`.
    if not os.path.exists(f'../examples/json_entries/all_mammals_latest/{abbreviated_name.upper()}.json'):
        print(uniprotID)
        species_scientific = uniprot_json['organism']['scientificName']
        # Fall back to the scientific name when no common name is known, so that
        # `species` is always defined and never carried over from the previous
        # receptor in the loop.
        species = species_dict.get(species_scientific, species_scientific)

        #find classfication based on human classification in GPCRdb
        #find human ortholog
        family = ""
        subclass_ligand = ""
        subclass_phylo = ""
        try:
            # Default to the receptor's own GPCRdb row. This is reset on every
            # iteration so a human "A-other" receptor can no longer inherit the
            # ID looked up for the previous receptor.
            uniprot_id_human = uniprotID
            own_phylo_class = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprotID]["Phylogenetically-based"].values[0]
            if own_phylo_class == "A-other" and not abbreviated_name.endswith("HUMAN"):
                # Non-human "A-other" receptors borrow the classification of their human ortholog.
                abbreviated_name_human = (abbreviated_name.split('_')[0]+"_"+"HUMAN").lower()
                uniprot_id_human = listGPCRdb_df[listGPCRdb_df['Name'] == abbreviated_name_human]['Uniprot ID'].values[0]
            family = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprot_id_human]["Subclass"].values[0].rstrip() #need to change this to interpro API for those not on GPCRdb
            if "Class A" in family:
                family = family.replace("Class A ","")
                family = family[0].upper()+family[1:]
            if "receptors" in family:
                family = family.replace("receptors","").rstrip()
            elif "receptor" in family:
                family = family.replace("receptor","").rstrip()
            subclass_ligand = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprot_id_human]["Ligand-based"].values[0].rstrip() #need to change so it works for all mammals (put same as what we have for humans? What with those not on GPCRdb?)
            if "receptors" in subclass_ligand:
                subclass_ligand = subclass_ligand.replace("receptors","").rstrip()
            elif "receptor" in subclass_ligand:
                subclass_ligand = subclass_ligand.replace("receptor","").rstrip()
            subclass_phylo = listGPCRdb_df[listGPCRdb_df['Uniprot ID'] == uniprot_id_human]["Phylogenetically-based"].values[0]
            if "A-" in subclass_phylo:
                subclass_phylo = subclass_phylo.split('-')[1].rstrip()
                subclass_phylo = subclass_phylo[0].upper()+subclass_phylo[1:]
        except Exception:
            # Best effort: any lookup failure (receptor missing from the GPCRdb
            # table, empty cell, ...) lands here. Exception, not a bare except,
            # so Ctrl-C still interrupts the run.
            if any("olfactory" in entry["value"].lower() for entry in names): #many of the olfactory humans are not in GPCRdb so no classification. We took the olf human ones from GROSS
                family = "Olfactory"
                subclass_ligand = "Olfactory"
                subclass_phylo = "Olfactory"
            else:
                print("Problem with classification (family,subfamily...)")

        #GtoP or gpcrdb_name or pharmacological name
        try:
            requestURL = f"https://gpcrdb.org/services/protein/accession/{uniprotID}"

            r = requests.get(requestURL, headers={ "Accept" : "application/json"})
            info_entry = json.loads(r.text) if r.ok else None
        except Exception:
            # Network or parsing problem: fall back to the UniProt name below.
            info_entry = None

        if info_entry is not None:
            # GPCRdb names carry HTML markup; strip the tags.
            clean_html_tags = re.compile('<.*?>')
            pharma_name = re.sub(clean_html_tags, '', info_entry["name"])
        else:
            pharma_name = abbreviated_name
        
        #Gprot and Barr coupling data from GPCRdb
        try:
            Gprot_coupling_data,Barr_coupling_data=coupling_Gprot_Barr(uniprotID)
        except Exception:
            # No coupling data could be retrieved for this receptor.
            Gprot_coupling_data,Barr_coupling_data=[],[]
        
        #Variations Uniprot
        requestURL = f"https://www.ebi.ac.uk/proteins/api/variation/{uniprotID}"

        r = requests.get(requestURL, headers={ "Accept" : "application/json"})
        if not r.ok:
            variations_uniprot_json = None
        else:
            variations_uniprot_json = json.loads(r.text)

        #Pharmacological info from GPCRdb mutants (API service)
        requestURL = f"https://gpcrdb.org/services/mutants/{abbreviated_name.lower()}"
        try:
            r = requests.get(requestURL, headers={ "Accept" : "application/json"})
            # Only parse successful responses, like the other service calls in
            # this cell; an error payload must not be mistaken for mutant data.
            pharmaco_data_json = json.loads(r.text) if r.ok else None
        except Exception:
            pharmaco_data_json = None

        #name endogenous ligands
        ligands = []
        for value in endogenous_ligands:
            if value["receptor"] == abbreviated_name.lower():
                ligands.append({"value":  value["ligand_name"].replace("&", "").replace(";", "").replace("<sup>","").replace("</sup>","").replace("<sub>","").replace("</sub>",""), "reference": "GPCRdb"})

        # Drop repeated ligand names, keeping the first occurrence and the original
        # order. set.add() returns None, so `not seen_values.add(...)` is always
        # True and only serves to record the value.
        seen_values = set()
        ligands = [item for item in ligands if item["value"] not in seen_values and not seen_values.add(item["value"])]
    
        MSA = "../data/MSA_all_mammalian.fasta"
        microswitches_literature = motifs_microswitches_literature(MSA,uniprotID)

        #Retrieve the mutagenesis information from Uniprot related to the entry
        if variations_uniprot_json:
            mutations_Uniprot = retrieve_mutagenesis_info_Uniprot(variations_uniprot_json,uniprotID)
        else:
            mutations_Uniprot = []

        #Pharmacological data - mutants info from GPCRdb API
        if pharmaco_data_json:
            pharmaco = retrieve_pharmaco_info_GPCRdb(pharmaco_data_json)
        else:
            pharmaco = []

        #If no conformational biosensor just put confo_biosensor = []
        confo_biosensor = []

        #find chimeras this parent is involved in
        #find the cutting points used for this parent in all designs that we know so we can learn form that
        chimeras,all_regions_cutting_pts,related_chimeras=retrieve_involvement_natural_chimeric_design(uniprotID,abbreviated_name,prot_seq,chimeric_design_df)
        
        cutting_point_values = [] #leave empty if it's a natural

        #get structures: pdb id, chain, state and offset
        #From the PDB (exp structures)
        structures, ds_bonds,interacting_residues, binders, PDBs = retrieve_pdb_dsbonds_interactions(uniprot_json,uniprotID,prot_seq)
        #AlphaFold2
        structures.append({"value":"AlphaFold2","chain": "A",  "state":"Undetermined", "offset":  0, "gaps": [], "url": f"https://alphafold.ebi.ac.uk/files/AF-{uniprotID}-F1-model_v4.pdb", "reference":"AFDB"})
        
        #AlphaFold multistate. Don't have a AF ms for every GPCR (only humans). Need to check if file exist:
        af_ms_active = f"../examples/3Dstructures/AFms/Active/{uniprotID}.pdb"
        af_ms_inactive = f"../examples/3Dstructures/AFms/Inactive/{uniprotID}.pdb"
        if os.path.exists(af_ms_active):
            af_ms_active = f"file:///examples/3Dstructures/AFms/Active/{uniprotID}.pdb"
            structures.append({"value":"AlphaFold2 Active","chain": "A",  "state":"Active", "offset":  0, "gaps": [], "url": af_ms_active, "reference":"AlphaFold multistate"})
        if os.path.exists(af_ms_inactive):
            af_ms_inactive = f"file:///examples/3Dstructures/AFms/Inactive/{uniprotID}.pdb"
            structures.append({"value":"AlphaFold2 Inactive","chain": "A",  "state":"Inactive", "offset":  0, "gaps": [], "url": af_ms_inactive, "reference":"AlphaFold multistate"})

        #retrieve all limits of the secondary structure elements
        TM_regions_file = "../data/Uniprot_TMbed.txt"
        allregions = TMbed_retrieve_allregions(TM_regions_file,uniprotID)

        #retrieve the residues interacting with ligand/Gprot/Nb/Ab in PDB and link it to region
        #Add manually extra IC and EC contacts
        #should follow the following structure: list regrouping all dictionaries with 1 dict per contact
        #in dictionary: {"start":,"end","type","description","reference"}
        #for the EC contacts the types can be "orthosteric","allosteric","VHH EC"
        #for the IC contacts the types can be "G-protein","VHH IC"
        manual_ICs,manual_ECs = translate_interacting_residues_IC_EC(interacting_residues,binders,PDBs,allregions)

        #Features from UniProt: PTMs and ligand binding sites
        features = {}
        ligand_BS_uniprot,PTMs_uniprot = features_uniprot(uniprot_json,uniprotID)

        #Scop3P phosphorylations
        PTMs_scop3P=retrieve_PTM_Scop3P(uniprotID)

        info = {}

        #Abbreviated name
        info["Abbreviated name"] = [{"value": abbreviated_name.upper(), "reference": "UniProt"}]

        #pharma name
        info["Pharmacological name"] = [{"value": pharma_name, "reference": "GPCRdb"}]

        #Name
        info["Name(s)"] = names

        #Uniprot ID
        info["Uniprot ID"] = [{"value": uniprotID, "reference": "UniProt"}]

        #Species
        info["Organism"] =  [{"value":species, "reference": "UniProt"}]

        #Class
        info["Class"] = [{"value":class_, "reference": "GPCRdb"}]

        #Family
        info["Family"] = [{"value": family, "reference": "GPCRdb"}]

        #Subclass
        #Phylogenetically based & Ligand based
        info["Subclass"] = {"Phylogenetically based": [{"value": subclass_phylo, "reference": "10.1124/mol.63.6.1256"}],
                            "Ligand based": [{"value":subclass_ligand, "reference": "GPCRdb"}]}

        #Endogenous ligand 
        info["Endogenous ligand"]=ligands

        #G-protein and beta-arrestin coupling data
        info["G-protein coupling"]=Gprot_coupling_data
        info["Beta-arrestin coupling"] = Barr_coupling_data

        #Structures
        info["Structures"] = structures

        #Info related to chimeric design
        info["Conformational biosensor"] = confo_biosensor
        info["Involvement in chimeric design"] = chimeras
        info["Cutting point values"] = cutting_point_values
        info["Known cutting points and designs"] = {"Known cutting points":all_regions_cutting_pts,"Known designs":related_chimeras}

        features['Microswitches'] = microswitches_literature

        #remove duplicate PTMs, keep Scop3P ref
        PTMs_tot = PTMs_uniprot + PTMs_scop3P
        PTMs_tot_unique = remove_duplicates(PTMs_tot,uniprotID,"start",descriminator2="description")

        features['PTMs'] = PTMs_tot_unique
        features['Disulfide bonds'] = ds_bonds
        features['Mutagenesis'] = mutations_Uniprot
        features['Pharmacological mutagenesis'] = pharmaco

        #remove duplicate contacts, keep uniprot
        Contacts_EC_tot = manual_ECs + ligand_BS_uniprot
        Contacts_EC_unique = remove_duplicates(Contacts_EC_tot,uniprotID,"start")

        features["Contacts"] = Contacts_EC_unique +  manual_ICs

        info["Features"] = features

        #Sequence
        info["Sequence"] = [{"value":prot_seq, "reference": "UniProt"}]

        #Secondary structure info
        info["Limits regions"] = allregions

        #Gather info that could be useful for chimeric design
        known_info = []
        if len(confo_biosensor) > 0:
            known_info.append({"value": "Confo biosensor"})
        if len(chimeras) > 0:
            known_info.append({"value": "Parent chimera"})
        info["Known info chimeric design"] = known_info

        # Write through a context manager so the file is flushed and closed
        # before the next receptor is processed.
        with open(f'../examples/json_entries/updated_naturals/{abbreviated_name.upper()}.json', 'w') as entry_file:
            json.dump(info, entry_file, indent=2)

        # heatmap_cutting_pts(pharma_name,abbreviated_name,chimeras,allregions)

Q8IYL9
Q8TDS4
P02699
P11617
P30542
P04274
P07550
P20309
