This is an interactive notebook for assigning the GAIN Generic Residue Numbering Scheme to an unknown protein. Please enter the UniProtKB accession ID below.

In [None]:
# Initialize modules and functions.
import requests, os, tempfile, re, json
import nglview as nv
# local
import sse_func
from gain_classes import ExtractedGain
import template_finder

# Absolute path to the GESAMT executable; handed to template_finder.assign_indexing below.
# NOTE(review): machine-specific path - adjust to the local CCP4 installation.
gesamt_bin = "/home/hildilab/lib/xtal/ccp4-8.0/bin/gesamt"

def get_ca_indices(pdbfile, offset=0):
    """Map residue numbers to the atom serial numbers of their CA atoms.

    Only ATOM records whose atom-name field (characters 13-14, zero-based)
    reads "CA" are considered.

    Parameters:
        pdbfile: path of the PDB file to read.
        offset:  value subtracted from every atom serial number.

    Returns:
        dict {residue number: CA atom serial - offset}.
    """
    ca_indices = {}
    # The context manager closes the handle; the original left it open.
    with open(pdbfile) as handle:
        for line in handle:
            if line.startswith("ATOM") and line[13:15] == "CA":
                ca_indices[int(line[22:26])] = int(line[6:11]) - offset
    return ca_indices

def get_pdb_offset(pdbfile):
    """Return the atom serial number of the first ATOM record in a PDB file.

    Parameters:
        pdbfile: path of the PDB file to read.

    Returns:
        int, the serial number following the first "ATOM" record keyword.

    Raises:
        ValueError: if the file contains no ATOM record.
    """
    with open(pdbfile) as handle:
        content = handle.read()
    # Raw string avoids invalid-escape warnings; anchoring at line start keeps
    # free text such as "REMARK ... ATOM 12" from being mistaken for a record.
    match = re.search(r"^ATOM\s+(\d+)", content, flags=re.MULTILINE)
    if match is None:
        raise ValueError(f"No ATOM record found in {pdbfile}")
    return int(match.group(1))

def retrieve_uniprot(accession:str):
    """Query the UniProtKB REST search endpoint for an accession.

    Parameters:
        accession: UniProtKB accession used as the search query.

    Returns:
        The "results" entry of the decoded JSON response, or None when the
        request fails or does not return HTTP status 200.
    """
    url = f"https://rest.uniprot.org/uniprotkb/search?query=accession:{accession}&format=json"
    try:
        # Without a timeout an unresponsive server blocks the notebook forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # Connection problems take the same "failed -> None" path as a bad status.
        print(f"Failed to retrieve protein information. ({err})")
        return None

    # HTTP status code 200 means success.
    if response.status_code == 200:
        # Decode via json(); only "results" is of interest.
        return response.json()["results"]

    print("Failed to retrieve protein information.")
    return None

def extract_gain_end(uniprot_accession:str, uniprot_info:list):
    """Extract the GPS end residue from a UniProtKB search result.

    Takes the decoded "results" list of the UniProt API call and looks for a
    feature whose description is "GPS" (case-insensitive).
    This might in the future be updated to use GAIN domain boundaries, should
    UniProtKB provide them.

    Parameters:
        uniprot_accession: accession used to pick the right entry when the
                           result list holds more than one.
        uniprot_info:      list of entry dictionaries.

    Returns:
        (end_res, truncated_sequence, protein_name). truncated_sequence is the
        sequence up to and including residue end_res. end_res and
        truncated_sequence are None when no GPS feature is present.

    Raises:
        ValueError: if uniprot_info is empty, or holds several entries none of
                    which carries the requested accession.
    """
    if not uniprot_info:
        raise ValueError(f"No UniProtKB entries available for accession {uniprot_accession}.")

    if len(uniprot_info) > 1:
        # More than one match: select by accession. More of a failsafe.
        matches = [entry for entry in uniprot_info
                   if entry.get('primaryAccession') == uniprot_accession]
        if not matches:
            # Previously this fell through with target_info unbound.
            raise ValueError(f"None of the {len(uniprot_info)} UniProtKB entries has the primary accession {uniprot_accession}.")
        target_info = matches[-1]  # the original loop kept the last match
    else:
        target_info = uniprot_info[0]

    try:
        protein_name = target_info['proteinDescription']['recommendedName']['fullName']['value']
    except (KeyError, TypeError):
        # Entries without a recommended name get a placeholder.
        protein_name = 'unnamed_protein'

    # Parse the entry features and find the "GPS domain" (which is of course not a domain...)
    if 'features' not in target_info:
        print("FEATURES entry not found in target UniProt entry. Continuing to manual GPS/GAIN boundary specification.")
        return None, None, protein_name

    for feature in target_info["features"]:
        if "description" in feature and feature["description"].upper() == "GPS":
            end_res = feature["location"]["end"]["value"]
            print(f"[NOTE] Found GPS entry in the UniProtKB accession entry ending at residue {end_res}")
            # The string is zero-indexed, so [:end_res] includes residue end_res.
            truncated_sequence = target_info["sequence"]["value"][:end_res]
            return end_res, truncated_sequence, protein_name

    print("No GPS entry found. Continuing to manual GPS/GAIN boundary specification.")
    return None, None, protein_name

def get_alphafolddb_model(uniprot_accession, tmp_dir):
    """Download the AlphaFoldDB model and its predicted-aligned-error file.

    Writes {tmp_dir}/AF-{accession}-F1.pdb and {tmp_dir}/AF-{accession}-F1.json
    using curl.

    Parameters:
        uniprot_accession: UniProtKB accession of the target protein.
        tmp_dir:           directory receiving the two downloaded files.

    Raises:
        FileNotFoundError: if the downloaded model file is empty or its first
                           line contains "Error".
    """
    # Local import keeps this block self-contained.
    import subprocess

    alphafold_id = f"AF-{uniprot_accession}-F1"
    database_version = "v4"
    model_url = f'https://alphafold.ebi.ac.uk/files/{alphafold_id}-model_{database_version}.pdb'
    error_url = f'https://alphafold.ebi.ac.uk/files/{alphafold_id}-predicted_aligned_error_{database_version}.json'
    model_path = f'{tmp_dir}/{alphafold_id}.pdb'
    error_path = f'{tmp_dir}/{alphafold_id}.json'

    # This creates a PDB and a JSON file for the target protein.
    # An argument list (no shell) keeps a malformed accession from being
    # interpreted as shell syntax; the exit status is ignored as before.
    for url, target in ((model_url, model_path), (error_url, error_path)):
        subprocess.run(["curl", url, "-o", target], check=False)
    print(f"[NOTE] Done retrieving the model for {uniprot_accession} with the corresponding AlphaFoldDB accession {alphafold_id}.")

    with open(model_path) as handle:
        first_line = handle.readline()
    if not first_line or "Error" in first_line:
        # Show the whole line (the original printed only its first character).
        print(first_line)
        raise FileNotFoundError(f"The Model is invalid. Target URL = {model_url}")

def truncate_pdb(pdbfile:str, start:int, end:int):
    """Write a copy of a PDB file restricted to residues start..end (inclusive).

    ATOM records outside the residue range are dropped. Every TER record is
    rewritten so that it follows the last kept atom (serial + 1, same residue
    name / chain / residue number). All other lines are copied unchanged.

    Parameters:
        pdbfile: path of the input PDB file.
        start:   first residue number to keep.
        end:     last residue number to keep.

    Returns:
        Path of the written file (".pdb" replaced by "_trunc.pdb").
    """
    with open(pdbfile) as handle:
        pdblines = handle.readlines()

    newlines = []
    for line in pdblines:
        if line.startswith("ATOM") and not start <= int(line[22:26]) <= end:
            continue
        if line.startswith("TER"):
            prev_info = newlines[-1] if newlines else ""
            if not prev_info.startswith(("ATOM", "HETATM")):
                # No kept atom precedes this TER, so there is nothing to terminate.
                continue
            # The serial number occupies characters 6-10 (zero-based). The former
            # [7:12] slice lost the leading digit of serials >= 10000.
            ter_serial = int(prev_info[6:11]) + 1
            # Pad the record to 80 columns.
            newlines.append(f"TER   {str(ter_serial).rjust(5)}      {prev_info[17:26]}" + " " * 54 + "\n")
            continue
        newlines.append(line)

    truncated_path = pdbfile.replace(".pdb","_trunc.pdb")
    with open(truncated_path, 'w') as handle:
        handle.write("".join(newlines))
    print(f"[NOTE] Truncated PDB to residues {start}-{end}.")
    return truncated_path

def truncate_stride_dict(stride_dict:dict, start:int, end:int):
    """Keep only the secondary-structure elements that overlap a residue window.

    An element (first, last, ...) is kept when it begins before `end` and
    finishes after `start` (both comparisons strict). Every key of the input
    is preserved, possibly with an empty list.

    Returns:
        A new dict with the filtered lists; the input is not modified.
    """
    return {
        sse_type: [element for element in elements
                   if element[1] > start and element[0] < end]
        for sse_type, elements in stride_dict.items()
    }

def get_seq(uniprot_info, uniprot_accession, c_end=None):
    """Return the sequence of the UniProtKB entry, optionally C-terminally cut.

    Parameters:
        uniprot_info:      list of entry dictionaries (UniProt "results").
        uniprot_accession: accession used to pick the right entry when the
                           list holds more than one.
        c_end:             if given, the sequence is cut after this many
                           residues (residue c_end is included).

    Returns:
        The (possibly truncated) amino-acid sequence string.

    Raises:
        ValueError: if uniprot_info is empty, or holds several entries none of
                    which carries the requested accession.
    """
    if not uniprot_info:
        raise ValueError(f"No UniProtKB entries available for accession {uniprot_accession}.")

    if len(uniprot_info) > 1:
        # More than one match: select by accession. More of a failsafe.
        matches = [entry for entry in uniprot_info
                   if entry.get('primaryAccession') == uniprot_accession]
        if not matches:
            # Previously this fell through with target_info unbound.
            raise ValueError(f"None of the {len(uniprot_info)} UniProtKB entries has the primary accession {uniprot_accession}.")
        target_info = matches[-1]  # the original loop kept the last match
    else:
        target_info = uniprot_info[0]

    sequence = target_info["sequence"]["value"]
    if c_end is not None:
        return sequence[:c_end]  # zero-indexed string, so residue c_end is included

    return sequence

In [None]:
uniprot_accession = "A1Z7G7"

### Here are some examples of distant GAIN domains
##Invertebrates
#   "Q8SZ78" D.melanogaster mayo/CG11318
#   "G5EDW2" C.elegans lat-1
#   "A1Z7G7" D.melanogaster Cirl
##PKD GAIN domains
#   "Q8R526" M.musculus PKD1L1
#   "H2LRU7" O.latipes PKD2
##OTHER
#   "B3SDA6" Trichoplax adhaerens Protein kinase domain-containing protein (NEGATIVE CONTROL)
#   "Q8CJ12" M.musculus ADGRG2 (already in dataset)

maxres = 360 # The maximum number of residues in a model to be evaluated. Increase this if you suspect the GAIN to be larger than that.

uniprot_info = retrieve_uniprot(uniprot_accession) # This is a list of dictionaries of the results
# `not uniprot_info` covers both a failed request (None) and an empty result
# list, which would otherwise only fail later with an IndexError.
if not uniprot_info:
    raise NameError("The provided UniProtKB accession did not yield any results. Please check your accession number.")

gps_end, gps_end_sequence, protein_name = extract_gain_end(uniprot_accession, uniprot_info)
# The evaluated window cannot be longer than the sequence up to the GPS end.
if gps_end is not None and gps_end < maxres:
    maxres = gps_end

# All intermediate files of this session live in one temporary directory.
tmp_folder = tempfile.TemporaryDirectory(dir = '../')
tmp_dir = tmp_folder.name
with open(f'{tmp_dir}/uniprotkb_{uniprot_accession}.json', 'w') as uniprot_json:
    json.dump(uniprot_info, uniprot_json)

get_alphafolddb_model(uniprot_accession, tmp_dir)
pdbfile = f'{tmp_dir}/AF-{uniprot_accession}-F1.pdb'
jsonfile = f'{tmp_dir}/AF-{uniprot_accession}-F1.json'

# Show the full model so that gps_end can be determined manually if needed.
view = nv.show_file(pdbfile)
view


Display the Information about this GAIN domain in NGLviewer: Subdomain A is represented as blue, Subdomain B as orange. The respective boundaries are labeled.

In [None]:
if gps_end is None:
    print("You need to manually set the C-terminal GAIN domain boundary. Please look into the structure and find the residue matching the GAIN domain end. You can likely find it by looking directly N-terminal of the seven-transmembrane domain, if present.")
    gps_end = int(input("Manually set the C-terminal GAIN end: "))
    print("Set gps_end to", gps_end)
    gps_end_sequence = get_seq(uniprot_info, uniprot_accession, c_end=gps_end)
    # Apply the same clamp that is used when gps_end comes from UniProtKB;
    # otherwise gps_end-maxres goes negative and the slice below wraps around.
    if gps_end < maxres:
        maxres = gps_end

truncated_sequence = gps_end_sequence[gps_end-maxres:] # the last maxres residues up to gps_end (zero-indexed string!)

# Run STRIDE for evaluating the secondary structure items.
stride_bin = '/home/hildilab/lib/stride/stride'
os.system(f"{stride_bin} {pdbfile} -f{tmp_dir}/AF-{uniprot_accession}-F1.stride") # This will be a stride analysis of the WHOLE protein.
raw_stride_file = f"{tmp_dir}/AF-{uniprot_accession}-F1.stride"
stride_file = f"{tmp_dir}/AF-{uniprot_accession}-F1.outliers.stride"
sse_func.detect_outliers(raw_stride_file, stride_file, sigmas=2)
complete_sse_dict = sse_func.read_sse_loc(stride_file)

# Cut the complete_sse_dict down to only include entries before the detected end residue and within the maximum residue number.
truncated_sse_dict = truncate_stride_dict(complete_sse_dict, start=gps_end-maxres, end=gps_end)

# Find the GAIN domain start and subdomain boundary, if applicable
gain_start, gain_subdomain_boundary = sse_func.find_boundaries(truncated_sse_dict, seq_len=gps_end, bracket_size=30, domain_threshold=15, coil_weight=0.08, truncate_N=3)
truncated_pdbfile = truncate_pdb(pdbfile, start=gain_start, end=gps_end) # keeps residues gain_start..gps_end, both included
# gps_end_sequence has gps_end residues, so this negative index equals gain_start-1,
# i.e. the slice starts at residue gain_start.
gain_sequence = gps_end_sequence[gain_start-gps_end-1:]

In [None]:
# Mark the three boundary residues (labelled hyperballs) and colour the two
# subdomains: gain_start..boundary in blue, boundary..gps_end in orange.
view = nv.show_file(truncated_pdbfile)
view.clear()
marker_colors = {
    gain_subdomain_boundary: "red",
    gain_start: "blue",
    gps_end: "orange",
}
for marked_residue in (gain_subdomain_boundary, gain_start, gps_end):
    residue_color = marker_colors[marked_residue]
    view.add_representation(
        repr_type="label",
        name="label",
        showBackground=True,
        labelType="res",
        color=residue_color,
        sele=f"{marked_residue} and .CA",
        xOffset=0.5,
        zOffset=5,
        fixedSize=True,
    )
    view.add_hyperball(selection=f"{marked_residue}", color=residue_color)
view.add_cartoon(selection=f'{gain_subdomain_boundary}-{gps_end}', color='orange')
view.add_cartoon(selection=f'{gain_start}-{gain_subdomain_boundary}', color='blue')
view

With the detected boundaries of the GAIN subdomains, proceed to map them onto the available templates.
Find the best-fitting template for each subdomain and align to it, creating the residue numbering.

In [None]:
# First, rewrite the STRIDE file for outlier detection.
# NOTE(review): stride_file is already the output of an earlier detect_outliers call
# (it ends in ".outliers.stride"), so this runs the detection a second time and writes
# "*.outliers.outliers.stride" - confirm that this is intended.
mod_stride_file = stride_file.replace(".stride",".outliers.stride")
sse_func.detect_outliers(stride_file, mod_stride_file, sigmas=2)

# Bundle the detected boundaries, the sequence and the STRIDE data into one object.
print(f'[NOTE] Creating instance of GainDomain: {uniprot_accession}_{protein_name.replace(" ","-")}')
target_gain = ExtractedGain(
                start=gain_start, 
                subdomain_boundary=gain_subdomain_boundary, 
                end=gps_end,
                name=f'{uniprot_accession}_{protein_name.replace(" ","-")}',
                sequence=gain_sequence,
                stride_file=mod_stride_file,
                is_truncated=True,
                stride_outlier_mode=True,
                debug=False)
# The result of dir() is discarded here (this is not the last expression of the cell).
dir(target_gain)
# Store the GAIN sequence as FASTA in the temporary directory.
sse_func.write2fasta(sequence=target_gain.sequence, name=target_gain.name, filename=f"{tmp_dir}/{target_gain.name}.fa")

# Match the target against the templates and assign the residue labels.
# Output files are written with the prefix "{tmp_dir}/indexing".
element_intervals, element_centers, residue_labels, unindexed_elements, params = template_finder.assign_indexing(
                                gain_obj=target_gain,
                                file_prefix=f"{tmp_dir}/indexing",
                                gain_pdb=truncated_pdbfile,
                                template_dir='../r4_template_pdbs/',
                                template_json='template_data.json',
                                gesamt_bin=gesamt_bin,
                                debug=False,
                                create_pdb=True,
                                hard_cut={"S2":7,"S6":3,"H5":3},
                                patch_gps=True,
                                template_mode='extent'
                                )

print("[DEBUG]", residue_labels)
# Parse the RMSD value from "{tmp_dir}/indexing_sda.out" and "{tmp_dir}/indexing_sdb.out":
# the first text matching "RMSD : <float>" is used, its last whitespace-separated token is the number.
rmsds = {sd:float(re.search(r"RMSD\W+\:\W+[0-9]+\.[0-9]+",open(f"{tmp_dir}/indexing_{sd}.out").read()).group(0).split()[-1]) for sd in ["sda","sdb"]}
#print(rmsds)
print(f"The RMSD values of Subdomain Matches are:\n\tSDA: {round(rmsds['sda'], 3)} A\n\tSDB: {round(rmsds['sdb'], 3)} A")
# Only the subdomain B RMSD is checked against the 2.0 threshold.
if rmsds["sdb"] > 2.0: print("WARNING: The Matching RMSD is very high. This GAIN domain is likely not a good fit, if a GAIN domain at all. Please check your protein further.")
#
print(element_intervals, element_centers, residue_labels, unindexed_elements, params, sep="\n")
# Invert residue_labels (label -> residue) into residue -> label, skipping unassigned labels.
res2label = {v:k for k,v in residue_labels.items() if v is not None}

In [None]:
# Show the assigned labels on the truncated structure: every labelled residue
# gets a text label on its CA atom, the subdomains are coloured as before.
view = nv.show_file(truncated_pdbfile)
view.clear()

pdb_offset = get_pdb_offset(truncated_pdbfile)
ca_indices = get_ca_indices(truncated_pdbfile, offset=pdb_offset)
label_dict = {
    ca_indices[res]: label
    for label, res in residue_labels.items()
    if res is not None
}

# Atom-index selection string of the form "@i,j,k,..."
mysel = ",".join(map(str, label_dict))

view.add_representation(
    repr_type="label",
    name="label",
    showBackground=True,
    labelType="text",
    color='black',
    labelText=label_dict,
    sele=f'@{mysel}',
    xOffset=0.5,
    zOffset=5,
    fixedSize=False,
)

view.add_cartoon(selection=f'{gain_subdomain_boundary}-{gps_end}', color='orange')
view.add_cartoon(selection=f'{gain_start}-{gain_subdomain_boundary}', color='blue')

view


In [None]:
def label2b(pdbfile, outfile, res2label, clear_b=False):
    """Write residue labels into the B-factor field of the CA atoms of a PDB file.

    Parameters:
        pdbfile:   path of the input PDB file.
        outfile:   path of the PDB file to write.
        res2label: dict {residue number: label string}; labels are
                   right-justified to the six characters of the B-factor field.
        clear_b:   if True, the B-factor field of every other ATOM record is
                   blanked; if False, those records are copied unchanged.

    Lines that are not ATOM records are always copied unchanged.
    """
    with open(pdbfile) as handle:
        data = handle.readlines()

    newdata = []
    for l in data:
        if not l.startswith("ATOM"):
            newdata.append(l)
            continue
        # The B-factor occupies characters 60-65 (zero-based), so the remainder
        # resumes at index 66. The former l[67:] dropped one column and shifted
        # everything behind it.
        if l[13:15] == "CA" and int(l[22:26]) in res2label:
            newdata.append(l[:60] + res2label[int(l[22:26])].rjust(6) + l[66:])
        elif clear_b:
            newdata.append(l[:60] + "      " + l[66:])
        else:
            # Previously an undefined or stale line was appended in this case.
            newdata.append(l)

    with open(outfile, 'w') as handle:
        handle.write("".join(newdata))
    print(f"Written residue labels to PDB file CA entries : {outfile}")

def grn2csv(res2label, outfile): 
    """Write one CSV row (RESNR,RESNAME,LABEL) per residue of target_gain.

    Iterates target_gain.start..target_gain.end (inclusive), reading the
    module-level target_gain. The LABEL column is left empty for residues
    that have no entry in res2label.
    """
    first_residue = target_gain.start
    with open(outfile, "w") as handle:
        handle.write("RESNR,RESNAME,LABEL\n")
        for resnr in range(first_residue, target_gain.end+1):
            resname = target_gain.sequence[resnr-first_residue]
            handle.write(f"{resnr},{resname},{res2label.get(resnr, '')}\n")


# Export the labels: once into the full model, once into the truncated model
# (both as "*_grn.pdb"), and as a CSV table in the temporary directory.
for source_pdb in (pdbfile, truncated_pdbfile):
    label2b(
        pdbfile=source_pdb,
        outfile=source_pdb.replace(".pdb","_grn.pdb"),
        res2label=res2label,
        clear_b=True,
    )
grn2csv(res2label, outfile=f"{tmp_dir}/{uniprot_accession}_grn.csv")

In [None]:
# Uncomment to delete the temporary directory and every file written into it.
#tmp_folder.cleanup()
# Print both label mappings, one per line.
print(res2label, label_dict, sep="\n")