This is an interactive notebook for assigning the GAIN Generic Residue Numbering Scheme to an unknown protein. Please enter the UniProtKB accession ID below.

In [27]:
import requests, os, tempfile#, json
import nglview as nv
# local
import sse_func
import matplotlib.pyplot as plt
import numpy as np
def retrieve_uniprot(accession:str):
    """Look up a single accession via the UniProtKB REST search endpoint.

    Parameters
    ----------
    accession : str
        The UniProtKB accession to search for (e.g. "Q8CJ12").

    Returns
    -------
    list or None
        The non-empty "results" list of the JSON response. None is returned (after
        printing a message) when the HTTP status is not 200 or when the response
        contains no results, so callers can test the return value for a usable hit.

    Notes
    -----
    Exceptions raised by requests.get itself (e.g. on timeout) are not caught here.
    """
    # Let requests assemble the query string so the accession is URL-encoded, and bound the
    # waiting time so that an unresponsive server cannot block the notebook indefinitely.
    response = requests.get(
        "https://rest.uniprot.org/uniprotkb/search",
        params={"query": f"accession:{accession}", "format": "json"},
        timeout=30,
    )
    # HTTP status code 200 means the API call was successful.
    if response.status_code == 200:
        results = response.json().get("results")
        if results:
            return results
        # A successful call without any hit is as unusable downstream as a failed call.
        print(f"No UniProtKB entry found for accession {accession}.")
        return None

    print("Failed to retrieve protein information.")
    return None

def extract_gain_end(uniprot_accession:str, uniprot_info:list):
    """Extract the end residue of the GPS feature from UniProtKB search results.

    Takes the result list of the API call (from response.json) and looks for a feature
    whose description equals "GPS" (case-insensitive). This might in the future be
    updated to use GAIN domain boundaries, if UniProtKB is updated with them.

    Parameters
    ----------
    uniprot_accession : str
        The accession used to pick the matching entry from uniprot_info.
    uniprot_info : list
        List of entry dictionaries as returned by the UniProtKB search.

    Returns
    -------
    tuple
        (end_res, truncated_sequence), where truncated_sequence is the entry sequence up
        to and including residue end_res. (None, None) is returned when the list is
        empty, the chosen entry has no "features" key, or no GPS feature is present.
    """
    if not uniprot_info:
        print("No UniProt entries provided. Continuing to manual GPS/GAIN boundary specification.")
        return None, None

    # Prefer the entry whose primary accession matches. If none matches, fall back to the
    # first entry instead of leaving target_info undefined.
    target_info = next(
        (entry for entry in uniprot_info if entry.get('primaryAccession') == uniprot_accession),
        None,
    )
    if target_info is None:
        target_info = uniprot_info[0]
        if len(uniprot_info) > 1:
            print(f"[NOTE]: No entry with primary accession {uniprot_accession} among {len(uniprot_info)} results. Using the first result.")

    # Parse the entry features and find the "GPS domain" (which is of course not a domain...)
    if 'features' not in target_info:
        print("FEATURES entry not found in target UniProt entry. Continuing to manual GPS/GAIN boundary specification.")
        return None, None

    for feature in target_info["features"]:
        if (feature.get("description") or "").upper() == "GPS":
            end_res = feature["location"]["end"]["value"]
            print(f"[NOTE]: Found GPS entry in the UniProtKB accession entry ending at residue {end_res}")
            # The slice end is exclusive, so [:end_res] keeps exactly the first end_res residues.
            truncated_sequence = target_info["sequence"]["value"][:end_res]
            return end_res, truncated_sequence

    print("No GPS entry found. Continuing to manual GPS/GAIN boundary specification.")
    return None, None

def get_alphafolddb_model(uniprot_accession, tmp_dir):
    """Download the AlphaFoldDB model and its predicted aligned error for an accession.

    Two files are written into tmp_dir: "AF-<accession>-F1.pdb" (the model) and
    "AF-<accession>-F1.json" (the predicted aligned error). They need to be read by
    the caller.

    Parameters
    ----------
    uniprot_accession : str
        UniProtKB accession used to build the AlphaFoldDB identifier.
    tmp_dir : str
        Existing directory that receives the two downloaded files.

    Raises
    ------
    Whatever requests.get or Response.raise_for_status raise (e.g. on an HTTP error
    status). A file is only written after its download succeeded, so a failed request
    does not leave an error page behind as a model file.
    """
    alphafold_id = f"AF-{uniprot_accession}-F1"
    database_version = "v4"
    model_url = f'https://alphafold.ebi.ac.uk/files/{alphafold_id}-model_{database_version}.pdb'
    error_url = f'https://alphafold.ebi.ac.uk/files/{alphafold_id}-predicted_aligned_error_{database_version}.json'

    # Download in-process instead of shelling out to curl: no shell interpolation of the
    # accession, no dependency on an external binary, and HTTP failures become visible.
    downloads = (
        (model_url, f'{tmp_dir}/{alphafold_id}.pdb'),
        (error_url, f'{tmp_dir}/{alphafold_id}.json'),
    )
    for url, target_path in downloads:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(target_path, 'wb') as target_file:
            target_file.write(response.content)

    print(f"[NOTE] Done retrieving the model for {uniprot_accession} with the corresponding AlphaFoldDB accession {alphafold_id}.")

def truncate_pdb(pdbfile:str, start:int, end:int):
    """Write a copy of a PDB file that keeps only ATOM records of residues start..end.

    ATOM lines whose residue number (columns 23-26) lies outside [start, end] are
    dropped; both boundaries are kept. Every TER line is rebuilt from the last retained
    atom line so that its serial and residue fields match the truncated chain. All
    other lines are copied unchanged.

    Parameters
    ----------
    pdbfile : str
        Path of the PDB file to read.
    start : int
        First residue number to keep.
    end : int
        Last residue number to keep.

    Returns
    -------
    str
        Path of the written file: the input path with "_trunc" inserted before the
        file extension.
    """
    with open(pdbfile) as pdb_in:
        pdblines = pdb_in.readlines()

    newlines = []
    for line in pdblines:
        if line.startswith("ATOM") and not (start <= int(line[22:26]) <= end):
            continue
        if line.startswith("TER"):
            # A TER record is only meaningful directly after a retained atom line; if the
            # whole chain was cut away (or nothing was kept yet), the TER is dropped too.
            if newlines and newlines[-1].startswith(("ATOM", "HETATM")):
                prev_info = newlines[-1]
                # The atom serial occupies columns 7-11, i.e. the slice [6:11].
                ter_serial = int(prev_info[6:11]) + 1
                newlines.append(f"TER   {ter_serial:>5}      {prev_info[17:26]}".ljust(80) + "\n")
            continue
        newlines.append(line)

    # Only touch the file extension; a ".pdb" elsewhere in the path must stay as it is.
    root, extension = os.path.splitext(pdbfile)
    truncated_pdbfile = f"{root}_trunc{extension}"
    with open(truncated_pdbfile, 'w') as pdb_out:
        pdb_out.writelines(newlines)
    print(f"[NOTE] Truncated PDB at residue {end}.")
    return truncated_pdbfile

def truncate_stride_dict(stride_dict:dict, start:int, end:int):
    """Return a copy of stride_dict that only keeps elements reaching into (start, end).

    For every key, an element tuple is discarded when its first value is at or beyond
    end, or when its second value is at or before start. Keys are always preserved,
    possibly with an empty list. The input dictionary is not modified.
    """
    return {
        sse_type: [
            segment
            for segment in segments
            if not (segment[0] >= end or segment[1] <= start)
        ]
        for sse_type, segments in stride_dict.items()
    }


In [43]:
uniprot_accession = "Q8CJ12"

maxres = 360 # The maximum number of residues in a model to be evaluated. Increase this if you suspect the GAIN to be larger than that.

uniprot_info = retrieve_uniprot(uniprot_accession) # This is a list of dictionaries of the results
# Covers a failed request (None) as well as a successful request without any hit (empty list).
if not uniprot_info:
    raise ValueError("The provided UniProtKB accession did not yield any results. Please check your accession number.")

gps_end, gps_end_sequence = extract_gain_end(uniprot_accession, uniprot_info)
# extract_gain_end returns (None, None) when no GPS feature is annotated. Stop with a clear
# message here instead of failing on the integer comparison below.
if gps_end is None:
    raise ValueError(f"No GPS annotation was found for {uniprot_accession}. Please set gps_end and gps_end_sequence manually before continuing.")
if gps_end < maxres:
    maxres = gps_end

tmp_folder = tempfile.TemporaryDirectory(dir = '../')
tmp_dir = tmp_folder.name

get_alphafolddb_model(uniprot_accession, tmp_dir)
pdbfile = f'{tmp_dir}/AF-{uniprot_accession}-F1.pdb'
jsonfile = f'{tmp_dir}/AF-{uniprot_accession}-F1.json'


truncated_sequence = gps_end_sequence[gps_end-maxres:] # matches the sequences to the PDB sequence (zero-indexed!)

# Run STRIDE for evaluating the secondary structure items.
# The binary location can be overridden via the STRIDE_BIN environment variable; the default is the original path.
stride_bin = os.environ.get("STRIDE_BIN", '/home/hildilab/lib/stride/stride')
stride_file = f"{tmp_dir}/AF-{uniprot_accession}-F1.stride"
os.system(f"{stride_bin} {pdbfile} -f{stride_file}") # This will be a stride analysis of the WHOLE protein.
# Verify that STRIDE actually produced output before trying to parse it.
if not (os.path.isfile(stride_file) and os.path.getsize(stride_file) > 0):
    raise RuntimeError(f"STRIDE did not produce any output at {stride_file}. Please check the STRIDE binary ({stride_bin}) and the model file ({pdbfile}).")
complete_sse_dict = sse_func.read_sse_loc(stride_file)
# Cut the complete_sse_dict down to only include entries before the detected end residue and within the maximum residue number.
truncated_sse_dict = truncate_stride_dict(complete_sse_dict, start=gps_end-maxres, end=gps_end)
# Find the GAIN domain start and subdomain boundary, if applicable
gain_start, gain_subdomain_boundary = sse_func.find_boundaries(truncated_sse_dict, seq_len=gps_end, bracket_size=30, domain_threshold=15, coil_weight=0.08, truncate_N=True)



[NOTE]: Found GPS entry in the UniProtKB accession entry ending at residue 610
[NOTE] Done retrieving the model for Q8CJ12 with the corresponding AlphaFoldDB accession AF-Q8CJ12-F1.
DEBUG /home/hildilab/lib/stride/stride ../tmpxmxutdmz/AF-Q8CJ12-F1.pdb -f../tmpxmxutdmz/AF-Q8CJ12-F1.stride
{'AlphaHelix': [(366, 380), (387, 401), (405, 409), (411, 425), (493, 499)], '310Helix': [], 'Strand': [(432, 437), (440, 446), (455, 460), (466, 468), (477, 483), (501, 507), (521, 523), (527, 532), (544, 549), (559, 566), (575, 577), (581, 587), (590, 595), (600, 606)], 'TurnIV': [(449, 452), (462, 465), (470, 473), (477, 480), (511, 514), (532, 535), (568, 571)], 'TurnI': [(380, 383), (437, 440), (448, 451), (461, 464), (474, 477), (484, 487), (486, 489), (488, 491), (510, 513), (515, 518), (518, 521), (555, 558), (566, 569), (587, 590), (607, 610), (608, 611)], 'TurnII': [(578, 581)], 'TurnVIII': [(430, 433), (507, 510), (540, 543), (550, 553)], "TurnI'": [(569, 572), (570, 573)], "TurnII'": [], '

In [44]:
# Cut the full model down to the residues from gain_start up to and including gps_end.
truncated_pdbfile = truncate_pdb(pdbfile, start=gain_start, end=gps_end)

# Report the path of the truncated model, followed by the truncated sequence and its length.
# NOTE(review): truncated_sequence was sliced starting at gps_end-maxres, while the PDB is cut
# at gain_start, so the two may cover different residue ranges - confirm this is intended.
print(truncated_pdbfile)
print(f"{truncated_sequence}\n{len(truncated_sequence)}")

# Show the truncated model in an interactive NGL widget (last expression = cell output).
view = nv.show_file(truncated_pdbfile)
view

[NOTE] Truncated PDB at residue 610.
../tmpxmxutdmz/AF-Q8CJ12-F1_trunc.pdb
SKPVVPQATIISHVASDFSLAEPLDHALMTPSTPSLTQESNLPSPQPTIPLASSPATDLPVQSVVVSSLPQTDLSHTLSPVQSSIPSPTTPAPSVPTELVTISTPPGETVVNTSTVSDLEAQVSQMEKALSLGSLEPNLAGEMVNRVSKLLHSPPALLAPLAQRLLKVVDAIGLQLNFSSTTISLTSPSLALAVIRVNASNFNTTTFAAQDPTNLQVSLETPPPENSIGAITLPSSLMNNLPANDVELASRIQFNFFETPALFQDPSLENLTLISYVISSSVTNMTIKNLTRNVTVALKHINPSPDDLTVKCVFWDLGRNGGKGGWSSDGCSVKDKRMNETICTCSHLTSFGILLDLSRT
360


NGLWidget()

In [4]:
# Remove the temporary directory created for this run together with the files written into it.
# Paths such as pdbfile and truncated_pdbfile point into that directory and are no longer valid afterwards.
tmp_folder.cleanup()