# 3 Template Curation
A notebook for evaluating a set of selected templates with respect to dataset coverage and similarity between templates, in order to manually curate the ideal template set.


In [None]:
# DEPENDENCIES
import glob
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# LOCAL IMPORTS
#from indexing_classes import GPCRDBIndexing
import gaingrn.scripts.indexing_utils
import gaingrn.scripts.io
import gaingrn.scripts.structure_utils
import gaingrn.scripts.template_utils
# Resolve the GESAMT executable: prefer the GESAMT_BIN environment variable and
# fall back to the default install location when the variable is unset.
# os.environ.get() returns the default for a missing key instead of raising,
# so no try/except is needed around the lookup.
DEFAULT_GESAMT_BIN = "/home/hildilab/lib/xtal/ccp4-8.0/ccp4-8.0/bin/gesamt"
GESAMT_BIN = os.environ.get('GESAMT_BIN', DEFAULT_GESAMT_BIN)

### 1. Load the valid GAIN domain Dataset and specify the pdb path.

In [None]:
# Besides the pickled collection of valid GAIN domains, the corresponding PDB
# files are needed; they are addressed through the glob pattern below.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
allpdbs = '../../all_pdbs/*.pdb'
valid_collection = pd.read_pickle("../data/valid_collection.pkl")
print(len(valid_collection.collection))

# Segment labels: H1-H6 followed by S1-S14.
segments = [f'H{n}' for n in range(1, 7)] + [f'S{n}' for n in range(1, 15)]

# Template info of all runs, as stored in the template history file.
with open("../data/template_history.json", "r") as history_file:
    template_data = json.load(history_file)

#### For evaluation, every template is run against the __full__ dataset, generating RMSDs and 3D-aligned PDB files. One can either run directly from the cell or generate a Bash script within the folder to be run manually.

In [None]:
# The template data dictionary contains the template info of all runs. We only take the final ones

# CHANGE THIS IF YOU DO/DON'T WANT A DRY RUN: with True, only the scripts for
# execution are created; set it to False to run GESAMT directly from this cell.
NO_RUN = True

n_structures = len(valid_collection.collection)

# Run every template against the full dataset: first all subdomain B ("sdb")
# templates, then all subdomain A ("sda") templates.
for domain in ('sdb', 'sda'):
    for t_name, template in template_data[f"{domain}_templates"].items():
        template_pdb = gaingrn.scripts.io.find_pdb(template, '../data/template_pdbs/')
        # The output folder has to be named "<template key>_<domain>": this is
        # the path the evaluation cells further down read the GESAMT results
        # from. (The sda runs previously inserted the template identifier into
        # the folder name, so their results could not be found afterwards.)
        gesamt_outfolder = f'../../TESTING/{t_name}_{domain}'

        gaingrn.scripts.template_utils.run_gesamt_execution(
            valid_collection.collection,
            outfolder=gesamt_outfolder,
            gesamt_bin=GESAMT_BIN,
            pdb_folder='../../all_pdbs',
            domain=domain,
            n_threads=6,
            max_struc=n_structures,
            no_run=NO_RUN,
            template=template_pdb,
        )

### 2. Evaluate the template match quality
 The resulting directories from each **run_gesamt_execution** call are collected in a dictionary for evaluation. A structural alignment is constructed for every template by stacking the pairwise matches that __GESAMT__ has generated into an MSA-compatible file. From there, the best residues of each element and their respective conservation frequency can be collected.

 With the constructed MSA, we can extract the most conserved residue of each segment for each template to select the segment centers.

 Below, this is done exemplarily on one single template in a directory of all pairwise GAIN matches to this template.

In [None]:
# Merge both template sets into a single mapping; sdb entries are added last.
templates = {}
templates.update(template_data['sda_templates'])
templates.update(template_data['sdb_templates'])

n_gains = len(valid_collection.collection)

for key, template in templates.items():
    print("found.")
    print(key, template)
    template_name = template.split("-")[0]
    # Keys containing a 'b' are treated as subdomain B templates (threshold 1),
    # everything else as subdomain A templates (threshold 4).
    subdomain, threshold = ('b', 1) if 'b' in key else ('a', 4)
    template_gain = gaingrn.scripts.io.get_gain(template_name, valid_collection)

    # Input: folder with the pairwise GESAMT matches of this template.
    # Output: the structural alignment constructed from them.
    gesamt_folder = f"../../TESTING/{key}_sd{subdomain}"
    outfile = f'../data/template_alignments/{template_name}_{key}.struc_aln.fa'

    evaluation = gaingrn.scripts.template_utils.evaluate_template(
        template_gain_obj=template_gain,
        list_of_gain_obj=valid_collection.collection,
        gesamt_folder=gesamt_folder,
        subdomain=subdomain,
        threshold=threshold,
        aln_output=outfile,
        gain_indices=range(n_gains),
    )
    center_indices, center_quality, centers, structural_alignment = evaluation

    print(f"{center_quality = }\n{center_indices = }\n{centers = }")

Since we have stored the constructed alignments, we can analyze them on the fly.

Keep in mind that here, Hx and Sx do not correspond to the actual segment names, but are simply enumerated. The combination and overlap of segment centers has to be done manually.

In [None]:
templates = {**template_data['sda_templates'], **template_data['sdb_templates']}

template_centers, template_quality = {}, {}
template_indices = []  # centers of all templates in one flat list
all_indices = []       # one entry (the centers) per template

for k, v in templates.items():
    template_name = v.split("-")[0]
    alignment_file = f"../data/template_alignments/{template_name}_{k}.struc_aln.fa"

    # Keys containing a 'b' are treated as subdomain B templates.
    subdomain, threshold = ('b', 1) if 'b' in k else ('a', 4)

    template_gain = gaingrn.scripts.io.get_gain(template_name, valid_collection)
    # The stored alignment is loaded from disk, so the template analysis can
    # run directly from the finished alignment file.
    aln = gaingrn.scripts.io.read_alignment(alignment_file)

    centers, center_quality, pdb_centers = gaingrn.scripts.template_utils.get_struc_aln_centers(
        gain=template_gain,
        aln_dict=aln,
        subdomain=subdomain,
        threshold=threshold,
        silent=True,
    )
    template_centers[k] = centers
    template_quality[k] = center_quality

    template_indices.extend(centers)
    all_indices.append(centers)

# Unique centers over all templates together with how often each one occurs.
best_indices, best_indices_freq = np.unique(template_indices, return_counts=True)
a_counts = dict(zip(best_indices, best_indices_freq))

# Presence matrix: one row per unique center, one column per template.
center_col = {center: row for row, center in enumerate(best_indices)}
hasCenter = np.zeros(shape=(len(best_indices), len(all_indices)))
for column, template_center_set in enumerate(all_indices):
    for center in template_center_set:
        hasCenter[center_col[center], column] = 1

for k, k_centers in template_centers.items():
    print("TEMPLATE", k, ":\n\tCenter PDB res.\t", k_centers, "\n\tn_structures:\t", template_quality[k])

### 4. Evaluate the current set of Templates

For this, we match every template to all receptor subselections (e.g. ADGRA2). We assume that the variance within a single subselection (= every receptor type) is negligible.

In [None]:
receptor_list, receptor_counts = gaingrn.scripts.indexing_utils.count_receptors(valid_collection)
receptors = list(receptor_counts.keys())
y = len(receptors)

# Now, we need template data to evaluate on:
with open("../data/template_data.json") as tdata:
    template_data = json.load(tdata)

template_centers = {}
template_centers.update(template_data["sda_centers"])
template_centers.update(template_data["sdb_centers"])
template_ids = list(template_data["template_names"].keys())

# GESAMT result folder of every template; ids containing a 'b' map to the
# "sdb" folder, all other ids to the "sda" folder.
gesamt_folders = {
    t_id: f"../../TESTING/{t_id}_{'sdb' if 'b' in t_id else 'sda'}"
    for t_id in template_ids
}

t_distances, t_occupancies, unmatched, unmatched_counters = gaingrn.scripts.template_utils.analyze_template_matches(
    template_ids,
    template_centers,
    valid_collection,
    gesamt_folders,
    receptors,
    receptor_list,
)

#### From the calculated comparisons, plot the __OCCUPANCY__, meaning the fraction of the selection where this element is occurring.

In [None]:
# Plot the OCCUPANCY
# One heatmap per template: rows are the receptor groups, columns are the
# template centers. Each figure is saved as a PNG into the TESTING folder.
n_receptors = len(receptors)
for t_id in template_ids:
    t_centers = template_centers[t_id]
    n_anch = len(t_centers)
    t_center_freqs = t_occupancies[t_id]
    u_counters = unmatched_counters[t_id]

    fig = plt.figure(figsize=[6, 10], facecolor='w')
    plt.title(f"Template Match for : {t_id}")
    # Row labels read "<receptor>:<count> (u:<unmatched counter>)".
    plt.yticks(
        ticks=range(n_receptors),
        labels=[
            f'{name}:{count} (u:{u_counters[idx]})'
            for idx, (name, count) in enumerate(receptor_counts.items())
        ],
    )
    plt.xticks(ticks=range(n_anch), labels=list(t_centers.keys()), rotation=90)
    plt.imshow(t_center_freqs, cmap='summer')
    cbar = plt.colorbar(shrink=0.5)
    cbar.set_label('Relative Occupancy')

    # Mark every cell whose occupancy does not exceed 0.001 with an 'x'.
    # `not (... > ...)` is used rather than `<=` so that NaN entries are
    # marked as well, exactly like the former if/pass/else construct did.
    for y in range(n_receptors):
        for x in range(n_anch):
            if not t_center_freqs[y, x] > 0.001:
                plt.text(x, y, 'x', horizontalalignment='center', verticalalignment='center', fontsize=18, color='k')
    plt.savefig(f'../../TESTING/StAl{t_id}_occ.png', dpi=300)

#### From the calculated comparisons, plot the __CENTER DISTANCE__, meaning the average distance of the matched residue to the template center.

In [None]:
# Plot the DISTANCES
# One heatmap per template: rows are the receptor groups, columns are the
# template centers. Each figure is saved as a PNG into the TESTING folder.
n_receptors = len(receptors)
for t_id in template_ids:
    t_centers = template_centers[t_id]
    n_anch = len(t_centers.keys())
    t_center_freqs = t_occupancies[t_id]
    u_counters = unmatched_counters[t_id]

    fig = plt.figure(figsize=[6,10], facecolor='w')
    plt.title(f"Template Match for : {t_id}")
    row_labels = [
        f'{name}:{count} (u:{u_counters[idx]})'
        for idx, (name, count) in enumerate(receptor_counts.items())
    ]
    plt.yticks(ticks=range(n_receptors), labels=row_labels)
    plt.xticks(ticks=range(n_anch), labels=t_centers.keys(), rotation=90)

    # Copy the per-receptor distance rows into one dense matrix for imshow.
    distances = np.zeros(shape=(n_receptors, n_anch), dtype=float)
    for row, receptor_distances in enumerate(t_distances[t_id]):
        distances[row, :] = receptor_distances

    plt.imshow(distances, cmap='spring')
    cbar = plt.colorbar(shrink=0.5)
    cbar.set_label('AA-Distance')

    # Put an 'x' on every cell whose occupancy is not above 0.001.
    for y, x in np.ndindex(n_receptors, n_anch):
        if t_center_freqs[y, x] > 0.001:
            continue
        plt.text(x, y, 'x', horizontalalignment='center', verticalalignment='center', fontsize=18, color='k')
    plt.savefig(f'../../TESTING/StAl{t_id}_dist.png', dpi=300)

Every unmatched element is then written to a text file with the following information:
> - Template
> - Subselection(Group)
> - Elements in Group
> - corr. Alignment Index or start residue
> - Number of Structures with unmatched element
> - average element length
> - fraction of unmatched structures (%)

If only "X" type receptors are present, we have a good-quality match, since "X" denotes all unclassified receptors, which by default will have low matches when collectively matched against a single template.

In [None]:
# Destination of the text report on all unmatched elements.
outfile = "../../TESTING/StAl_unmatched_full.txt"

gaingrn.scripts.template_utils.write_unmatched_elements(
    outfile,
    template_ids,
    receptor_counts,
    receptor_list,
    unmatched,
)

Construct PDB files for Visualizing the center locations for each element.

In [None]:
sdb_centers = template_data["sdb_centers"]
print(templates)
template_ids = sdb_centers.keys()
target_folder = '../data/template_pdbs/'
outstr = []
print(template_ids)

# For every sdb template, write a copy of its PDB in which each center gets
# the score 1 (passed on to score2b), and remember the output path.
for t_id, t_centers in sdb_centers.items():
    print(t_id)
    t_pdb = gaingrn.scripts.io.find_pdb(templates[t_id], '../../all_pdbs/')
    print(t_pdb)
    t_metric = dict.fromkeys(t_centers.values(), 1)
    outpdb = f"{target_folder}{t_id}.b.pdb"

    gaingrn.scripts.io.score2b(t_pdb, outpdb, t_metric)
    outstr.append(outpdb)

# All written files on one line, separated by spaces.
print(" ".join(outstr))

Finally, create a complete set of template PDBs and move them to their directory, where they can be taken as argument for __assign_indexing__ within __assign_indexing.py__ or __assign_indexing.ipynb__

In [None]:
target_folder = '../../TESTING/template_pdbs/'

# exist_ok=True keeps re-runs from failing on the existing directory, while
# genuine errors (e.g. missing permissions) still surface instead of being
# reported as "Directory already exists.".
os.makedirs(target_folder, exist_ok=True)

templates = template_data["template_names"]

# For every template, write a PDB that only contains the ATOM records of the
# subdomain the template stands for; all non-ATOM lines are copied unchanged.
for t_id, template in templates.items():
    print(t_id)
    print(template)
    template_name = template.split("-")[0]
    t_pdb = gaingrn.scripts.io.find_pdb(template_name, '../../all_pdbs/')
    print(t_pdb)
    # Template ids containing a 'b' are subdomain B templates.
    keep_sdb = "b" in t_id

    # Find the template in the valid_collection
    for gain in valid_collection.collection:
        if gain.name.split("-")[0] != template_name:
            continue

        boundary = gain.subdomain_boundary
        print(gaingrn.scripts.structure_utils.get_pdb_extents(t_pdb, boundary))

        with open(t_pdb) as inpdb:
            data = inpdb.readlines()

        newdata = []
        for line in data:
            if line.startswith("ATOM"):
                # Residue number as stored in columns 23-26 of the ATOM record.
                resid = int(line[22:26])
                # The boundary residue itself is kept for both subdomains.
                if keep_sdb and resid < boundary:
                    continue
                if not keep_sdb and resid > boundary:
                    continue
            newdata.append(line)

        with open(f"{target_folder}{t_id}_{template_name}.pdb", "w") as outpdb:
            outpdb.writelines(newdata)

        # The template has been found and written; stop scanning the collection.
        break