# 2 Template Selection
A Notebook for Picking suitable Template Structures from Subgroups of the GAIN domain Dataset.

In [None]:
# DEPENDENCIES
import glob, glob, re, os
#from shutil import copyfile
import numpy as np
import pandas as pd
# LOCAL IMPORTS
import gaingrn.scripts.template_utils
import gaingrn.scripts.assign
import gaingrn.scripts.plotting_utils

# Path to the GESAMT executable. The GESAMT_BIN environment variable takes precedence;
# if it is not set, the hard-coded CCP4 installation path below is used instead.
# os.environ.get() returns the default for a missing key and never raises, so no
# try/except is required around it.
_DEFAULT_GESAMT_BIN = "/home/hildilab/lib/xtal/ccp4-8.0/ccp4-8.0/bin/gesamt"
GESAMT_BIN = os.environ.get('GESAMT_BIN', _DEFAULT_GESAMT_BIN)

In [None]:
# Remnant of the previous approach that used MSA-based anchors; kept as a comparative metric.
stride_files = glob.glob("../data/gain_strides/*.stride")
valid_seqs = gaingrn.scripts.io.read_multi_seq("../data/seq_aln/offset_valid_seqs.fa")

### 1. Load the full GainCollection of the Dataset and have the folder containing all GAIN PDB files.

In [None]:
# Load the pickled collection of the dataset and list all PDB files in the PDB folder.
# NOTE(review): unpickling executes arbitrary code - only load pickle files from a trusted source.
valid_collection = pd.read_pickle("../data/valid_collection.pkl")
allpdbs = glob.glob('../../all_pdbs/*.pdb')
# len() accepts exactly one argument; the message is passed to print() as its own argument.
print(len(allpdbs), "PDB files have been found.")

### 2. Chunk down the Dataset into Subselections containing every subfamily ("A", "B") and receptor type ("B1", "D2")

In [None]:
# Collect the name of every entry in the collection and the aGPCR type assigned to that name.
name_list = [entry.name for entry in valid_collection.collection]
fam_list = [gaingrn.scripts.io.get_agpcr_type(entry_name) for entry_name in name_list]
# The first character of each type is used as the subfamily identifier.
subfam_list = [receptor_type[0] for receptor_type in fam_list]

# Tally how often each distinct type occurs.
receptors, counts  = np.unique(fam_list, return_counts=True)
r_counts = list(zip(receptors, counts))
print("HERE IS THE COUNT OF DETECTED aGPCR RECEPTOR PROTEINS:", r_counts, sep="\n")

# Tally the subfamilies in order of first appearance.
fam_counts = {}
for subfam in subfam_list:
    fam_counts[subfam] = fam_counts.get(subfam, 0) + 1

print("HERE IS THE COUNT OF DETECTED aGPCR SUBFAMILIES:", fam_counts, sep="\n")

### 3. The receptor groups to be tested are specified. The following sequence of detail was tested:

> - Run1: Subfamily-specific templates
> - Run2: Added Receptor-specific templates for low-matching receptors
> - Run3: Added further templates for extra 14th strand S4

Every subselection is clustered via a pairwise RMSD-matrix constructed from individual _GESAMT_ runs. Two template candidates are then evaluated: The _overall_ centroid and the _largest cluster_ centroid. If they do not match, the clusters are evaluated to see whether there are separate, distinct clusters. A manual selection of the template is then carried out.

For template selection, we only use 400 randomly selected structures from the respective subset. The templates are tested afterwards to assess their quality. We assume sufficient coverage from these 400 proteins.

In [None]:
# This Box is for running Subselections for Subdomain A >SDA<
# The complete collection (thinned to at most `maxlen` entries) is passed through the
# GESAMT run, the resulting distance matrix is clustered, and the overall-best entry plus
# the per-cluster best entries are stored under every subfamily letter.

best_structures = {}
best_clusters = {}

manual_receptors = ["X"]

run_prefix = "p1"
print(allpdbs)
gain_subset = valid_collection.collection
print(len(gain_subset))

gain_idx_list = range(len(fam_list)) # fam_list
maxlen = 400
data_length = len(gain_subset)
if data_length > maxlen:
    # The stride has to be derived from the full length BEFORE data_length is capped;
    # computing it afterwards always yields 1 and merely keeps the first `maxlen` entries.
    stride = data_length // maxlen
    data_length = maxlen # data_length will be used.
    # Striding leaves at least `maxlen` entries; trim the remainder to exactly `maxlen`.
    gain_subset = gain_subset[::stride][:maxlen]
    print(len(gain_idx_list), len(gain_subset))

gesamt_outfolder = f"../../TESTING/{run_prefix}_gesamt_sda_pc"

gaingrn.scripts.template_utils.run_gesamt_execution(gain_subset, 
                        outfolder=gesamt_outfolder, 
                        pdb_folder='../../all_pdbs/', 
                        domain='sda', 
                        n_threads=4, 
                        max_struc=maxlen, 
                        gesamt_bin=GESAMT_BIN,
                        no_run=False)

distances = gaingrn.scripts.template_utils.evaluate_gesamt_files(gesamt_outfolder, n_prot=data_length, penalty_value=6.0, remove=False)

print(distances.shape)
results = gaingrn.scripts.template_utils.cluster_agglomerative(distances, gain_subset, n_cluster=9)
gaingrn.scripts.plotting_utils.plot_heirarchy(results['reordered_distances'], groupname='aGPCR GAIN', savename=f'{run_prefix}_A_sda_heirarchy.png')
gaingrn.scripts.plotting_utils.plot_matrix(results['reordered_distances'], title='aGPCR GAIN', savename=f'{run_prefix}_sda_ordered_matrix.png')

all_best = gain_subset[results['overall_best_gain']].name
# Build the candidate list exactly once: (name, cluster id, cluster size) per cluster.
# Appending inside the subfamily loop would repeat every entry once per letter.
best_list = [
    (gain_subset[i].name, c, results['cluster_sizes'][c])
    for i, c in results['cluster_best_gains']
]
# The same result is registered for every subfamily letter, since this run uses the whole set.
for r in "ABCDEFGLVX":
    best_structures[r] = all_best
    best_clusters[r] = list(best_list)  # independent copy per key
    print("Done with run",r)

In [None]:
# This Box is for running Subselections for Subdomain B >SDB<
# For every subfamily letter, the matching entries (thinned to at most `maxlen`) are passed
# through the GESAMT run, clustered, and the best candidates are stored per letter.
# `run_prefix`, `fam_list` and `valid_collection` must have been defined by earlier cells.
best_structures = {}
best_clusters = {}

for r in 'ABCDEFGLVX': #enumerate(receptors[:1])
    print(r)
    # fam_list holds receptor types (e.g. "B1"); their first character is the subfamily
    # letter, so that character - not the whole type - has to be compared against r.
    gain_subset = [ gain for i, gain in enumerate(valid_collection.collection) if fam_list[i][0] == r ]
    print(len(gain_subset))
    gain_idx_list = [ i for i, fam in enumerate(fam_list) if fam[0] == r ] # fam_list
    if not gain_subset:
        # Nothing to align or cluster for this subfamily.
        print("No structures found for subfamily", r, "- skipping.")
        continue
    maxlen = 400
    data_length = len(gain_subset)
    if data_length > maxlen:
        # The stride has to be derived from the full length BEFORE data_length is capped;
        # computing it afterwards always yields 1 and merely keeps the first `maxlen` entries.
        stride = data_length // maxlen
        data_length = maxlen # data_length will be used.
        # Striding leaves at least `maxlen` entries; trim the remainder to exactly `maxlen`.
        gain_subset = gain_subset[::stride][:maxlen]
        print(len(gain_idx_list), len(gain_subset))

    # NOTE(review): the output folder and the plot file names below do not contain r, so all
    # subfamily runs share them (with remove=False) - confirm that this is intended.
    # NOTE(review): '../' is used here while other cells of this notebook use '../../' for
    # the PDB folder - confirm the relative paths.
    gesamt_outfolder = f"../{run_prefix}_gesamt_sdb_pc"
    
    gaingrn.scripts.template_utils.run_gesamt_execution(gain_subset, outfolder=gesamt_outfolder, gesamt_bin=GESAMT_BIN, pdb_folder='../all_pdbs/', domain='sdb', n_threads=4, max_struc=maxlen, no_run=False)
    distances = gaingrn.scripts.template_utils.evaluate_gesamt_files(gesamt_outfolder, n_prot=data_length, penalty_value=6.0, remove=False)
    print(distances.shape)
    results = gaingrn.scripts.template_utils.cluster_agglomerative(distances, gain_subset, n_cluster=9)
    gaingrn.scripts.plotting_utils.plot_heirarchy(results['reordered_distances'], groupname='aGPCR GAIN', savename=f'{run_prefix}_sdb_heirarchy.png')
    gaingrn.scripts.plotting_utils.plot_matrix(results['reordered_distances'], title='aGPCR GAIN', savename=f'{run_prefix}_sdb_ordered_matrix.png')

    all_best = gain_subset[results['overall_best_gain']].name
    # One (name, cluster id, cluster size) tuple per cluster of this subfamily.
    best_list = [
        (gain_subset[i].name, c, results['cluster_sizes'][c])
        for i, c in results['cluster_best_gains']
    ]
    best_structures[r] = all_best
    best_clusters[r] = best_list
    print("Done with run",r)

In [None]:
# Show the stored overall-best entries first, then the per-cluster candidates.
for summary in (best_structures, best_clusters):
    print(summary)


#### After creating a set of potential templates, they will each be evaluated against the whole set to remove arbitrarily similar templates, since not every receptor needs a unique template if they are too similar.

In [None]:
# Re-derive the template candidates from the most recent clustering `results`:
# 'overall_best_gain' indexes gain_subset, 'cluster_best_gains' yields (index, cluster) pairs.
overall_idx = results['overall_best_gain']
all_best = gain_subset[overall_idx].name
best_list = [
    (gain_subset[idx].name, cluster, results['cluster_sizes'][cluster])
    for idx, cluster in results['cluster_best_gains']
]
print(all_best)
print(best_list)

After the directories with the potential templates have been created, we proceed to __template_testing.ipynb__ to evaluate them and select our final set.