# A Notebook for Picking suitable Template Structures from Subgroups of the GAIN domain Dataset.

In [None]:
# DEPENDENCIES
import glob, glob, re
#from shutil import copyfile
import numpy as np
import pandas as pd
# LOCAL IMPORTS
import sse_func
import template_finder as tf

# Path to the GESAMT executable handed to tf.run_gesamt_execution() in the clustering cells.
# The value is machine-specific (it points into a local CCP4 8.0 tree) and must be adapted per installation.
gesamt_bin = "/home/hildilab/lib/xtal/ccp4-8.0/ccp4-8.0/bin/gesamt" #specify the location of your GESAMT executable

def calc_identity(aln_matrix):
    """Return per-column conservation and occupancy of an alignment matrix.

    The matrix is indexed as ``aln_matrix[sequence, column]``: axis 0 is used
    as the number of structures/sequences and axis 1 is iterated column by
    column. Gaps are encoded as ``'-'``.

    Parameters
    ----------
    aln_matrix : array-like of single characters, 2-D
        Alignment matrix. Both ``str`` and ``bytes`` character arrays are
        accepted; bytes are decoded before comparison.

    Returns
    -------
    quality : list of int
        For every column, the count of the most frequent non-gap character
        (0 if the column consists of gaps only).
    occ : list of int
        For every column, the number of sequences that are not a gap.
    """
    aln = np.asarray(aln_matrix)
    if aln.dtype.kind == 'S':
        # Bytes never compare equal to the str '-', so normalise to str first.
        aln = aln.astype(str)

    n_struc = aln.shape[0]
    quality = []
    occ = []
    for col in range(aln.shape[1]):
        chars, count = np.unique(aln[:, col], return_counts=True)
        is_gap = chars == '-'
        # Summing over the mask yields 0 for gap-free columns instead of
        # failing on a missing '-' entry.
        n_gaps = int(count[is_gap].sum())
        residue_counts = count[~is_gap]
        # An all-gap column has no residue at all; report zero conservation.
        quality.append(int(residue_counts.max()) if residue_counts.size else 0)
        occ.append(n_struc - n_gaps)
    return quality, occ


In [None]:
# This is a remnant from the previous approach in having MSA-based anchors and is used as a comparative metric.
# valid_seqs: whatever sse_func.read_multi_seq() returns for the multi-sequence FASTA file.
# stride_files: list of paths to all *.stride files in ../sigma_2_floats/.
# NOTE(review): neither variable is referenced again in the visible part of this notebook.
valid_seqs = sse_func.read_multi_seq("../agpcr_celsr.fa")
stride_files = glob.glob("../sigma_2_floats/*.stride")

#### First, load the full GainCollection of the dataset and locate the folder containing all GAIN PDB files.

In [None]:
# Load the pickled GainCollection; its .collection attribute is iterated in the cells below.
# NOTE: unpickling executes arbitrary code, so only load pickle files from a trusted source.
v_collection = pd.read_pickle("../valid_collection.q.pkl")
# Paths of all PDB files in ../all_pdbs/; the count is printed as a quick sanity check.
allpdbs = glob.glob('../all_pdbs/*.pdb')
print(len(allpdbs))

#### Chunk down the Dataset into Subselections containing every subfamily ("A", "B") and receptor type ("B1", "D2")

In [None]:
def get_family(name):
    """Map a free-form aGPCR protein name to its short receptor label.

    With the different namings, there is plenty of ambiguity regarding each
    aGPCR. This function resolves it by testing an ordered table of regular
    expressions against the name; the first pattern that matches decides the
    label (e.g. ``'A1'``, ``'L3'``, ``'D1'``).

    Parameters
    ----------
    name : str
        Protein name as stored on the GAIN domain object.

    Returns
    -------
    str
        The label derived from the last regex hit of the first matching
        pattern, or ``'X'`` when no pattern matches.
    """
    lowered = name.lower()
    uppered = name.upper()

    def last_two(hits):
        # Label is the final two characters of the last hit, e.g. 'ADGRA1' -> 'A1'.
        return hits[-1][-2:]

    def numbered(letter):
        # Label is a fixed subfamily letter plus the final character of the last hit.
        return lambda hits: f"{letter}{hits[-1][-1]}"

    def fixed(label):
        # Legacy GPR numbers map onto one specific receptor.
        return lambda hits: label

    # (pattern, string to search, label builder) - order matters, first match wins.
    # Patterns are raw strings so that '\d' is not parsed as a string escape.
    queries = [
        (r'AGR..', name, last_two),
        (r'ADGR..', name, last_two),
        (r'cadher.*receptor.', lowered, numbered('C')),
        (r'cels?r.', lowered, numbered('C')),
        (r'latrophilin.*protein-?\d', lowered, numbered('L')),
        (r'latrophilin-?\d', lowered, numbered('L')),
        (r'GP?R133', uppered, fixed('D1')),
        (r'GP?R126', uppered, fixed('G6')),
        (r'GP?R?124', uppered, fixed('A2')),
        (r'GP?R?125', uppered, fixed('A3')),
        (r'GP?R112', uppered, fixed('G4')),
        (r'GP?R116', uppered, fixed('F5')),
        (r'GP?R144', uppered, fixed('D2')),
        (r'ag-?.*-?coupled-?receptor-?.-?\d', lowered,
         lambda hits: hits[-1].replace('-', '')[-2:].upper()),
        (r'brain-?specific-?angiogenesis-?inhibitor-?\d', lowered, numbered('B')),
        (r'emr\d', lowered, numbered('E')),
    ]
    for pattern, searchstring, output in queries:
        hits = re.findall(pattern, searchstring)
        if hits:
            return output(hits)
    return 'X'

# Collect the name of every GAIN domain and resolve its receptor label (e.g. 'A1').
name_list = [entry.name for entry in v_collection.collection]
fam_list = [get_family(entry.name) for entry in v_collection.collection]
# The first character of a receptor label is its subfamily letter.
subfam_list = [label[0] for label in fam_list]
print(fam_list)

# Number of GAIN domains per receptor label.
receptors, counts = np.unique(fam_list, return_counts=True)
r_list = list(zip(receptors, counts))
print(r_list)
print(receptors)

# Number of GAIN domains per subfamily letter, keyed in order of first appearance.
fam_counts = {}
for prot, fam in zip(fam_list, subfam_list):
    fam_counts[fam] = fam_counts.get(fam, 0) + 1

print(fam_counts)

#### Here, the receptor groups to be tested are specified. The following sequence was tested:

> - Run1: Subfamily-specific templates
> - Run2: Added Receptor-specific templates for low-matching receptors
> - Run3: Added further templates for extra 14th strand S4

Every subselection is clustered via a pairwise RMSD matrix constructed from individual _GESAMT_ runs. Two template candidates are then evaluated: the _overall_ centroid and the _largest cluster_ centroid. If they do not match, the clusters are inspected to see whether there are distinct, separate clusters. A manual selection of the template is then carried out.

In [None]:
# This Box is for running Subselections for Subdomain A >SDA<
# It runs pairwise GESAMT on the SDA subdomain of the whole collection (capped at
# `maxlen` structures), clusters the resulting distance matrix and records the
# overall centroid and the per-cluster centroids.

best_structures = {}
best_clusters = {}

manual_receptors = ["X"]

run_prefix = "p1"
# Key under which this whole-dataset run is stored in best_structures / best_clusters.
# NOTE(review): `r` was never assigned in this cell, so a top-to-bottom run raised a
# NameError at the bookkeeping lines below. "all" is a placeholder label - confirm.
r = "all"
print(allpdbs)
gain_subset = v_collection.collection
print(len(gain_subset))

gain_idx_list = range(len(fam_list)) # fam_list
maxlen = 400
data_length = len(gain_subset)
if data_length > maxlen:
    # The stride must be derived from the *unclamped* length; computing it after
    # clamping always gave 1 and silently reduced the subsampling to "first maxlen".
    stride = data_length // maxlen
    gain_subset = gain_subset[::stride][:maxlen]
    data_length = len(gain_subset) # data_length will be used.
    print(len(gain_idx_list), len(gain_subset))

gesamt_outfolder = f"../{run_prefix}_gesamt_sda_pc"

tf.run_gesamt_execution(gain_subset, 
                        outfolder=gesamt_outfolder, 
                        pdb_folder='../all_pdbs/', 
                        domain='sda', 
                        n_threads=4, 
                        max_struc=maxlen, 
                        gesamt_bin=gesamt_bin,
                        no_run=False)
distances = tf.evaluate_gesamt_files(gesamt_outfolder, n_prot=data_length, penalty_value=6.0, remove=False)

print(distances.shape)
results = tf.cluster_agglomerative(distances, gain_subset, n_cluster=9)
tf.plot_heirarchy(results['reordered_distances'], groupname=f'aGPCR GAIN', savename=f'{run_prefix}_A_sda_heirarchy.png')
tf.plot_matrix(results['reordered_distances'], title=f'aGPCR GAIN', savename=f'{run_prefix}_sda_ordered_matrix.png')
#print(results.keys())
#print(results['overall_best_gain'], results['cluster_best_gains'])
all_best = gain_subset[results['overall_best_gain']].name
# One (name, cluster index, cluster size) tuple per cluster centroid.
best_list = []
for i, c in results['cluster_best_gains']:
    best_list.append((gain_subset[i].name, c, results['cluster_sizes'][c]))
best_structures[r] = all_best
best_clusters[r] = best_list
print("Done with run",r)

In [None]:
# This Box is for running Subselections for Subdomain B >SDB<
# For every label in 'ABCDEFGLVX' it selects the GAIN domains whose fam_list entry
# equals that label, runs pairwise GESAMT on the SDB subdomain, clusters the
# distance matrix and records the overall and per-cluster centroids.
# Relies on `run_prefix` having been set by an earlier cell.
# NOTE(review): fam_list entries are compared to single letters here, while
# get_family() appears to return two-character labels such as 'A1' - confirm
# whether subfam_list was intended.
# NOTE(review): these two dicts are re-initialised here, which discards whatever
# the SDA cell stored in them.
best_structures = {}
best_clusters = {}

for r in 'ABCDEFGLVX': #enumerate(receptors[:1])
    print(r)
    gain_subset = [ gain for i, gain in enumerate(v_collection.collection) if fam_list[i]==r ]#fam_list[i] == r]#
    print(len(gain_subset))
    if not gain_subset:
        # Nothing to align or cluster for this label.
        print("No GAIN domains for", r, "- skipping")
        continue
    gain_idx_list = [ i for i,gain in enumerate(fam_list) if gain == r ] # fam_list
    maxlen = 400
    data_length = len(gain_subset)
    if data_length > maxlen:
        # The stride must be derived from the *unclamped* length; computing it after
        # clamping always gave 1 and silently reduced the subsampling to "first maxlen".
        stride = data_length // maxlen
        gain_subset = gain_subset[::stride][:maxlen]
        data_length = len(gain_subset) # data_length will be used.
        print(len(gain_idx_list), len(gain_subset))

    # NOTE(review): every label writes into the same GESAMT output folder; whether
    # files left by a previous label can interfere depends on tf.evaluate_gesamt_files.
    gesamt_outfolder = f"../{run_prefix}_gesamt_sdb_pc"
    
    tf.run_gesamt_execution(gain_subset, outfolder=gesamt_outfolder, gesamt_bin=gesamt_bin, pdb_folder='../all_pdbs/', domain='sdb', n_threads=4, max_struc=maxlen, no_run=False)
    distances = tf.evaluate_gesamt_files(gesamt_outfolder, n_prot=data_length, penalty_value=6.0, remove=False)
    print(distances.shape)
    results = tf.cluster_agglomerative(distances, gain_subset, n_cluster=9)
    # The label is part of the file name so that each iteration keeps its own plots
    # instead of overwriting those of the previous label.
    tf.plot_heirarchy(results['reordered_distances'], groupname=f'aGPCR GAIN', savename=f'{run_prefix}_{r}_sdb_heirarchy.png')
    tf.plot_matrix(results['reordered_distances'], title=f'aGPCR GAIN', savename=f'{run_prefix}_{r}_sdb_ordered_matrix.png')
    #print(results.keys())
    #print(results['overall_best_gain'], results['cluster_best_gains'])
    all_best = gain_subset[results['overall_best_gain']].name
    # One (name, cluster index, cluster size) tuple per cluster centroid.
    best_list = []
    for i, c in results['cluster_best_gains']:
        best_list.append((gain_subset[i].name, c, results['cluster_sizes'][c]))
    best_structures[r] = all_best
    best_clusters[r] = best_list
    print("Done with run",r)

In [None]:
# Show the overall centroid name per run key, then the (name, cluster, size) tuples per run key.
print(best_structures)
print(best_clusters)


#### After creating a set of potential templates, they will each be evaluated against the whole set to remove arbitrarily similar templates, since not every receptor needs a unique template if they are too similar.

In [None]:
# Recap of the most recent clustering run still held in `results` / `gain_subset`:
# the overall centroid and one (name, cluster index, cluster size) tuple per cluster.
all_best = gain_subset[results['overall_best_gain']].name
best_list = [
    (gain_subset[idx].name, cluster, results['cluster_sizes'][cluster])
    for idx, cluster in results['cluster_best_gains']
]
print(all_best)
print(best_list)

After the directories with the potential templates have been created, we proceed to __template_testing.ipynb__ to evaluate them and select our final set.