In [1]:
# DEPENDENCIES
import glob
#from shutil import copyfile
import numpy as np
import pandas as pd
#import logomaker
# LOCAL IMPORTS
#from indexing_classes import GPCRDBIndexing
import sse_func
import matplotlib.pyplot as plt
import template_finder as tf

def calc_identity(aln_matrix, return_best_aa=False):
    # This takes an alignment matrix with shape=(n_columns, n_sequences) and generates counts based on the identity matrix.
    # Returns the highest non "-" residue count as the most conserved residue and its occupancy based on count("-") - n_struc
    n_struc = aln_matrix.shape[0]
    quality = []
    occ = []
    aa = []
    for col in range(aln_matrix.shape[1]):
        chars, count = np.unique(aln_matrix[:,col], return_counts=True)
        dtype = [('aa', 'S1'), ('counts', int)]
        values = np.array(list(zip(chars,count)), dtype=dtype)
        s_values = np.sort(values, order='counts')

        if s_values[-1][0] == b'-':
            q = s_values[-2][1]
            aa.append(s_values[-2][0])
        else:
            q = s_values[-1][1]
            aa.append(s_values[-1][0])
        x = np.where(chars == '-')[0][0]
        occ.append(n_struc - count[x])
        quality.append(q)
    if not return_best_aa:
        return quality, occ
    if return_best_aa:
        return quality, occ, aa


Load the valid GAIN domain Dataset and specify the pdb path.

In [2]:
# In addition to the collection, we need the corresponding PDB files.
valid_collection = pd.read_pickle("../valid_collection.pkl")
allpdbs = '../all_pdbs/*.pdb'
print(len(valid_collection.collection))

14435


In [3]:
# Get the Data for the initially selected templates
def get_gain(identifier, a_gain_collection):
    for gain in a_gain_collection:
        if identifier in gain.name:
            return gain
        
def get_struc_aln_anchors(gain, aln_dict, subdomain='a', threshold=3):
    aln_matrix = np.array([list(seq) for seq in aln_dict.values()])
    # Get the identity scores from the alignment
    quality, occ, aa = calc_identity(aln_matrix, return_best_aa=True)
    # The columns here exactly correspond to the template sequence order
    if subdomain.lower() == 'a':
        sse = gain.sda_helices
        d_string = "HELIX "
        sse_type = "H"
    elif subdomain.lower() == 'b':
        sse = gain.sdb_sheets
        d_string = "STRAND"
        sse_type = "S"
    else:
        print("NO SUBDOMAIN specified. EXITING.")
    
    anchor_quality = {}
    anchors = {}
    counter = 1

    for i,element in enumerate(sse):
        if element[1]-element[0] <= threshold:
            print("Element length below threshold. Skipping.", element)
            continue
        if subdomain =='a' and gain.start+element[0] > gain.subdomain_boundary:
            print("Skipping Subdomain A Helix", element)
            continue

        q = quality[element[0]:element[1]+1]
        label = f'{sse_type}{counter}'
        max_id = element[0]+np.argmax(q)+1
        max_res = gain.sequence[max_id]

        res_id = gain.start+max_id+1

        print(f"{d_string} #{i+1}: {max_res}{res_id} @ SSE residue {max_id-element[0]} | q = {np.max(q)} with res_idx {max_id} | MOST CONSERVED: {aa[max_id]} | PDB-res {gain.start+element[0]+1}-{gain.start+element[1]+1}")
        anchor_quality[label] = np.max(q)
        anchors[label] = max_id
        counter += 1
        pdb_anchors = {v:k+gain.start+1 for v,k in anchors.items()}
    print("__________")
    return anchors, anchor_quality, pdb_anchors

def get_template_information(identifier, gain_collection, subdomain='a', threshold=3, no_input=True):
    for gain in gain_collection.collection:
        if identifier in gain.name:
            print(gain.name, gain.start, gain.subdomain_boundary, gain.end, "\n")

            if subdomain.lower() == 'a':
                sse = gain.sda_helices
                d_string = "HELIX "
                sse_type = "H"
            elif subdomain.lower() == 'b':
                sse = gain.sdb_sheets
                d_string = "STRAND"
                sse_type = "S"
            else:
                print("NO SUBDOMAIN specified. EXITING.")
        
            #print(sse)
            anchor_quality = {}
            anchors = {}
            counter = 1
            aln_indices = []
            for i,element in enumerate(sse):
                if element[1]-element[0] <= threshold:
                    print("Element length below threshold. Skipping.", element)
                    continue
                if subdomain =='a' and gain.start+element[0] > gain.subdomain_boundary:
                    print("Skipping Subdomain A Helix", element)
                    continue
                label = f'{sse_type}{counter}'
                q = [ gain.residue_quality[res] for res in range(element[0], element[1]+1)]
                max_id = element[0]+np.argmax(q)
                max_res = gain.sequence[max_id]
                #aln_idx = gain.alignment_indices[max_id]
                res_id = gain.start+max_id+1
                print(f"{d_string} #{i+1}: {max_res}{res_id} @ SSE residue {max_id-element[0]} | q = {np.max(q)} with res_idx {max_id} | {q} | {gain.start+element[0]}-{gain.start+element[1]}")
                if not no_input:
                    confirm = input(f"{d_string} #{i+1}: {max_res}{res_id} @ SSE re {max_id-element[0]} | q={np.max(q)} w res_idx {max_id} | {gain.start+element[0]}-{gain.start+element[1]}. Keep?")
                    if confirm.lower() != "y":
                        print("Skipping this anchor.");continue
                anchor_quality[label] = np.max(q)
                anchors[label] = max_id
                aln_indices.append(gain.alignment_indices[max_id])
                counter += 1
            pdb_anchors = {v:k+gain.start+1 for v,k in anchors.items()}
            print("__________")
            return anchors, anchor_quality, aln_indices, pdb_anchors

# Test:
# _,_,_,_ = get_template_information('A0A6G1Q0B9', valid_collection, 'a')

In [8]:
# Construct a GESAMT bashfile for pairwise aln to each subdomain. Output the resulting PDB into respective folder
#SDB TEMPLATE

sdb_r1_template = {'E5b':'A0A3P8S994-A0A3P8S994_AMPPE-AGRE5b,duplicate2-Amphiprion_percula'}
sdb_templates = {'E5b':'A0A3P8S994-A0A3P8S994_AMPPE-AGRE5b,duplicate2-Amphiprion_percula',
                 #'G1b':'A0A7L3GD10-A0A7L3GD10_9AVES-AGRG1-Anhinga_rufa', 
                 #'G3b':'A0A3Q7QGV6-A0A3Q7QGV6_CALUR-AGRG3-likeisoformX3-Callorhinus_ursinus',
                 'G5b':'A0A6J3IBI5-A0A6J3IBI5_SAPAP-AGRG5-Sapajus_apella'}
# SDA TEMPLATEs
sda_r1_templates = {'A': 'A0A2Y9F628-A0A2Y9F628_PHYMC-AGRA3isoformX1-Physeter_macrocephalus', 
                    'B': 'A0A4W6DVA0-A0A4W6DVA0_LATCA-AGRB1b-Lates_calcarifer', 
                    'C': 'A0A7K6E127-A0A7K6E127_9PASS-CELR3protein-Grantiella_picta.', 
                    'D': 'A0A1A7WJQ6-A0A1A7WJQ6_9TELE-GR144-Iconisemion_striatum.', 
                    'E': 'A0A3P8S994-A0A3P8S994_AMPPE-AGRE5b,duplicate2-Amphiprion_percula', 
                    'F': 'A0A452IH20-A0A452IH20_9SAUR-AGRF5-Gopherus_agassizii', 
                    'G': 'A0A1W4WJB1-A0A1W4WJB1_AGRPL-AGRG6-likeisoformX1-Agrilus_planipennis', 
                    'L': 'A0A452HCU9-A0A452HCU9_9SAUR-AGRL3-Gopherus_agassizii', 
                    'V': 'A0A6Q2XYK2-A0A6Q2XYK2_ESOLU-AGRV1-Esox_lucius',
                    'X': "A0A6F9A857-A0A6F9A857_9TELE-Uncharacterizedprotein-Coregonus_sp._'balchen'."}

sda_r2_templates = {'G1': 'A0A7L3GD10-A0A7L3GD10_9AVES-AGRG1-Anhinga_rufa', 
                    'G2': 'A0A2K5MG19-A0A2K5MG19_CERAT-AGRG2-Cercocebus_atys', 
                    'G3': 'A0A3Q7QGV6-A0A3Q7QGV6_CALUR-AGRG3-likeisoformX3-Callorhinus_ursinus', 
                    'G4': 'A0A2I4CCH8-A0A2I4CCH8_9TELE-AGRG4-like-Austrofundulus_limnaeus.', 
                    'G5': 'A0A6J3IBI5-A0A6J3IBI5_SAPAP-AGRG5-Sapajus_apella', 
                    'G6': 'A0A6P7HB06-A0A6P7HB06_9TELE-AGRG6isoformX6-Parambassis_ranga', 
                    #'G6.1': 'F6QI92-F6QI92_CALJA-AGRG6-Callithrix_jacchus', 
                    #'G6.2':'A0A7J7WUN2-A0A7J7WUN2_MYOMY-AGRG6-Myotis_myotis',
                    'G7': 'A0A2K5Y1I7-A0A2K5Y1I7_MANLE-AGRG7-Mandrillus_leucophaeus', 
                    'E1': 'A0A2I2YJG7-A0A2I2YJG7_GORGO-AGRE1-Gorilla_gorilla_gorilla', 
                    'E2': 'A0A2Y9QG39-A0A2Y9QG39_TRIMA-AGRE2isoformX3-Trichechus_manatus_latirostris', 
                    'E3': 'A0A2Y9M464-A0A2Y9M464_DELLE-AGRE3isoformX1-Delphinapterus_leucas', 
                    'E4': 'A0A6J3FRL0-A0A6J3FRL0_SAPAP-putativeAGRE4PisoformX1-Sapajus_apella', 
                    'E5': 'G1TKX5-G1TKX5_RABIT-AGRE5-Oryctolagus_cuniculus', 
                    #'E5.1': 'A0A2R9CT02-A0A2R9CT02_PANPA-AGRE5-Pan_paniscus',
                    #'E5.2': 'F6PLI2-F6PLI2_CANLF-AGRE5-Canis_lupus_familiaris',
                    'F2': 'A0A452SUX4-A0A452SUX4_URSAM-AGRF2-Ursus_americanus', 
                    #'F2.1':'A0A3Q0CU45-A0A3Q0CU45_MESAU-AGRF2-Mesocricetus_auratus',  
                    #'F2.2':'E2RAG2-E2RAG2_CANLF-AGRF2-Canis_lupus_familiaris',
                    'F4': 'W5PQ70-W5PQ70_SHEEP-AGRF4-Ovis_aries', 
                    'F5': 'A0A7L3N0A5-A0A7L3N0A5_9AVES-AGRF5protein-Oreotrochilus_melanogaster.', 
                    #'F5.1':'A0A1U7SCS2-A0A1U7SCS2_ALLSI-AGRF5isoformX1-Alligator_sinensis',
                    #'F5.2':'A0A7K5GSD3-A0A7K5GSD3_9AVES-AGRF5protein-Chunga_burmeisteri',
                    'D1': 'A0A369SLT5-A0A369SLT5_9METZ-AGRD1-Trichoplax_sp._H2.', 
                    'L4': 'A0A7L3KTA8-A0A7L3KTA8_9PASS-AGRL4protein-Drymodes_brunneopygia.'}

sda_templates = {
    'A': 'A0A2Y9F628-A0A2Y9F628_PHYMC-AGRA3isoformX1-Physeter_macrocephalus',
    'C': 'A0A7K6E127-A0A7K6E127_9PASS-CELR3protein-Grantiella_picta.', 
    'D': 'A0A1A7WJQ6-A0A1A7WJQ6_9TELE-GR144-Iconisemion_striatum.', 
    'E1': 'A0A2I2YJG7-A0A2I2YJG7_GORGO-AGRE1-Gorilla_gorilla_gorilla', 
    'E5': 'G1TKX5-G1TKX5_RABIT-AGRE5-Oryctolagus_cuniculus', 
    'F5': 'A0A7L3N0A5-A0A7L3N0A5_9AVES-AGRF5protein-Oreotrochilus_melanogaster.',
    'F4': 'W5PQ70-W5PQ70_SHEEP-AGRF4-Ovis_aries', 
    'G7': 'A0A2K5Y1I7-A0A2K5Y1I7_MANLE-AGRG7-Mandrillus_leucophaeus', 
    'L': 'A0A452HCU9-A0A452HCU9_9SAUR-AGRL3-Gopherus_agassizii', 
    'L4': 'A0A7L3KTA8-A0A7L3KTA8_9PASS-AGRL4protein-Drymodes_brunneopygia.',
    'V': 'A0A6Q2XYK2-A0A6Q2XYK2_ESOLU-AGRV1-Esox_lucius'
}

def find_pdb(name, pdb_folder):
    identifier = name.split("-")[0]
    target_pdb = glob.glob(f"{pdb_folder}/*{identifier}*.pdb")[0]
    return target_pdb

#for t_name, sdb_template in sdb_r1_template.items():
for i in range(1):
    t_name, sdb_template = 'G5b','A0A6J3IBI5-A0A6J3IBI5_SAPAP-AGRG5-Sapajus_apella'
    sdb_template_pdb = find_pdb(sdb_template, '../r2_template_pdbs')#'../r2_sdb_templates')
    print(sdb_template_pdb)
    tf.run_gesamt_execution(valid_collection.collection, 
                                outfolder=f"../{t_name}_{sdb_template.split('-')[0]}_sdb",
                                pdb_folder='../all_pdbs', 
                                domain='sdb', 
                                n_threads=6, 
                                max_struc=len(valid_collection.collection), 
                                no_run=False,
                                template=sdb_template_pdb)

for fam, prot in sda_templates.items():
    identifier = prot.split("-")[0]
    current_template = find_pdb(prot, '../r2_template_pdbs')

    gesamt_outfolder = f'../{fam}_{identifier}_sda'

"""    tf.run_gesamt_execution(valid_collection.collection, 
                            outfolder=gesamt_outfolder, 
                            pdb_folder='../all_pdbs', 
                            domain='sda', 
                            n_threads=6, 
                            max_struc=len(valid_collection.collection), 
                            no_run=False,
                            template=current_template)"""

../r2_template_pdbs/G5b_A0A6J3IBI5.pdb
Created ../G5b_A0A6J3IBI5_sdb
Written a total of 14435 GESAMT commands to file.
Running set of GESAMT comparisons with 6 threads.
done.


"    tf.run_gesamt_execution(valid_collection.collection, \n                            outfolder=gesamt_outfolder, \n                            pdb_folder='../all_pdbs', \n                            domain='sda', \n                            n_threads=6, \n                            max_struc=len(valid_collection.collection), \n                            no_run=False,\n                            template=current_template)"

In [9]:
r1_templates = {'sdb':['A0A3P8S994-A0A3P8S994_AMPPE-AGRE5b,duplicate2-Amphiprion_percula', 'b', '../A0A3P8S994_sdb'],
                    'A': ['A0A2Y9F628-A0A2Y9F628_PHYMC-AGRA3isoformX1-Physeter_macrocephalus', 'a', '../A0A2Y9F628_A_sda'], 
                    'B': ['A0A4W6DVA0-A0A4W6DVA0_LATCA-AGRB1b-Lates_calcarifer', 'a', '../A0A4W6DVA0_B_sda'], 
                    'C': ['A0A7K6E127-A0A7K6E127_9PASS-CELR3protein-Grantiella_picta.', 'a', '../A0A7K6E127_C_sda'], 
                    'D': ['A0A1A7WJQ6-A0A1A7WJQ6_9TELE-GR144-Iconisemion_striatum.', 'a', '../A0A1A7WJQ6_D_sda'], 
                    'E': ['A0A3P8S994-A0A3P8S994_AMPPE-AGRE5b,duplicate2-Amphiprion_percula', 'a', '../A0A3P8S994_E_sda'], 
                    'F': ['A0A452IH20-A0A452IH20_9SAUR-AGRF5-Gopherus_agassizii', 'a', '../A0A452IH20_F_sda'], 
                    'G': ['A0A1W4WJB1-A0A1W4WJB1_AGRPL-AGRG6-likeisoformX1-Agrilus_planipennis', 'a', '../A0A1W4WJB1_G_sda'], 
                    'L': ['A0A452HCU9-A0A452HCU9_9SAUR-AGRL3-Gopherus_agassizii', 'a', '../A0A452HCU9_L_sda'], 
                    'V': ['A0A6Q2XYK2-A0A6Q2XYK2_ESOLU-AGRV1-Esox_lucius', 'a', '../A0A6Q2XYK2_V_sda'],
                    'X': ["A0A6F9A857-A0A6F9A857_9TELE-Uncharacterizedprotein-Coregonus_sp._'balchen'.", 'a', '../A0A6F9A857_X_sda']}

# fuse sda_templates and sdb_templates together to be in the form of:
#    'A': ['A0A2Y9F628-A0A2Y9F628_PHYMC-AGRA3isoformX1-Physeter_macrocephalus', 'a', '../A0A2Y9F628_A_sda'],
templates = {}
for t_id, t_name in {**sdb_templates, **sda_templates}.items():
    if "b" in t_id: sd = 'b'
    else:           sd = 'a'
    
    folder_string = f"../{t_id}_{t_name.split('-')[0]}_sd{sd}"
    templates[t_id] = [t_name, sd, folder_string]

print("Fused SDA and SDB templates into a single dictionary:", templates)

template_anchors = {}
template_quality = {}
template_indices = []

all_indices = []
for k, v in templates.items():

    if 'b' in k:
        threshold = 1
    else:
        threshold = 4
    
    template_gain = get_gain(v[0].split("-")[0], valid_collection.collection)
    #raw_anchors, a_qual, indices, anchors = get_template_information(v[0].split("-")[0], valid_collection, v[1], threshold=threshold)
    structural_alignment = tf.construct_structural_alignment(template_gain_domain=template_gain,
                                                             list_of_gain_obj=valid_collection.collection,
                                                             gain_indices=range(len(valid_collection.collection)),
                                                             gesamt_folder=v[2],
                                                             outfile=f'../{v[0].split("-")[0]}_{k}.struc_aln.fa',
                                                             debug=False)
    #print(structural_alignment)
    a_qual, indices, anchors = get_struc_aln_anchors(gain=template_gain,
                                                     aln_dict=structural_alignment,
                                                     subdomain=v[1],
                                                     threshold=threshold)
    print(a_qual, indices, anchors)
    template_anchors[k] = anchors
    template_quality[k] = a_qual
    if 'b' in k:
        continue
    for i in indices:
        template_indices.append(i)
    all_indices.append(indices)

print(template_anchors)
print(template_quality)
a,b  = np.unique(template_indices, return_counts=True)
a_counts = dict(zip(a,b)) 
anchor_col = {x:i for i,x in enumerate(a)}
print(anchor_col, len(a))
hasAnchor = np.zeros(shape=(len(a), len(all_indices)))
print(hasAnchor.shape)
for fam_count, indices in enumerate(all_indices):
    print(indices)
    for v in indices:
        print(v)
        hasAnchor[anchor_col[v], fam_count] = 1
scwabbel

Fused SDA and SDB templates into a single dictionary: {'E5b': ['A0A3P8S994-A0A3P8S994_AMPPE-AGRE5b,duplicate2-Amphiprion_percula', 'b', '../E5b_A0A3P8S994_sdb'], 'G5b': ['A0A6J3IBI5-A0A6J3IBI5_SAPAP-AGRG5-Sapajus_apella', 'b', '../G5b_A0A6J3IBI5_sdb'], 'A': ['A0A2Y9F628-A0A2Y9F628_PHYMC-AGRA3isoformX1-Physeter_macrocephalus', 'a', '../A_A0A2Y9F628_sda'], 'C': ['A0A7K6E127-A0A7K6E127_9PASS-CELR3protein-Grantiella_picta.', 'a', '../C_A0A7K6E127_sda'], 'D': ['A0A1A7WJQ6-A0A1A7WJQ6_9TELE-GR144-Iconisemion_striatum.', 'a', '../D_A0A1A7WJQ6_sda'], 'E1': ['A0A2I2YJG7-A0A2I2YJG7_GORGO-AGRE1-Gorilla_gorilla_gorilla', 'a', '../E1_A0A2I2YJG7_sda'], 'E5': ['G1TKX5-G1TKX5_RABIT-AGRE5-Oryctolagus_cuniculus', 'a', '../E5_G1TKX5_sda'], 'F5': ['A0A7L3N0A5-A0A7L3N0A5_9AVES-AGRF5protein-Oreotrochilus_melanogaster.', 'a', '../F5_A0A7L3N0A5_sda'], 'F4': ['W5PQ70-W5PQ70_SHEEP-AGRF4-Ovis_aries', 'a', '../F4_W5PQ70_sda'], 'G7': ['A0A2K5Y1I7-A0A2K5Y1I7_MANLE-AGRG7-Mandrillus_leucophaeus', 'a', '../G7_A0A2K5Y1I

NameError: name 'scwabbel' is not defined

After running the pairwise GESAMT, we can use the resulting OUT and PDB files for analyzing.

In [None]:
print(a,b)
# Plot the anchors with their respective alignment columns
# This won't work with the StAl-based anchors, since we have one alignment for each template and the index is simply the residue index
fig = plt.figure(figsize=[4,4], facecolor='w')
im = plt.imshow(hasAnchor.T, cmap='gray')
ax = plt.gca()
ax.set_xticks(range(len(a)))
ax.set_yticks(range(len(templates.keys())))
ax.set_xticklabels(a, rotation=90)
ax.set_yticklabels(templates.keys())
ax.set_xticks(np.arange(-.5, len(a), 1), minor=True)
ax.set_yticks(np.arange(-.5, len(templates.keys()), 1), minor=True)
plt.xlabel("Helix Anchor column")
plt.ylabel("Subfamily SDA Template")
ax.grid(which='minor', linewidth=2)
plt.savefig("../r2_StAl_sda_template_anchors.png",dpi=300, bbox_inches='tight')

In [None]:
fam_list = [tf.get_agpcr_type(gain.name) for gain in valid_collection.collection]
name_list = [gain.name for gain in valid_collection.collection]
subfam_list = [x[0] for x in fam_list]
receptors, counts  = np.unique(fam_list, return_counts=True)
r_list = list(zip(receptors,counts))
print(r_list)
print(receptors)
fam_counts = {}
for prot in fam_list:
    fam = prot[0]
    if fam not in fam_counts.keys():
        fam_counts[fam] = 0
    fam_counts[fam] += 1

print(fam_counts)

In [None]:
# match everything for each subfamily.
#print(subfam_list)

y = len(r_list)
# famstring
template_ids = list(templates.keys())

t_occupancies = {}
t_distances = {}
unmatched = {}
unmatched_counters = {}
for t_id in template_ids:
    t_anchors = template_anchors[t_id]
    # SDB modified. t_anchors = {'S1': 324, 'S2': 335, 'S3': 353, 'S4': 359, 'S5': 381, 'S6': 409, 'S7': 414, 'S8': 436, 'S9': 453, 'S10': 459, 'S11': 470, 'S12': 478, 'S13': 487}
    #t_anchors = {'S1': 324, 'S2': 335, 'S3': 349, 'S4': 359, 'S5': 381, 'S6': 409, 'S7': 414, 'S8': 436, 'S9': 453, 'S10': 459, 'S11': 466, 'S12': 478, 'S13': 487}
    t_quality = template_quality[t_id]
    t_folder = templates[t_id][-1]
    n_anch = len(t_anchors.keys())
    u_list = np.zeros(shape=(y), dtype=dict)
    u_counters = np.zeros(shape=(y), dtype=int)
    print(t_anchors)
    anchor_index = {k:i for i, k in enumerate(t_anchors.keys())}
    assigned_anchor_freq = np.zeros(shape=(len(receptors),n_anch))
    all_anchor_averages = np.full(shape=(y,n_anch), fill_value=None)
    all_anchor_occupancy = np.zeros(shape=(y,n_anch))

    if 'b' in t_id: sd_string = 'sdb'
    else: sd_string = 'sda'

    for fam_idx, r in enumerate(receptors):# in enumerate('ABCDEFGLVX'):
        gain_subset = [ gain for i, gain in enumerate(valid_collection.collection) if fam_list[i] == r ]#subfam_list[i]==r ]
        gain_idx_list = [ i for i,gain in enumerate(fam_list) if gain == r ]
        #print(r, len(gain_subset))

        element_occupation = {k:0 for k in t_anchors.keys()}

        for key, val in element_occupation.items():
            assigned_anchor_freq[fam_idx, anchor_index[key]] = float(val)/len(gain_subset)
        #DEBUG:
        #for gain in gain_subset: 
        #    print(gain.name, [hel for hel in gain.sda_helices if hel[0] < gain.subdomain_boundary-gain.start])
        
        fam_distances, fam_matched_anchors, unmatched_elements, unmatched_counter = tf.gain_set_to_template(gain_subset, 
                                                                                                            gain_idx_list, 
                                                                                                            t_anchors, 
                                                                                                            t_folder, 
                                                                                                            penalty=None,
                                                                                                            subdomain=sd_string,
                                                                                                            return_unmatched_mode='all', 
                                                                                                            debug=False)
        #print("DEBUG: ROOT" , unmatched_elements)
        mean_dist = np.empty(shape=(n_anch))
        occ = np.zeros(shape=(n_anch))
        
        for j in range(n_anch):
            occ_values = np.array([d for d in fam_distances[:,j] if d is not None])
            if len(occ_values) != 0:
                mean_dist[j] = round(np.mean(occ_values), 3)
                occ[j] = round(np.count_nonzero(fam_distances[:,j])/len(gain_idx_list), 3)
        all_anchor_averages[fam_idx,:] = mean_dist #np.mean(fam_distances, axis=0)
        all_anchor_occupancy[fam_idx,:] = occ
        u_counters[fam_idx] = unmatched_counter
        u_list[fam_idx] = unmatched_elements
        #print(all_anchor_averages)
        #print(all_anchor_occupancy)
    print(u_list.shape) # u_list is a list of dicts.
    print(type(u_list[0]), type(u_list[1]))
    print(f"Done with Template {t_id}.\n", "_"*30)

    t_distances[t_id] = all_anchor_averages
    t_occupancies[t_id] = all_anchor_occupancy
    unmatched[t_id] = u_list
    unmatched_counters[t_id] = u_counters
#print(unmatched_counters)
#print(type(unmatched))


In [None]:
# Plot the OCCUPANCY
for t_id in ["F5"]:#template_ids:
    t_anchors = template_anchors[t_id]
    # SDB modfied anchors : t_anchors = {'S1': 324, 'S2': 335, 'S3': 353, 'S4': 359, 'S5': 381, 'S6': 409, 'S7': 414, 'S8': 436, 'S9': 453, 'S10': 459, 'S11': 470, 'S12': 478, 'S13': 487}
    #t_anchors = {'S1': 324, 'S2': 335, 'S3': 349, 'S4': 359, 'S5': 381, 'S6': 409, 'S7': 414, 'S8': 436, 'S9': 453, 'S10': 459, 'S11': 466, 'S12': 478, 'S13': 487}
    n_anch = len(t_anchors.keys())
    t_anchor_freqs = t_occupancies[t_id]
    u_counters = unmatched_counters[t_id]
    fig = plt.figure(figsize=[6,10], facecolor='w')
    #plt.yticks(ticks = range(ydim), labels= [f'ADGR{f}' for f in 'ABCDEFGLVX'])
    plt.title(f"Template Match for : {t_id}")
    plt.yticks(ticks = range(len(r_list)), labels= [f'{i[0]}:{i[1]} (u:{u_counters[x]})' for x,i in enumerate(r_list)])
    #plt.yticks(ticks = range(len(r_list)), labels= [f'{i[0]}:{i[1]}' for x,i in enumerate(r_list)])
    plt.xticks(ticks = range(n_anch), labels=t_anchors.keys(), rotation=90)
    #plt.imshow(t_anchor_freqs, cmap='summer')
    distances = np.zeros(shape=(len(r_list), n_anch), dtype=float)
    #t_dists = t_distances[t_id]
    #for i,l in enumerate(t_dists):
    #    distances[i,:] = l
    #plt.imshow(distances, cmap='spring')
    plt.imshow(t_anchor_freqs, cmap='summer')
    cbar = plt.colorbar(shrink=0.5)
    cbar.set_label('Relative Occupancy')
    #cbar.set_label('AA-Distance')
    ydim = len(r_list)
    for y in range(ydim):
        for x in range(n_anch):
            if t_anchor_freqs[y,x] > 0.001:
                pass
            else:
                plt.text(x,y,'x', horizontalalignment='center', verticalalignment='center', fontsize=18,color='k')
    plt.savefig(f'r2_StAl{t_id}_occ.png', dpi=300)

In [None]:
# Plot the DISTANCES
for t_id in template_ids[-1:]:
    t_anchors = template_anchors[t_id]
    # SDB modfied anchors : t_anchors = {'S1': 324, 'S2': 335, 'S3': 353, 'S4': 359, 'S5': 381, 'S6': 409, 'S7': 414, 'S8': 436, 'S9': 453, 'S10': 459, 'S11': 470, 'S12': 478, 'S13': 487}
    #t_anchors = {'S1': 324, 'S2': 335, 'S3': 349, 'S4': 359, 'S5': 381, 'S6': 409, 'S7': 414, 'S8': 436, 'S9': 453, 'S10': 459, 'S11': 466, 'S12': 478, 'S13': 487}
    n_anch = len(t_anchors.keys())
    t_anchor_freqs = t_occupancies[t_id]
    u_counters = unmatched_counters[t_id]
    fig = plt.figure(figsize=[6,10], facecolor='w')
    #plt.yticks(ticks = range(ydim), labels= [f'ADGR{f}' for f in r_list])
    plt.title(f"Template Match for : {t_id}")
    #plt.yticks(ticks = range(len(r_list)), labels= [f'{i[0]}:{i[1]} (u:{u_counters[x]})' for x,i in enumerate(r_list)])
    plt.yticks(ticks = range(len(r_list)), labels= [f'{i[0]}:{i[1]}' for i in r_list])
    plt.xticks(ticks = range(n_anch), labels=t_anchors.keys(), rotation=90)
    #plt.imshow(t_anchor_freqs, cmap='summer')
    distances = np.zeros(shape=(len(r_list), n_anch), dtype=float)
    t_dists = t_distances[t_id]
    for i,l in enumerate(t_dists):
        distances[i,:] = l
    plt.imshow(distances, cmap='spring')
    #plt.imshow(t_anchor_freqs, cmap='summer')
    cbar = plt.colorbar(shrink=0.5)
    #cbar.set_label('Relative Occupancy')
    cbar.set_label('AA-Distance')
    ydim = len(r_list)
    for y in range(ydim):
        for x in range(n_anch):
            if t_anchor_freqs[y,x] > 0.001:
                pass
            else:
                plt.text(x,y,'x', horizontalalignment='center', verticalalignment='center', fontsize=18,color='k')
    plt.savefig(f'r2_StAl{t_id}_dist.png', dpi=300)

In [None]:
outfile = open("../r2_StAl_unmatched_full.txt", 'w')
outfile.write("Temp  Grp   nGrp  alnIdx  nNoMat  avgLen  %unmat\n")
adress_matrix = [] # (my_template, col_names[id], value)
col_names = {} # A1-1094: 0
skip = 3

#print(r_list, len(r_list))
for t_index, t_id in enumerate(template_ids[skip:]):
    u_list = unmatched[t_id]

    #print(u_dict)
    for rud_idx, receptor_unmatched_dict in enumerate(u_list):
        
        e_length = []
        e_res = []
        res_len = {}
        all_items = []
        #print(receptor_unmatched_dict)
        for lst in receptor_unmatched_dict.values():
            lengths = [int(i[2]) for i in lst]
            e_length = e_length+lengths
            e_res += [i[0] for i in lst]
            
            for i in lst:
                if int(i[0]) not in res_len.keys():
                    res_len[int(i[0])] = [i[2]]
                else:
                    res_len[int(i[0])].append(i[2])
                all_items.append(i)
        res_av_len = {k:np.average(v) for k,v in res_len.items()}

        #print(np.average(e_length))
        resid, ct = np.unique(e_res, return_counts=True)
        where_many = {resid[k]:c for k,c in enumerate(ct) if c > 5}
        #print(where_many)
        #plt.bar(resid, ct)
        
        sel_length = r_list[rud_idx][1]
        receptor_name = r_list[rud_idx][0]
        for idx, count in enumerate(ct):
            if count > 0.1*sel_length and res_av_len[resid[idx]] > 3.5: # more than 10% of selection have this

                unindexed_freq = count/sel_length
                column_name = f"{receptor_name}-{str(resid[idx]).ljust(4)}"
                if column_name not in col_names.keys(): 
                    name_idx = len(col_names.keys())
                    col_names[column_name] = name_idx
                    
                else:
                    name_idx = col_names[column_name]
                adress_matrix.append( (t_index, name_idx, unindexed_freq) )

                outfile.write(f"{t_id}{receptor_name.rjust(7)}{str(sel_length).rjust(8)}")
                outfile.write(f"{str(resid[idx]).rjust(8)}{str(count).rjust(8)}{str(round(res_av_len[resid[idx]],1)).rjust(8)}{str(round(count*100/sel_length)).rjust(7)}%   ")
                for value in all_items[idx]:
                    outfile.write(str(value).rjust(8))#plt.bar(resid[idx], count)
                outfile.write("\n")
                #plt.annotate(f"{round(res_av_len[resid[idx]],1)}", (resid[idx],count))
outfile.close()

In [None]:
# now we can construct a DataFrame from this shizzle
unmatched_matrix = np.zeros(shape=(len(template_ids)-skip, len(col_names.keys())), dtype=float)
for item in adress_matrix:
    unmatched_matrix[item[0],item[1]] = item[2]
# Sort the matrix to match the receptor order in $receptors
sorted_unmatched_matrix = np.zeros(shape=(len(template_ids)-skip, len(col_names.keys())), dtype=float)
new_order = sorted(range(len(col_names.keys())), key=lambda k: list(col_names.keys())[k])
#print(new_order)
for data_col in range(len(col_names.keys())):
    sorted_unmatched_matrix[:,data_col] = unmatched_matrix[:,new_order[data_col]]

fig = plt.figure(figsize=[12,4], facecolor='w')
ax = plt.gca()
ax.imshow(sorted_unmatched_matrix, cmap='binary')
plt.xticks(ticks = range(len(col_names.keys())), labels = sorted(col_names.keys()) ,rotation=90, fontsize=4, verticalalignment='top')
plt.yticks(ticks = range(len(template_ids)-skip), labels=template_ids[skip:], fontsize=4)
stored_ki = 'A1'
r_bounds = []
for i, ki in enumerate(sorted(col_names.keys())):
    r = ki.split("-")[0]
    if r != stored_ki:
        stored_ki = r
        r_bounds.append(i)
for b in r_bounds:
    plt.vlines(b-0.5, -0.5, len(template_ids)-skip-0.5, color='r', linewidth=0.5)
plt.savefig("../r2_StAl_template_match1.png",dpi=300, bbox_inches='tight')

In [None]:
fig = plt.figure(figsize=[4,12], facecolor='w')
plt.imshow(sorted_unmatched_matrix)
stored_ki = 'A1'
r_bounds = []
for i, ki in enumerate(sorted(col_names.keys())):
    r = ki.split("-")[0]
    if r != stored_ki:
        stored_ki = r
        r_bounds.append(i)

#for b in r_bounds:
#    plt.vlines(b, 0, len(template_ids)-1, color='w', linewidth=0.5)

In [None]:
def sel2pymol(receptor, target_folder, find=False, stride=1):
    id_list = []
    for r in receptors:# in enumerate('ABCDEFGLVX'):
        if r != receptor:
            continue
        gain_subset = [ gain for i, gain in enumerate(valid_collection.collection) if fam_list[i] == r]#subfam_list[i]==r ]
        for gain in gain_subset:
            id_list.append(gain.name.split("-")[0])
        file_str = [(find_pdb(i, target_folder)) for i in id_list]
        print("pymol"," ".join(file_str))
    if find:
        for identifier in id_list[::stride]:
            print(f'find . -name \"*{identifier}*rank_1*pdb\" | tee -a {receptor}_found.txt')

sel2pymol('F2', '../all_pdbs/', True, stride=3)

In [None]:

ydim = 40
fig = plt.figure(figsize=[8,ydim/3], facecolor='w')
plt.imshow(docc, cmap='spring')
#plt.yticks(ticks = range(ydim), labels= [f'ADGR{f}' for f in 'ABCDEFGLVX'])
plt.yticks(ticks = range(ydim), labels= [f'{i[0]}:{i[1]}' for i in r_list])
plt.xticks(ticks = range(n_anch), labels= allsse, rotation=90)

all_anchor_averages # 10,21
for y in range(ydim):
    for x in range(n_anch):
        if all_anchor_averages[y,x] > 0.0001:
            plt.text(x,y, round(all_anchor_averages[y,x], 2), horizontalalignment='center', verticalalignment='center', fontsize=7,color='k', rotation=45)
cbar = plt.colorbar(shrink=float(8/ydim))
cbar.set_label('Relative Occupancy')
plt.vlines(6.5,-0.5,ydim-0.5, color='k', linewidth=1.5)
plt.savefig('identity_receptor_anchor_occupancy.png', dpi=300)

In [None]:

fig = plt.figure(figsize=[8,ydim/3], facecolor='w')
#plt.imshow(df, cmap='summer')
im_data = np.zeros(shape=(ydim, n_anch))

print(all_anchor_averages.shape)
#plt.yticks(ticks = range(10), labels= [f'ADGR{f}' for f in 'ABCDEFGLVX'])
plt.yticks(ticks = range(ydim), labels= [f'{i[0]}:{i[1]}' for i in r_list])
plt.xticks(ticks = range(n_anch), labels= allsse, rotation=90)
#all_anchor_averages # 10,21
for y in range(ydim):
    for x in range(n_anch):
        if all_anchor_averages[y,x] > 0.001:
            im_data[y,x] = all_anchor_averages[y,x]
        else:
            plt.text(x,y,'x', horizontalalignment='center', verticalalignment='center', fontsize=20,color='k')
            #patches.Rectangle((x,y), 1, 1, linewidth=0.5, edgecolor='k', facecolor='w')
plt.imshow(im_data, cmap='summer', vmax=3)
            #plt.text(x,y, round(all_anchor_averages[y,x], 2), horizontalalignment='center', verticalalignment='center', fontsize=7,color='k', rotation=45)
cbar = plt.colorbar(shrink=float(8)/ydim)
cbar.set_label(r'Closest Anchor Residue Distance [$\AA$]')
plt.vlines(6.5,-0.5,ydim-0.5, color='k', linewidth=1.5)
plt.savefig('identity_receptor_anchor_distance.png', dpi=300)

In [None]:
ydim = 40
fig = plt.figure(figsize=[8,ydim/3], facecolor='w')

#plt.yticks(ticks = range(ydim), labels= [f'ADGR{f}' for f in 'ABCDEFGLVX'])
plt.yticks(ticks = range(ydim), labels= [f'{i[0]}:{i[1]}' for i in r_list])
plt.xticks(ticks = range(n_anch), labels= allsse, rotation=90)
occ_values = df.to_numpy()
is_off = np.zeros(shape=(40,26))
print(docc.shape, all_anchor_averages.shape)
#all_anchor_averages # 10,21
for y in range(ydim):
    for x in range(n_anch):
        if all_anchor_averages[y,x] is not None and all_anchor_averages[y,x] > 1.5 and occ_values[y,x] > 0.1:
            is_off[y,x] = 1
            plt.text(x,y, round(all_anchor_averages[y,x], 2), horizontalalignment='center', verticalalignment='center', fontsize=7,color='k', rotation=45)
plt.imshow(is_off, cmap='spring')
cbar = plt.colorbar(shrink=float(8/ydim))
cbar.set_label('Relative Occupancy')
plt.vlines(6.5,-0.5,ydim-0.5, color='k', linewidth=1.5)
#plt.savefig('identity_receptor_anchor_occupancy.png', dpi=300)

In [None]:
sse_stats = np.zeros(shape = (n_anch, 2))

for fam_idx, r in enumerate(receptors):# in enumerate('ABCDEFGLVX'):
    print(r)
    gain_subset = [ gain for i, gain in enumerate(valid_collection.collection) if fam_list[i] == r]#subfam_list[i]==r ]
    gain_idx_list = [ i for i,gain in enumerate(fam_list) if gain == r ]
    n_sse = [[len(gain.sda_helices), len(gain.sdb_sheets)] for gain in gain_subset] # (n_struc, 2)
    n_strucs = np.mean(np.array(n_sse), axis=0)
    print(r, round(n_strucs[0], 2), round(n_strucs[1],2))


In [None]:
# new Anchor management.

#
# Use the "max" SDA / SDB template for generating the new anchors 
# get the center residue index for each template SSE
for gain in valid_collection.collection:
    if gain.name[:10] == 'A0A7K7IHI9': #SDA
        hel_centers = []
        for hel in gain.sda_helices: # each hel is a tuple
            hel_centers.append( gain.start + int((hel[0]+hel[1])/2) )
        hel_keys = [f'H{i+1}' for i in range(len(hel_centers))]
        sda_centers = dict(zip(hel_keys, hel_centers))
    if gain.name[:10] =='A0A3P9I6M5':
        sheet_centers = []
        for sheet in gain.sdb_sheets: # each hel is a tuple
            #if sheet[1] - sheet[0] < 3:
            #    print(gain.start+sheet[0], gain.start+sheet[1])
            sheet_centers.append( gain.start + int((sheet[0]+sheet[1])/2) )
        sheet_keys = [f'S{i+1}' for i in range(len(sheet_centers))]
        sdb_centers = dict(zip(sheet_keys, sheet_centers))
# Manually curated the centers to exclude two small strands in the CD between S6/S7 @ 707-708 and 711-713, respectively (low pLDDT here).
sda_centers = {'H1': 313, 'H2': 328, 'H3': 359, 'H4': 383, 'H5': 409, 'H6': 424, 'H7': 444}
sdb_centers = {'S1': 622, 'S2': 631, 'S3': 645, 'S4': 658, 'S5': 670, 'S6': 695, 'S7': 719, 'S8': 736, 'S9': 752, 'S10': 765, 'S11': 771, 'S12': 782, 'S13': 793}

# Find closest residue to the center (GESAMT), note down the sequence, start, end of the matched SSE; write to FASTA
    # A dict of dicts --> for each key, there is a dictionary inside sse_seqs['H1'][gain.name]:'seqlist'
all_keys = list({**sda_centers, **sdb_centers}.keys())
sse_seqs = {k:{} for k in all_keys}
sse_extents = {k:{} for k in all_keys}
unmatched = {k:0 for k in all_keys}
unstructured = {k:0 for k in all_keys}

for i, gain in enumerate(valid_collection.collection):
        a_gesamt_file = f'../sda_template_aligned_files/sda_{i}.out'
        b_gesamt_file = f'../sdb_template_aligned_files/sdb_{i}.out'

        sda_matches = tf.find_anchor_matches(a_gesamt_file, sda_centers, isTarget=False)
        sdb_matches = tf.find_anchor_matches(b_gesamt_file, sdb_centers, isTarget=False)
        #print(sda_matches, sdb_matches)
        hel_extents = np.full(shape = (gain.end-gain.start+1), fill_value=100)
        she_extents = np.full(shape = (gain.end-gain.start+1), fill_value=100)
        # Establish two matrices to match the respective residue to the index of its helix/sheet for easier matching
        for i,element in enumerate(gain.sda_helices):
            hel_extents[element[0]:element[1]] = i
        for i,element in enumerate(gain.sdb_sheets):
            she_extents[element[0]:element[1]] = i
        # Match the corresponding closest residue to find the associated SSE with start, end and sequence
        for sse, match in sda_matches.items():
            if match[0] is None:
                unmatched[sse] += 1
                continue

            sse_index = hel_extents[match[0]-gain.start]

            if sse_index == 100:
                unstructured[sse] += 1
                continue

            sse_extents[sse][gain.name] = gain.sda_helices[sse_index]
            sse_seqs[sse][gain.name] = gain.sequence[gain.sda_helices[sse_index][0]:gain.sda_helices[sse_index][1]]
        
        for sse, match in sdb_matches.items():
            if match[0] is None:
                unmatched[sse] += 1
                continue

            sse_index = she_extents[match[0]-gain.start]

            if sse_index == 100:
                unstructured[sse] += 1
                continue

            sse_extents[sse][gain.name] = gain.sdb_sheets[sse_index]
            sse_seqs[sse][gain.name] = gain.sequence[gain.sdb_sheets[sse_index][0]:gain.sdb_sheets[sse_index][1]]
        
for sse in all_keys:
    with open(f'../sse_aln/{sse}.seqs.fa','w') as fa:
        for name, seq in sse_seqs[sse].items():
            fa.write(f'>{name}\n{"".join(seq)}\n')

print(unmatched, '\n', unstructured)
#   Run MAFFT with each of the gathered sequences
#   For each MAFFT
#       Find the most conserved residue (Identity matrix)
#       Set as new Anchor.

In [None]:

for gain in valid_collection.collection:

    if gain.name[:10] == 'A0A7K7IHI9': #SDA
        sda_gain = gain
    if gain.name[:10] =='A0A3P9I6M5': # SDB
        sdb_gain = gain
for i,k in enumerate(sda_centers.keys()):
    kfile = glob.glob(f"../sse_aln/{k}.aln.fa")[0]
    with open(kfile) as alnf:
        x = alnf.readlines()[1].strip(" \n")
        kcutoff = len(x)
    print(kcutoff)
    aln = sse_func.read_alignment(kfile, cutoff=kcutoff)
    h = sda_gain.sda_helices[i]
    aln_matrix = np.array([list(seq) for seq in aln.values()])
    kquality, kocc = calc_identity(aln_matrix)
    '''    kquality = []
    kocc = []
    for col in range(aln_matrix.shape[1]):
        chars, count = np.unique(aln_matrix[:,col], return_counts=True)
        if chars[0] == '-':
            q = count[1]
        else:
            q = count[0]
        x = np.where(chars == '-')[0][0]
        kocc.append(14435 - count[x])
        kquality.append(q)'''
    template_aln_seq = aln[sda_gain.name]
    template_res_idx = np.argmax(kquality)
    print(template_aln_seq, template_res_idx)
    template_index = template_aln_seq[:template_res_idx+1]
    t_res = template_aln_seq[template_res_idx]
    print(template_index, t_res)
    new = template_index

    fig = plt.figure(figsize=[4,2], facecolor='w')
    plt.bar(range(kcutoff), kquality)
    plt.title(f'SDA TEMPLATE : {k}')
    plt.xticks(ticks = range(kcutoff), labels=template_aln_seq, fontsize=5)
    plt.savefig(f'../sse_aln/{k}.template1.png', dpi=300)
    plt.close(fig)

for i,k in enumerate(sdb_centers.keys()):
    kfile = glob.glob(f"../sse_aln/{k}.aln.fa")[0]
    with open(kfile) as alnf:
        x = alnf.readlines()[1].strip(" \n")
        kcutoff = len(x)
    print(kcutoff)
    aln = sse_func.read_alignment(kfile, cutoff=kcutoff)
    h = sdb_gain.sdb_sheets[i]
    aln_matrix = np.array([list(seq) for seq in aln.values()])
    kquality = []
    kocc = []
    kquality, kocc = calc_identity(aln_matrix)
        
    template_aln_seq = aln[sdb_gain.name]
    template_res_idx = np.argmax(kquality)
    print(template_aln_seq, template_res_idx)
    template_index = template_aln_seq[:template_res_idx+1]
    t_res = template_aln_seq[template_res_idx]
    print(template_index, t_res)
    new = template_index

    fig = plt.figure(figsize=[4,2], facecolor='w')
    plt.bar(range(kcutoff), kquality)
    plt.title(f'SDB TEMPLATE : {k}')
    plt.xticks(ticks = range(kcutoff), labels=template_aln_seq, fontsize=5)
    plt.savefig(f'../sse_aln/{k}.template1.png', dpi=300)
    plt.close(fig)

In [None]:
def factor2pdb(input_pdb, output_pdb, metric):
    # line [61:66] = xx.xx b factor
    # set to zero if not in metric, set to val otherwise
    with open(input_pdb) as ipdb:
        data = ipdb.readlines()
    newdata = []
    for l in data:
        if not l.startswith("ATOM"):
            newdata.append(l)
            continue
        
        resid = int(l[22:26])

        if resid not in metric.keys():
            l = l[:61]+"00.00"+l[66:]
            newdata.append(l)
            continue

        l = l[:61]+f'{metric[resid]:5.2f}'+l[66:]
        newdata.append(l)

    with open(output_pdb, 'w') as opdb:
        opdb.write("".join(newdata))
    
    print("Done.")

In [None]:
print(template_anchors)
template_ids = templates.keys()
print(templates)
target_folder = '../r2_f_templates/'
outstr = []
for t_id in template_ids:
    print(t_id)
    t_anchors = template_anchors[t_id]
    t_pdb = find_pdb(templates[t_id][0], '../all_pdbs/')

    t_metric = {v:1 for v in t_anchors.values()}
    outpdb = target_folder+t_id+".b.pdb"

    #factor2pdb(t_pdb, outpdb, t_metric)
    outstr.append(outpdb)

print(" ".join(outstr))

In [None]:
# compare this to the classical anchor representation using the CONSERVATION QUALITY from MAFFT
"""alignment_file = "/home/hildilab/projects/agpcr_nom/app_gain_gain.mafft.fa"
# This only contains the sigma files for truncated (?) PDBs.
#quality = sse_func.read_quality(quality_file)
gps_minus_one = 6781 
aln_cutoff = 6826 
alignment_dict = sse_func.read_alignment(alignment_file, aln_cutoff)
aln_matrix = np.array([list(seq) for seq in alignment_dict.values()])
#print(aln_matrix.shape)
quality, occ = calc_identity(aln_matrix)"""
precalc_anchors = [ 662, 1194, 1912, 2490, 2848, 3011, 3073, 3260, #H1-H8
            3455, 3607, 3998, 4279, 4850, 5339, #5341 S1-S6, S7 REMOVED!
            5413, 5813, 6337, 6659, 6696, 6765, 6808] #S8-13
precalc_anchor_occupation = [ 4594.,  6539., 11392., 13658.,  8862., 5092.,  3228., 14189., #H1-H8
                      9413., 12760.,  9420., 11201., 12283., 3676.,#  4562. S1-S6, S7 REMOVED!
                     13992., 12575., 13999., 14051., 14353., 9760., 14215.] #S8-13
precalc_anchor_dict = sse_func.make_anchor_dict(precalc_anchors, 3425)

print(templates)
target_folder = '../r2_f_templates/'
outstr = []
for t_id in template_ids:
    print(t_id)
    t_anchors = template_anchors[t_id]
    t_pdb = find_pdb(templates[t_id][0], '../all_pdbs/')

    # Find the template in the valid_collection
    for gain in valid_collection.collection:
        if gain.name.split("-")[0] == templates[t_id][0].split("-")[0]:
            _,centers,_,_ = gain.create_indexing(precalc_anchors, precalc_anchor_occupation, precalc_anchor_dict, 
                                            outdir=None, offset=0, silent=True, split_mode='single',debug=False)
            break
    #print(centers)
    
    t_metric = {v+gain.start+1:1 for v in centers.values()}
    outpdb = target_folder+t_id+".p.pdb"

    factor2pdb(t_pdb, outpdb, t_metric)
    outstr.append(outpdb)

print(" ".join(outstr))

In [None]:
def find_pdb(name, pdb_folder):
    identifier = name.split("-")[0]
    target_pdb = glob.glob(f"{pdb_folder}/*{identifier}*.pdb")[0]
    return target_pdb

templates = {
    'E5b':'A0A3P8S994-A0A3P8S994_AMPPE-AGRE5b,duplicate2-Amphiprion_percula',
    'G5b':'A0A6J3IBI5-A0A6J3IBI5_SAPAP-AGRG5-Sapajus_apella',
    'A': 'A0A2Y9F628-A0A2Y9F628_PHYMC-AGRA3isoformX1-Physeter_macrocephalus',
    'C': 'A0A7K6E127-A0A7K6E127_9PASS-CELR3protein-Grantiella_picta.', 
    'D': 'A0A1A7WJQ6-A0A1A7WJQ6_9TELE-GR144-Iconisemion_striatum.', 
    'E1': 'A0A2I2YJG7-A0A2I2YJG7_GORGO-AGRE1-Gorilla_gorilla_gorilla', 
    'E5': 'G1TKX5-G1TKX5_RABIT-AGRE5-Oryctolagus_cuniculus', 
    'F2': 'A0A452SUX4-A0A452SUX4_URSAM-AGRF2-Ursus_americanus', 
    'F4': 'W5PQ70-W5PQ70_SHEEP-AGRF4-Ovis_aries', 
    'G7': 'A0A2K5Y1I7-A0A2K5Y1I7_MANLE-AGRG7-Mandrillus_leucophaeus', 
    'L': 'A0A452HCU9-A0A452HCU9_9SAUR-AGRL3-Gopherus_agassizii', 
    'L4': 'A0A7L3KTA8-A0A7L3KTA8_9PASS-AGRL4protein-Drymodes_brunneopygia.',
    'V': 'A0A6Q2XYK2-A0A6Q2XYK2_ESOLU-AGRV1-Esox_lucius'
    }

target_folder = '../r2_template_pdbs/'
for t_id in templates.keys():
    print(t_id)
    print(templates[t_id])
    t_pdb = find_pdb(templates[t_id].split("-")[0], '../all_pdbs/')
    print(t_pdb)
    # Find the template in the valid_collection
    for gain in valid_collection.collection:
        if gain.name.split("-")[0] == templates[t_id].split("-")[0]:
            print(tf.get_pdb_extents(t_pdb, gain.subdomain_boundary))
            
            with open(t_pdb) as inpdb:
                data = inpdb.readlines()
            newdata = []
            for l in data:
                if not l.startswith("ATOM"):
                    newdata.append(l)
                    continue
                resid = int(l[22:26])
                if "b" in t_id and resid < gain.subdomain_boundary:
                    continue
                if "b" not in t_id and resid > gain.subdomain_boundary:
                    continue
                newdata.append(l)
            
            with open(f"{target_folder}{t_id}_{templates[t_id].split('-')[0]}.pdb", "w") as outpdb:
                outpdb.write("".join(newdata))
            
            continue