This is a notebook for the creation of a GainCollection object. The inputs are the AlphaFold/ColabFold models, which are validated and filtered, leaving only validly detected GAIN domains for use in the indexing scheme and template selection.

In [1]:
# DEPENDENCIES, see README.txt
import glob
from shutil import copyfile
import pickle
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import (MultipleLocator, FixedLocator)
import logomaker
# LOCAL IMPORTS
from indexing_classes import GPCRDBIndexing
from gain_classes import GainCollection
import sse_func

In [None]:
# Load the pickled collections (all valid GAIN domains and the human subset).
# NOTE(review): np.load(..., allow_pickle=True) unpickles arbitrary Python objects;
# only use it on files from a trusted source.
# NOTE(review): valid_collection is loaded a second time further down from
# '../valid_collection.q.pkl' (a different path) -- confirm which file is intended.
valid_collection = np.load('../data/valid_collection.q.pkl', allow_pickle=True)
human_collection = np.load('../data/human_collection.q.pkl', allow_pickle=True)

With the `stal_indexing` indexing object and the `valid_collection` collection, we can query the data for statistics on the anchors and on the composition of the individual secondary structure elements (SSEs).

In [None]:
#in_pdbs = glob.glob("/home/hildilab/projects/agpcr_nom/human_31/aligned/subd_B/trunc_pdbs/*.pdb")
# Planned statistics:
# For each SSE, get the number of structures having this anchor and stats of which residue is the .50 anchor (via np.where -> anchor)
# Get statistics of the length of each SSE
# Get statistics of loop lengths.
# This may also be subqueued to families...

# Load the pickled indexing object and (again) the collection of valid GAIN domains.
# NOTE(review): this overwrites the valid_collection loaded from '../data/valid_collection.q.pkl'
# in the cell above with a file from a different directory -- confirm which path is correct.
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only use it on trusted files.
stal_indexing=pd.read_pickle('../stal_indexing.r4.pkl')
valid_collection=pd.read_pickle('../valid_collection.q.pkl')
def get_loops(indexing_dir):
    """Compute the length of every loop between consecutive elements.

    indexing_dir maps an element label to a (start, end) pair. The elements are
    ordered by their start index; each pair of neighbours yields one entry
    "<N-terminal label>-<C-terminal label>" holding the number of residues
    strictly between them, i.e. {"H1-H2": 13, "H8-S1": 12}.
    """
    # Index every element by its start residue: {start: (end, label)}
    by_start = {bounds[0]: (bounds[1], label) for label, bounds in indexing_dir.items()}
    starts = sorted(by_start)
    loops = {}
    # Walk over neighbouring elements in N- to C-terminal order.
    for prev_start, next_start in zip(starts, starts[1:]):
        prev_end, prev_label = by_start[prev_start]
        next_label = by_start[next_start][1]
        loops[f"{prev_label}-{next_label}"] = next_start - prev_end - 1
    return loops

def get_sse_len(intervals, total_keys):
    """Return the length of each element of one GAIN domain.

    Every label in total_keys starts at length 0; labels present in intervals
    (label -> (start, end), both inclusive) are set to end - start + 1.
    """
    lengths = dict.fromkeys(total_keys, 0)
    for label, bounds in intervals.items():
        first, last = bounds[0], bounds[1]
        lengths[label] = last - first + 1
    return lengths

def get_pos_res(pos_dir, gain):
    """Map every label in pos_dir to the entry of gain.sequence at its index.

    pos_dir maps a label to an index into gain.sequence; the result maps the
    same labels to the sequence entries (one-letter codes) found there.
    """
    residues = {}
    for label, index in pos_dir.items():
        residues[label] = gain.sequence[index]
    return residues

def match_dirs(single_dir, collection_dir, exclude=()):
    """Append the values of one per-structure dict to a collection of lists.

    Parameters
    ----------
    single_dir : dict
        key -> value for a single structure.
    collection_dir : dict
        key -> list of values gathered so far. It is modified in place and
        also returned.
    exclude : container, optional
        Values contained in this container are skipped. The default is an
        immutable empty tuple instead of a shared mutable list.

    Returns
    -------
    dict
        The same collection_dir object, extended by the new values.
    """
    for key, value in single_dir.items():
        if value in exclude:
            continue
        # Create the list on first sight of a key, then append.
        collection_dir.setdefault(key, []).append(value)
    return collection_dir

In [None]:
# Collect the sequence of every GAIN domain, in the order of valid_collection.collection.
# Later cells index seqs by the same running index as the indexing object
# (assumes both are ordered identically -- TODO confirm).
seqs = [gain.sequence for gain in valid_collection.collection]

In [None]:
# Gather loop lengths, element lengths and pairwise element co-occurrence over all
# structures, then plot the co-occurrence counts as a matrix.
# NOTE(review): this import sits mid-notebook; `stats` is used by plot_hist further down.
from scipy import stats
#print(dir(all_base))
newkeys = ['H1','H2','H3','H4','H5','H6','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13','S14','GPS']
loop_lengths = {}
sse_lengths = {}
# NOTE(review): center_residues is never filled in the visible cells; a later cell reads
# `center_res` instead, which is not defined at module level.
center_residues = {}
# NOTE(review): the matrix is sized by stal_indexing.total_keys but indexed by positions in
# newkeys (21 entries) -- this only works if total_keys has at least 21 entries.
sse_matrix = np.zeros(shape=(len(stal_indexing.total_keys),len(stal_indexing.total_keys)))
for idx in range(stal_indexing.length):
    #Sanity Check - Do the identifiers match? Yes, they do.
    #if all_base.names[idx].split("-")[0] != valid_collection.collection[idx].name.split("-")[0]:
    #    print(all_base.names[idx].split("-")[0], valid_collection.collection[idx].name.split("-")[0])
    #    raise IndexError
    # NOTE(review): `all_base` is not defined in any visible cell; on a fresh kernel this
    # line raises NameError. Confirm which indexing object (and which of its attributes)
    # provides the label -> (start, end) intervals that get_loops expects.
    loop_lengths = match_dirs(get_loops(all_base.indexing_dirs[idx]), loop_lengths)
    # Lengths of 0 (element absent in this structure) are excluded from the statistics.
    sse_lengths = match_dirs(get_sse_len(stal_indexing.intervals[idx], stal_indexing.total_keys), sse_lengths, exclude=[0])

    present_sse = stal_indexing.indexing_dirs[idx].keys()
    # Count, for every pair (i <= j) of elements, the structures containing both.
    # Only the lower triangle (including the diagonal) of sse_matrix is filled.
    for i, kk in enumerate(newkeys):
        for j in range(i,len(newkeys)):
            if kk in present_sse and newkeys[j] in present_sse:
                sse_matrix[j,i] += 1

print(loop_lengths)

# Plot the element occupancy for each of the SSE elements as a 2D matrix
plt.imshow(sse_matrix, cmap='gist_yarg')
plt.xticks(ticks= range(len(newkeys)), labels=newkeys, rotation=90)
plt.yticks(ticks= range(len(newkeys)), labels=newkeys)
# Restrict the view to the 21 elements in newkeys.
plt.xlim(-0.5,20.5)
plt.ylim(20.5,-0.5)
cbar = plt.colorbar(shrink=0.5)
plt.savefig("occ_map.svg")
#print(f"\tHas H4 and H5: {len(has4and5)}\n\tHas H5 and H6: {len(has5and6)}\n\tHas H4 and H6: {len(has4and6)}\n\tHas H4,5,6: {len(has_all)}")

In [None]:
def plot_hist(datarow, color, name, length):
    """Plot the kernel-density estimate of element lengths and save it as '<name>_hist.svg'.

    Parameters
    ----------
    datarow : sequence of int
        Observed lengths of one element.
    color : str
        Matplotlib color for the density curve and its fill.
    name : str
        Element label; printed in the figure and used for the file name.
    length : int
        Total number of structures; len(datarow)/length is printed as a percentage.

    Returns
    -------
    None
        If no density can be estimated (fewer than two values or no variance),
        the unique values are printed and no figure is produced.
    """
    try:
        dens = stats.gaussian_kde(datarow)
    except (np.linalg.LinAlgError, ValueError):
        # Only the failures of a degenerate dataset are handled here; anything else
        # (including KeyboardInterrupt) propagates instead of being swallowed.
        print(np.unique(datarow))
        return
    # Computed after the KDE so that an empty datarow takes the guarded path above.
    longest = np.max(datarow)
    fig = plt.figure(figsize=[4,2])
    fig.set_facecolor('w')
    # The invisible histogram is only drawn to obtain integer-spaced x values (bin edges).
    _, x, _ = plt.hist(datarow, bins=np.linspace(0,longest,longest+1), histtype=u'step', density=True, color='white',alpha=0)
    plt.plot(x, dens(x),linewidth=2,color=color,alpha=1)
    plt.fill_between(x,dens(x), color=color,alpha=0.1)
    ax = plt.gca()
    ymax = ax.get_ylim()[1]
    val_string = f'{round(np.average(datarow),2)}±{round(np.std(datarow),2)}'
    # Annotations in the top right corner: label, mean±std, share of structures with this element.
    plt.text(longest, ymax*0.95, name, horizontalalignment='right', fontsize=14, verticalalignment='top')
    plt.text(longest, ymax*0.8, val_string, horizontalalignment='right', fontsize=14, verticalalignment='top')
    plt.text(longest, ymax*0.65, f"{round(len(datarow)/length*100, 1)}%", horizontalalignment='right', fontsize=14, verticalalignment='top')
    plt.xlabel('Element Length [Residues]')
    plt.ylabel('Relative density [AU]')
    plt.savefig(f'{name}_hist.svg')
    plt.show()
    plt.close(fig)

def parse_conservation(datarow, length):
    """Summarise occupancy and residue composition of one labelled position.

    Parameters
    ----------
    datarow : sequence of str
        One-letter codes observed at the position, one per structure that has it.
    length : int
        Total number of structures.

    Returns
    -------
    (float, str)
        The occupancy in percent (len(datarow)/length, rounded to one decimal) and
        a string such as "L:90%, V:10%" listing every residue that makes up at
        least 5 % of datarow, most frequent first.
    """
    total = len(datarow)
    letters, counts = np.unique(np.array(datarow), return_counts=True)

    # Rank (residue, count) pairs by count. Keying a dict by count, as done before,
    # silently dropped all but one of the residues sharing the same count.
    # sorted() is stable, so tied residues stay in the alphabetical order of np.unique.
    ranked = sorted(zip(letters.tolist(), counts.tolist()), key=lambda pair: pair[1], reverse=True)

    occupancy = round(total/length*100, 1)
    conserv_string = []
    for res, count in ranked:
        occ = int(count*100 / total)
        if occ >= 5:
            conserv_string.append(f"{res}:{occ}%")

    return occupancy, ", ".join(conserv_string)
# HISTOGRAMS FOR ALL SSE Lengths.
# Writes one histogram per element (via plot_hist) plus two TSV summaries:
# 'adj.sse_lengths.tsv' (mean ± std of element length) and
# 'adj.sse_stats.tsv' (occupancy and residue composition of each .50 anchor).

newkeys = ['H1','H2','H3','H4','H5','H6','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13','S14','GPS']

# Statistics for SSE extents
with open('adj.sse_lengths.tsv','w') as lenfile:
    lenfile.write("Element\tLength\n")
    for ki in newkeys:
        # Raises KeyError if an element never occurred with a non-zero length.
        datarow = sse_lengths[ki]
        # Labels containing "S" are drawn in orange, all others in blue.
        # NOTE(review): "GPS" also contains "S" and is therefore drawn in orange -- confirm intended.
        if "S" in ki: c = 'xkcd:orange'
        else: c = 'xkcd:denim'
        print(len(datarow))
        plot_hist(datarow, c, ki, stal_indexing.length)
        val_string = f'{round(np.average(datarow),1)} ± {round(np.std(datarow),1)}'
        lenfile.write(f"{ki}\t{val_string}\n")

# Statistics for SSE residue conservation
with open("adj.sse_stats.tsv", "w") as statfile:
    statfile.write("Element\tOccupation\tConsensus\n")
    for sse in newkeys:
        # NOTE(review): neither `center_res` nor `all_base` is defined at module level in any
        # visible cell (an earlier cell only creates an empty `center_residues`); on a fresh
        # kernel this loop raises NameError. Confirm where these objects are meant to come from.
        datarow = center_res[f"{sse}.50"]
        occ, residues = parse_conservation(datarow, all_base.length)
        statfile.write(f"{sse}\t{occ}%\t{residues}\n")

In [None]:
# Create the TSV file with the individual pLDDT values.
# One line per identifier: "<identifier>\t<comma-separated pLDDT values, two decimals>".
import json

# NOTE(review): hard-coded absolute path; only valid on the original machine.
jsons = glob.glob('/home/hildilab/projects/agpcr_nom/*_output/batch*/*rank_1*scores.json')

import re
identifiers = []
with open('all_plddt.tsv', 'w') as c:
    c.write('Identifier\tplddt_values\n')
    for j in jsons:
        # The identifier is the first "/<word characters>-" match in the path, without the
        # leading "/" and trailing "-". Raises IndexError if the path contains no such part.
        identifier = re.findall(r'\/[\w]+-', j)[0][1:-1]
        # Only the first file found for an identifier is used; glob order is not sorted.
        if identifier in identifiers:
            continue
        identifiers.append(identifier)
        with open(j) as jx:
            data = json.load(jx)
        c.write(f"{identifier}\t{','.join(['{:.2f}'.format(k) for k in data['plddt']])}\n")
print(len(identifiers))

In [None]:


def construct_identifiers(indexing_dir, center_dir, plddt_values, max_id_dir, name, gain_start, seq=None):
    """Assign a position label to every residue of every element of one GAIN domain.

    The residue given by center_dir["<sse>.50"] receives the number 50; the other
    residues of the element are numbered consecutively relative to it.

    Parameters
    ----------
    indexing_dir : dict
        Element label -> (start, end) residue indices (inclusive). 'GPS' is skipped.
    center_dir : dict
        "<sse>.50" -> residue index of the anchor of that element.
    plddt_values : sequence
        Per-residue pLDDT values, indexed with the same indices as indexing_dir.
    max_id_dir : dict
        Element label -> list of all position numbers seen so far across structures.
        Modified in place and returned.
    name : str
        Name of the structure, only used in the skip message.
    gain_start : int
        Offset subtracted from residue indices before indexing into seq.
    seq : sequence of str, optional
        Sequence of the domain. If None, no per-element sequences are returned.

    Returns
    -------
    (max_id_dir, id_dir, plddts, sse_seq)
        id_dir: label -> list of position numbers; plddts: label -> list of pLDDT
        values; sse_seq: label -> list of residues, or None when seq is None.
    """
    if seq is not None:
        # Copy only when a sequence was given (list(None) raised a TypeError before).
        # Sometimes, unknown residues are present in the FASTA file; "X" is appended
        # so that an index one past the end of the sequence still resolves.
        seq = list(seq)
        seq.append("X")
    id_dir = {}
    plddts = {}
    sse_seq = {} if seq is not None else None
    for sse, bounds in indexing_dir.items():

        if sse == 'GPS':
            continue

        start = bounds[0]
        end = bounds[1]

        if end-start > 45:
            # Some models have segments which are excessively long and correspond to low pLDDT values as well.
            # That is why manually, a limit of 45 residues was imposed.
            print(f"NOTE: SKIPPING TOO LONG SSE WITH LENGTH {end-start}\n{name}: {sse}")
            continue

        center_res = center_dir[f"{sse}.50"]
        # Position number of the first residue, chosen so that the anchor residue gets 50.
        first_res = 50 - center_res + start
        positions = [first_res+k for k in range(end-start+1)]

        # Register position numbers not yet seen for this element.
        for pos in positions:
            seen = max_id_dir.setdefault(sse, [])
            if pos not in seen:
                seen.append(pos)

        id_dir[sse] = positions
        plddts[sse] = [plddt_values[k] for k in range(start, end+1)]

        if seq is not None:
            sse_seq[sse] = [seq[k] for k in range(start-gain_start, end-gain_start+1)]

    return max_id_dir, id_dir, plddts, sse_seq

def get_plddt_dir(file='all_plddt.tsv'):
    """Read the pLDDT table into a dict {identifier: [pLDDT values as float]}.

    The file has one header line followed by lines of the form
    "<identifier>\\t<comma-separated values>".
    """
    with open(file) as handle:
        rows = handle.readlines()
    plddt_dir = {}
    for raw in rows[1:]:  # the first line is the column header
        identifier, values = raw.strip().split("\t")
        plddt_dir[identifier] = [float(val) for val in values.split(",")]
    return plddt_dir

def make_id_list(id_dir):
    """Flatten {element: [positions]} into label strings such as "S6.50"."""
    return [f"{sse}.{res}" for sse, positions in id_dir.items() for res in positions]

def compact_label_positions(id_collection, plddt_collection, sse_keys, return_unique=True):
    """Stack per-structure values by labelled position.

    id_collection[i] maps an element label to its position numbers and
    plddt_collection[i] maps the same label to one value per position. The result
    is {element: {position: [values from all structures]}} for every element in
    sse_keys, which acts like a label-based multiple alignment. Positions below 10
    or above 90 are ignored. With return_unique=False, a label containing "." is
    merged into the element named before the first ".".
    """
    print("compact_label_positions:", sse_keys)
    stacked = {sse: {} for sse in sse_keys}

    for i, gain_positions in enumerate(id_collection):
        gain_values = plddt_collection[i]
        for sse, positions in gain_positions.items():
            sse_id = sse
            if not return_unique and "." in sse:
                print("Found unique segment", sse)
                sse_id = sse.split(".")[0]
            for j, raw_pos in enumerate(positions):
                pos = int(raw_pos)
                if not 10 <= pos <= 90:
                    continue
                bucket = stacked[sse_id]
                value = gain_values[sse][j]
                bucket.setdefault(pos, []).append(value)

    return stacked

def construct_id_occupancy(intervals, center_dirs, length, plddt_dir, names, seqs):
    """Collect pLDDT values, occupancy counts and residues for every labelled position.

    Parameters
    ----------
    intervals : sequence of dict
        Per structure: element label -> (start, end).
    center_dirs : sequence of dict
        Per structure: "<sse>.50" -> anchor residue index.
    length : int
        Number of structures to process.
    plddt_dir : dict
        Identifier -> list of per-residue pLDDT values.
    names : sequence of str
        Structure names; the part before the first "-" is the key into plddt_dir.
    seqs : sequence
        Per-structure sequences.

    Returns
    -------
    (label_plddts, label_occ, label_seq)
        Each maps an element label to {position: ...}: the list of pLDDT values,
        the number of structures having that position, and the list of residues.

    Notes
    -----
    Reads the module-level `valid_collection` for the start offset of every domain
    (assumes it is ordered like `names` -- TODO confirm).
    """
    newkeys = ['H1','H1.D1','H1.E1', 'H1.F4','H2','H3','H4','H5','H6','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13','S14']
    id_collection = []
    plddt_collection = []
    seq_collection = []
    all_id_dir = {x:[] for x in newkeys}
    for k in range(length):
        identifier = names[k].split("-")[0]
        plddt_values = plddt_dir[identifier]
        gain_start = valid_collection.collection[k].start
        all_id_dir, id_dir, plddts, sse_seq = construct_identifiers(intervals[k], center_dirs[k], plddt_values, all_id_dir, names[k], gain_start, seqs[k])
        id_collection.append(id_dir)
        plddt_collection.append(plddts)
        seq_collection.append(sse_seq)
    print("Completed creating value collection.")
    print(id_collection[0])
    print(plddt_collection[0])
    print(valid_collection.collection[0].name)

    # Count how many structures carry each label (e.g. "S6.50").
    flat_id_list = np.array([label for id_dict in id_collection for label in make_id_list(id_dict)])
    print("Finished constructing flat_id_list.")
    labels, occ = np.unique(flat_id_list, return_counts=True)
    occ_dict = dict(zip(labels,occ))

    # Transform occ_dict to the same format as label_plddts (one dict per sse).
    label_occ = {}
    for sse in newkeys:
        print(sse)
        label_occ[sse] = {}
    # Split each label at its LAST "." instead of slicing fixed widths: slicing assumed
    # two-digit positions, so "S10.9" was attributed to "S1" (and int(".9") raised),
    # while three-digit positions were dropped.
    for label, count in occ_dict.items():
        sse, _, position = str(label).rpartition(".")
        if sse in label_occ:
            label_occ[sse][int(position)] = count

    label_plddts = compact_label_positions(id_collection, plddt_collection, newkeys, return_unique=True)
    label_seq = compact_label_positions(id_collection, seq_collection, newkeys, return_unique=True)

    return label_plddts, label_occ, label_seq

# Load the pLDDT table written earlier ('all_plddt.tsv' by default).
plddt_dir = get_plddt_dir()

# Per-position pLDDT values, occupancy counts and residues for all structures.
# NOTE: this rebinds plddt_values at module level to the {element: {position: [values]}} dict.
plddt_values, occ_values, label_seq = construct_id_occupancy(stal_indexing.intervals, 
                                                             stal_indexing.center_dirs, 
                                                             stal_indexing.length, 
                                                             plddt_dir, 
                                                             stal_indexing.names, 
                                                             seqs)

In [None]:
# Debug output of the occupancy dict and of the first structure's indexing dict.
print(occ_values)
print(stal_indexing.indexing_dirs[0])

# get the occupancy for GPS
# gps_occ counts the structures in which each GPS position is labelled;
# gps_seq collects the residue found at that position in each of them.
gps_occ = {"GPS-2":0,"GPS-1":0,"GPS+1":0}
gps_seq = {"GPS-2":[],"GPS-1":[],"GPS+1":[]}
for idx, pos_dir in enumerate(stal_indexing.indexing_dirs):
    gain_start = valid_collection.collection[idx].start
    current_seq = seqs[idx]
    for k in gps_occ.keys():
        if k in pos_dir.keys() and pos_dir[k] is not None:
            gps_occ[k] += 1
            # pos_dir[k] is shifted by the domain start before indexing the sequence.
            gps_seq[k].append(current_seq[pos_dir[k]-gain_start])

print(gps_occ)
print(gps_seq)

#print(len(seqs))
print(label_seq["S4"])
# Register the GPS residues next to the other elements. Unlike those, the GPS entry is
# keyed by the strings "GPS-2", "GPS-1", "GPS+1" rather than by integer positions.
label_seq["GPS"] = gps_seq

In [None]:
## PLOT THE POSITION OCCUPANCY AND THE AVERAGE PLDDT PER POSITION. with plddt_values, occ_values
# One figure per element: grey bars = mean pLDDT per position, blue line = occupancy,
# red bars = mean pLDDT weighted by occupancy. Saved to '../fig/r4stal/<sse>_stats2.svg'.
newkeys = ['H1','H2','H3','H4','H5','H6','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13','S14','GPS']
#newkeys = ['H1','H1.D1','H1.E1', 'H1.F4','H2','H3','H4','H5','H6','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13','S14','GPS']
for sse in newkeys:
    # plddt_values and occ_values are keyed by the element list inside construct_id_occupancy,
    # which has no 'GPS' entry; indexing them with 'GPS' ended this loop with a KeyError.
    if sse not in plddt_values or sse not in occ_values:
        print(f"Skipping {sse}: no per-position pLDDT/occupancy data available.")
        continue
    # Transform the values first
    pp = plddt_values[sse]
    av_pp = {k:np.average(np.array(v))/100 for k,v in pp.items()}
    # NOTE(review): 14435 is presumably the total number of GAIN domains
    # (stal_indexing.length) -- confirm and replace the literal.
    norm_occ = {k:v/14435 for k,v in occ_values[sse].items()}
    xax = sorted(av_pp.keys())
    y_pp = [av_pp[x] for x in xax]
    y_occ = [norm_occ[x] for x in xax]
    norm_pp = np.array(y_pp)*np.array(y_occ)

    fig, ax = plt.subplots(figsize=[5,2])
    fig.set_facecolor('w')
    ax.xaxis.set_minor_locator(MultipleLocator(1)) #AutoMinorLocator())
    ax.xaxis.set_major_locator(FixedLocator([a for a in range(2,100,3)]))#MultipleLocator(3)))
    ax.tick_params(which='both', width=2)
    ax.tick_params(which='major', length=8)
    ax.tick_params(which='minor', length=6)
    plt.bar(xax,y_pp, color='silver', alpha=0.7)
    plt.plot(xax, y_occ, color='dodgerblue')
    plt.bar(xax, norm_pp, color='xkcd:lightish red', alpha=0.1)
    plt.title(f'Element Composition ({sse})')
    plt.yticks(ticks = [0, 0.2, 0.4, 0.6, 0.8, 1], labels = ['0%', '20%', '40%', '60%', '80%', '100%'])
    #plt.ylabel('')
    # Label the major ticks with the full position label, e.g. "H1.50".
    ax.set_xticklabels([f'{sse}.{str(int(v))}' for v in ax.get_xticks()], rotation=90)
    plt.savefig(f'../fig/r4stal/{sse}_stats2.svg', bbox_inches='tight')
    #plt.show()
    plt.close(fig)

In [None]:
# Get the occupancy of certain positions:
# Prints "<occupancy> <label>" for every enriched position whose element is in newkeys.
enriched_positions = ['H1.50','H1.54','H1.57','H1.61','H2.56','H2.57','H2.60','H2.61','H3.36','H3.43',
'H3.44','H3.51','H3.53','H3.56','H4.38','H4.41','H4.51','H5.37','H5.38','H5.42','H5.44',
'H5.48','H5.50','H5.59','H6.42','H6.54','H6.56','H7.40','H7.51','H8.46','H8.58','H8.60',
'S1.48','S2.47','S2.51','S2.53','S2.58','S3.53','S3.55','S5.56','S6.48','S6.52','S6.55',
'S7.50','S8.45','S8.52','S8.57','S10.53','S11.50','S12.54','S12.55','S13.47','S13.50','S13.52','S14.48','S14.50']
for sse in newkeys:
    sub_positions = [k for k in enriched_positions if f'{sse}.' in k]
    # Elements without enriched positions or without occupancy data (e.g. 'GPS', which
    # is in newkeys but not a key of occ_values) are skipped instead of raising KeyError.
    if not sub_positions or sse not in occ_values:
        continue
    # NOTE(review): 14435 is presumably the total number of GAIN domains
    # (stal_indexing.length) -- confirm and replace the literal.
    norm_occ = {f'{sse}.{k}':v/14435 for k,v in occ_values[sse].items()}
    for k in sub_positions:
        # A position that no structure carries has no entry and is reported as 0.0.
        print(round(norm_occ.get(k, 0.0),2),k)


In [None]:
# GENERATE A FULL DATAFRAME FOR THE LABELED POSITIONS AND THEIR RESPECTIVE AA FREQUENCIES FOR LOGOPLOTS
# sse_aa_freqs[sse] is a (positions x amino acids) matrix; rows follow the sorted keys of
# label_seq[sse], columns follow aastr.
# NOTE(review): for "GPS" the keys are the strings "GPS+1", "GPS-1", "GPS-2", which sort
# in exactly that order -- confirm that this is the intended row order for the logo plot.
sse_aa_freqs = {}
aastr = 'ACDEFGHIKLMNPQRSTVWYX'
cols = {aa:i for i,aa in enumerate(aastr)}
for sse in newkeys:
    sse_dict = label_seq[sse]
    aafreqs = np.zeros(shape=(len(sse_dict.keys()), len(aastr)))
    for p_index, pos in enumerate(sorted(sse_dict.keys())):
        aas, freq = np.unique(np.array(sse_dict[pos]), return_counts=True)
        for i, aa in enumerate(aas):
            # NOTE(review): 14435 is presumably the total number of GAIN domains -- confirm.
            aafreqs[p_index, cols[aa]] = freq[i]/14435
    sse_aa_freqs[sse] = aafreqs
# The stray expression `root_path+"human_31/trunc_pdbs/"` was removed: its result was
# discarded and `root_path` is not defined in any visible cell (NameError).
gps_aa_freqs = {}

In [None]:
# Print the sequence composition for each cancer-enriched-position
# Output per position: "<label> <AA>:<percent>%,..." for the (up to) three most frequent
# residues, with frequencies normalised by the summed frequency of that position.

for sse in newkeys:
    sub_positions = [k for k in enriched_positions if f'{sse}.' in k]
    if not sub_positions:
        # Nothing to report; this also avoids looking up elements such as 'GPS' that
        # construct_id_occupancy does not add to plddt_values.
        continue
    lframe = pd.DataFrame(data=sse_aa_freqs[sse], columns=[c for c in aastr], index = sorted(plddt_values[sse].keys()))
    for pos in sub_positions:
        idx = int(pos.rsplit(".", 1)[1]) # get the row label (position number) in the SSE
        res_data = lframe.loc[idx]
        total_freqs = res_data.sum()

        # normalize frequency with the total sum
        # Rank (residue, fraction) pairs. Keying a dict by the fraction, as done before,
        # dropped residues sharing the same fraction. Residues that never occur are left out.
        ranked = sorted(
            ((aa, round(freq/total_freqs, 5)) for aa, freq in res_data.items() if freq > 0),
            key=lambda pair: pair[1],
            reverse=True,
        )
        xstring = ''
        for aa, fraction in ranked[:3]:
            xstring = xstring +f'{aa}:{round(fraction*100)}%,'
        print(pos, xstring)

In [None]:
# LOGOPLOTS FOR THE ELEMENTS
# Draws a sequence logo from the amino-acid frequency matrix of each listed element and
# saves it to '../fig/r4stal/conslogo_<sse>.svg'. Currently only "GPS" is processed.

print()
print(sse_aa_freqs["GPS"])
# NOTE(review): this adds a "GPS" entry to the module-level plddt_values, which changes
# what other cells see when they are re-run afterwards (hidden state).
plddt_values["GPS"] = {"GPS-2":100,"GPS-1":100,"GPS+1":100}
for sse in ["GPS"]: #newkeys:
    if sse == "GPS":
        # The three GPS rows are indexed 0..2.
        # NOTE(review): the rows follow the sorted key order of label_seq["GPS"] used when
        # the matrix was built -- confirm that this is the intended order.
        lframe = pd.DataFrame(data=sse_aa_freqs[sse], columns=[c for c in aastr], index = [0,1,2])
    else:
        lframe = pd.DataFrame(data=sse_aa_freqs[sse], columns=[c for c in aastr], index = sorted(plddt_values[sse].keys()))

    # Note down the first and last row where the occupation threshold is met.
    # NOTE(review): lastval is only assigned inside the loop; if no row exceeds 0.05 it is
    # undefined (or stale from a previous run) when used below.
    firstval = None
    for i, r in lframe.iterrows():
        if np.sum(r) > 0.05: 
            if firstval is None:
                firstval = i
            lastval = i
    print(firstval, lastval)
    subframe = lframe.truncate(before=firstval, after=lastval)
    #x_offset = sorted(plddt_values[sse].keys())[0]

    fig, ax = plt.subplots(figsize=[5,2])
    cons_logo = logomaker.Logo(subframe,
                                ax=ax,
                                color_scheme='chemistry',
                                show_spines=False,
                                font_name='DejaVu Sans Mono')

    fig.set_facecolor('w')
    ax.xaxis.set_minor_locator(MultipleLocator(1)) #AutoMinorLocator())
    ax.xaxis.set_major_locator(FixedLocator([a for a in range(2,100,3)]))#MultipleLocator(3))
    ax.tick_params(which='both', width=2)
    ax.tick_params(which='major', length=8)
    ax.tick_params(which='minor', length=6)
    # Label the major ticks with "<sse>.<tick value>".
    ax.set_xticklabels([f'{sse}.{str(int(v))}' for v in ax.get_xticks()], rotation=90)
    cons_logo.draw()
    fig.tight_layout()
    fig.set_facecolor('w')
    plt.savefig(f"../fig/r4stal/conslogo_{sse}.svg", bbox_inches='tight')
    plt.show()
    plt.close(fig)

In [None]:
def get_loop_stats(indexing_dir, sequence):
    """Return location and sequence of every loop between consecutive elements.

    indexing_dir maps an element label to (start, end); labels containing "GPS" are
    ignored. Elements are ordered by start index and every neighbouring pair yields a
    loop named "<N-terminal label>-<C-terminal label>".

    Returns a tuple (loop_loc, loop_dir):
      loop_loc[name] = (end of the preceding element, start of the following element - 1)
      loop_dir[name] = sequence[end of preceding + 1 : start of following]
    NOTE(review): the first entry of the location is the last residue of the preceding
    element, whereas the sequence slice begins one residue later -- confirm intended.
    """
    # Index every non-GPS element by its start residue: {start: (end, label)}
    by_start = {bounds[0]: (bounds[1], label) for label, bounds in indexing_dir.items() if "GPS" not in label}
    starts = sorted(by_start)
    loop_loc = {}
    loop_dir = {}
    # Walk over neighbouring elements in N- to C-terminal order.
    for prev_start, next_start in zip(starts, starts[1:]):
        prev_end, prev_label = by_start[prev_start]
        loop_name = f"{prev_label}-{by_start[next_start][1]}"
        loop_loc[loop_name] = (prev_end, next_start-1)
        loop_dir[loop_name] = sequence[prev_end+1:next_start]
    return loop_loc, loop_dir

# Collect, per loop name, one record {'name': ..., 'sequence': ...} for every structure
# that has this loop. The records are written to FASTA files in a later cell.
# NOTE: loop_lengths is reset here; loop_seqs and loop_seq are not used in the visible cells.
loop_lengths = {}
loop_seqs = {}
loop_seq = {}

loop_info = {}
#[loop_info[loop] = {} for loop in loop_seqs.keys()] # into each of these keys, any entry is composed of "name":$name, "sequence":$seq

# NOTE(review): `all_base` is not defined in any visible cell; on a fresh kernel this loop
# raises NameError. Confirm which indexing object is meant.
for idx in range(all_base.length):
    curr_name = all_base.names[idx]
    start = valid_collection.collection[idx].start
    i_loc, i_dir = get_loop_stats(all_base.indexing_dirs[idx], valid_collection.collection[idx].sequence)
    for k, seq in i_dir.items():
        if k not in loop_info.keys():
            loop_info[k] = []
        # Record name: "<structure name>_<loop start>-<loop end>", both shifted by the domain start.
        loop_info[k].append({'name':f'{all_base.names[idx]}_{i_loc[k][0]+start}-{i_loc[k][1]+start}', 'sequence':''.join(seq)})
    #print(i_len)
    #loop_lengths = match_dirs(i_len, loop_lengths)
    #loop_seqs = match_dirs(i_dir, loop_seqs)



In [None]:
# Write the collected loop sequences to a FASTA file for later alignment.
def loop2fasta(outfile, itemlist):
    """Write one FASTA record per item of itemlist to outfile.

    Every item is a dict with the keys 'name' (header without the leading ">") and
    'sequence'. An existing file is overwritten.
    """
    records = (f">{entry['name']}\n{entry['sequence']}\n" for entry in itemlist)
    with open(outfile, 'w') as handle:
        handle.writelines(records)
    print("Done with", outfile)

# Write one FASTA file per loop name; the directory '../loops/' must already exist.
for loop in loop_info.keys():
    loop2fasta(f"../loops/{loop}.fa", loop_info[loop])


In [None]:
# Export start, subdomain boundary and end of every GAIN domain to 'domain_extents.json'.
# Keys are the part of the domain name before the first "-"; if two domains share that
# prefix, the later one overwrites the earlier one.
extents = {}
for gain in valid_collection.collection:
    extents[gain.name.split("-")[0]] = [str(gain.start+1), str(gain.subdomain_boundary+1), str(gain.end+1)] # make it compatible with ONE-indexed PDBs.

import json
with open('domain_extents.json', 'w') as j:
    dump = json.dumps(extents)
    j.write(dump)
