In [None]:
# ANALYSIS FOR BRANCH PKD - polycystins without alignment
# DEPENDENCIES
import glob
from shutil import copyfile
import pickle
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import (MultipleLocator, FixedLocator)
#import logomaker
# LOCAL IMPORTS
from indexing_classes import GPCRDBIndexing
from gain_classes import GainDomain, GainCollection, GainDomainNoAln
import sse_func


In [None]:
def filter_by_receptor(sequences, selection):
    """Keep the records whose first element contains ``selection``.

    Parameters
    ----------
    sequences : iterable
        Indexable records; element 0 of each record is searched.
    selection : str
        Substring that has to occur in element 0.

    Returns
    -------
    list
        The matching records, in their original order.
    """
    return [record for record in sequences if selection in record[0]]

def filter_by_list(sequences, selection): # selection list
    """Keep the records whose first element contains any item of ``selection``.

    Each record is returned at most once, even if several items of
    ``selection`` occur in its name (the previous implementation appended the
    record once per matching item).

    Parameters
    ----------
    sequences : iterable
        Indexable records; element 0 of each record is searched.
    selection : list of str
        Substrings to look for.

    Returns
    -------
    list
        The matching records, in their original order.
    """
    return [
        record for record in sequences
        if any(item in record[0] for item in selection)
    ]

In [None]:
# Read the polycystin GAIN sequences from FASTA and collect every file in the
# STRIDE output folder. The entries of valid_seqs are indexed as
# (name, sequence) further down.
valid_seqs = sse_func.read_multi_seq("../data/pkd/all_pkd_gain.fa")
stride_files = glob.glob("../data/pkd/pkds_stride/*")
# This only contains the sigma files for truncated (?) PDBs.
# NOTE(review): the "(?)" above is an open question left by the author - confirm.
print(len(stride_files))

In [None]:
# Re-offset the sequences to match the exact PDB indexing.
# Every GAIN sequence is located inside its full-length parent sequence and
# replaced by a slice of the parent: the slice starts one residue before the
# reported start (or at the first residue when the reported start is 0) and
# ends at start + len(GAIN sequence) - 1 (exclusive).
f_seqs = sse_func.read_alignment("../all_pkds.fa")
full_seqs = {header.split("-")[0]: residues for header, residues in f_seqs.items()}
print(full_seqs.keys())
valid_adj_seqs = []
for tup in valid_seqs:
    name = tup[0].split("-")[0]
    parent = full_seqs[name]
    x = sse_func.find_the_start(longseq=parent, shortseq=tup[1])
    window_end = x + len(tup[1]) - 1
    if x != 0:
        valid_adj_seqs.append((name, parent[x-1:window_end]))
        continue
    # The GAIN sequence already begins at the first residue of the parent.
    print("already 1st res.\n", name, tup[1][:10], tup[1][-10:])
    shifted = parent[:window_end]
    valid_adj_seqs.append((name, shifted))
    print(shifted)


In [None]:
# Build the polycystin GAIN collection from the STRIDE files and the re-offset
# sequences. No alignment is passed in (no_alignment=True).
# NOTE(review): coil_weight is flagged "TESTING" by the author - confirm the
# final value before relying on the results.
valid_collection = GainCollection(  
                                    stride_files = stride_files,
                                    sequence_files=None,
                                    sequences=valid_adj_seqs,#valid_seqs,
                                    is_truncated = True,
                                    coil_weight=0.00, # TESTING
                                    #domain_threshold=20, # TESTING
                                    stride_outlier_mode=True,
                                    no_alignment=True,
                                    debug=False)
#valid_collection = pd.read_pickle("../valid_collection.pkl")

In [None]:
# Persist the collection. The context manager makes sure the file handle is
# flushed and closed (the bare open() call used before never closed it).
with open('../pkd_collection.pkl', 'wb') as pkl_out:
    pickle.dump(valid_collection, pkl_out)

In [None]:
def truncate_pdb(start, end, oldpdb, newpdb):
    """Copy a PDB file, keeping only ATOM records inside a residue window.

    Lines that start with ``ATOM`` are kept when the integer parsed from
    columns 23-26 (``line[22:26]``) lies within ``start``..``end`` (both
    inclusive). Every other line is copied unchanged.

    Parameters
    ----------
    start, end : int
        First and last residue number to keep.
    oldpdb : str
        Path of the PDB file to read.
    newpdb : str
        Path of the truncated PDB file to write.
    """
    def _keep(record):
        # Only ATOM records are subject to the residue window.
        if not record.startswith('ATOM'):
            return True
        return start <= int(record[22:26]) <= end

    with open(oldpdb) as source:
        kept = [record for record in source if _keep(record)]
    with open(newpdb, 'w') as target:
        target.write(''.join(kept))

# Truncate the best-ranked model of every GAIN domain to the domain boundaries
# (gain.start .. gain.end) and write it to ../trunc_pkd_pdbs/.
pdbs = glob.glob("../pkd_pdbs/*_rank_1_*.pdb")
print(f"Found {len(pdbs)} best ranked models in target directories.")
valid_ct = 0

for gain in valid_collection.collection:
    valid_ct +=1
    name = gain.name
    # The first PDB path containing the name prefix is used; the [0] raises
    # IndexError when no model matches.
    # NOTE(review): the lookup splits the name on "_" while the output path
    # below splits on "-" - confirm which separator is intended.
    tar_pdb = [p for p in pdbs if name.split("_")[0] in p][0]
    new_pdb_path = f'../trunc_pkd_pdbs/{gain.name.split("-")[0]}.pdb'
    truncate_pdb(gain.start,gain.end, tar_pdb, new_pdb_path)
print("Copied and truncated", valid_ct, "GAIN domains.")
    

In [None]:

# Plot the secondary-structure histogram of the polycystin collection; the
# title reports the number of re-offset input sequences.
valid_collection.plot_sse_hist(title=f"Polycystins (Total: {len(valid_adj_seqs)})",
                               n_max=26,
                               #savename="../fig/hists/%s.adj"%(out_names[i]))
                               savename="../pkd_data/pkd_sse")

In [None]:
# Disabled code: everything below is a bare string literal, so none of it runs.
# NOTE(review): it refers to names (alignment_file, aln_cutoff, quality,
# gps_minus_one, alignment_dict, group) that no visible cell defines -
# consider deleting this cell.
"""    parse_string = "_"
    print(parse_string)
    filtered_sequences = filter_by_receptor(valid_seqs, parse_string)
    #if len(filtered_sequences) == 0:
    print(f"Parsed with {parse_string = }: Result : {len(filtered_sequences)} Sequences")
    parsed_collection = GainCollection( alignment_file = alignment_file,
                                        aln_cutoff = aln_cutoff,
                                        quality = quality,
                                        gps_index = gps_minus_one,
                                        stride_files = stride_files,
                                        sequence_files=None,
                                        sequences=filtered_sequences,
                                        alignment_dict = alignment_dict,
                                        is_truncated = True
                                         )
    parsed_collection.plot_sse_hist(title=f"Receptor group: {parse_string} (Total: {len(filtered_sequences)})",
                                   n_max=16,
                                   savename="hists/%s"%(str(group)))"""

In [None]:
# Function for Parsing out specific Files from the overall dataset based on selection
def grab_selection(parse_string, stride_path, pdb_list, sequences, profile_path, target_dir, seqs=None):
    """Copy all files that belong to a selection of sequences into ``target_dir``.

    For every selected sequence the accession (name up to the first "-") is
    matched as a substring against the profile images, STRIDE files and PDB
    paths. Matches are copied to ``target_dir + "profiles/"``, ``"strides/"``
    and ``"pdbs/"``, and each sequence is written to
    ``target_dir + "seqs/" + <name> + ".fa"``.

    Parameters
    ----------
    parse_string : str
        Case-insensitive substring used to select entries of ``sequences``;
        also reported in the summary line.
    stride_path : str
        Folder prefix searched for ``*.stride`` files.
    pdb_list : list of str
        Candidate PDB paths.
    sequences : list
        (name, sequence) records; only used when ``seqs`` is None.
    profile_path : str
        Folder prefix searched for ``*.png`` profiles.
    target_dir : str
        Destination prefix (string concatenation is used, so it must end
        with a path separator).
    seqs : list, optional
        Pre-selected (name, sequence) records; bypasses ``parse_string``.
    """
    if seqs is not None:
        sub_seqs = seqs
    else:
        sub_seqs = [entry for entry in sequences if parse_string.lower() in entry[0].lower()]
    print(f"Found {len(sub_seqs)} sequences.")
    strides = glob.glob(stride_path+"*.stride")
    profiles = glob.glob(profile_path+"*.png")

    sub_strides, sub_profiles, sub_pdbs = [], [], []
    for entry in sub_seqs:
        ac = entry[0].split("-")[0]
        sub_profiles.extend(path for path in profiles if ac in path)
        sub_strides.extend(path for path in strides if ac in path)
        sub_pdbs.extend(path for path in pdb_list if ac in path)

    # Copy each file group into its own sub-folder, keeping the file name.
    copy_plan = (("profiles/", sub_profiles), ("strides/", sub_strides), ("pdbs/", sub_pdbs))
    for subfolder, paths in copy_plan:
        for path in paths:
            copyfile(path, target_dir+subfolder+path.split("/")[-1])

    for entry in sub_seqs:
        sse_func.write2fasta(entry[1]+"\n", entry[0], target_dir+"seqs/"+entry[0]+".fa")

    print(f"Copied {len(sub_pdbs)} PDB files, {len(sub_strides)} STRIDE files,",
          f" {len(sub_profiles)} Profiles and {len(sub_seqs)} Sequences",
          f"for Selection {parse_string}")
    
# Path configuration for grab_selection().
# NOTE(review): root_path is an absolute, machine-specific path; the cell only
# works on a host that has this directory layout.
root_path = "/home/hildilab/projects/agpcr_nom/"
profile_path = root_path+"all_gps_profiles/"
# Best-ranked ("rank_1") models from every batch folder below all_gps*.
pdb_list = glob.glob(f"{root_path}all_gps*/batch*/*rank_1_*.pdb")
print(len(pdb_list))
#valid_seqs
target_dir = root_path+"human/"

In [None]:
# Disabled call: the block below is a bare string literal and is not executed.
"""grab_selection(parse_string='HUMAN',
              stride_path = root_path+"all_gps_stride/",
              pdb_list = pdb_list,
              sequences = valid_seqs,
              profile_path = profile_path,
              target_dir = target_dir)"""

In [None]:
# Accession identifiers used to select the human receptors from valid_seqs.
# NOTE(review): the list holds 32 entries although the derived names say "31".
human_seqs = ["Q9HBW9","O60241","Q6QNK2","Q9UHX3","Q5T601","Q96PE1","O60242","Q86SQ4",
                "O94910","Q8IWK6","Q8IZP9","Q8WXG9","Q86Y34","O95490","Q14246","Q9BY15",
                "Q8IZF2","Q86SQ3","Q8IZF6","Q96K78","Q8IZF3","Q8IZF7","Q8IZF5","Q7Z7M1",
                "Q8IZF4","Q9HCU4","Q9NYQ6","Q9NYQ7","Q9HAR2","O14514","P48960",
                "Q9Y653"]
list_31 = filter_by_list(valid_seqs, human_seqs)
sigma_2_strides = glob.glob("/home/hildilab/projects/agpcr_nom/sigma_2*/*.stride")


# NOTE(review): alignment_file, aln_cutoff, quality, gps_minus_one and
# alignment_dict are not defined in any visible cell; this cell relies on
# state from elsewhere - confirm where they come from.
human_collection = GainCollection( alignment_file = alignment_file,
                                        aln_cutoff = aln_cutoff,
                                        quality = quality,
                                        gps_index = gps_minus_one,
                                        stride_files =  sigma_2_strides,
                                        sequence_files=None,
                                        sequences=list_31,
                                        alignment_dict = alignment_dict,
                                        is_truncated = True,
                                        coil_weight = 0.08, # TESTING
                                        stride_outlier_mode = True,
                                        debug=False
                                         )

# Persist the collection; the context manager closes the file handle, which
# the bare open() call used before never did.
with open('../human_collection.pkl', 'wb') as pkl_out:
    pickle.dump(human_collection, pkl_out)
#print(len(human_collection.collection))
# Disabled code: the loop below is a bare string literal and is not executed.
"""for gain in human_collection.collection:
    #print(gain.name, gain.start, gain.end, gain.sequence, gain.index, gain.subdomain_boundary)
    pdb_out = root_path+"human/trunc_pdbs/"+gain.name+"_gain.pdb"
    ac = gain.name.split("-")[0]
    found_pdb = [pdb for pdb in pdb_list if ac in pdb]
    target_pdb = found_pdb[0]
    gain.write_gain_pdb(target_pdb, pdb_out)"""


In [None]:
# Disabled plotting code: the block below is a bare string literal and is not
# executed. NOTE(review): it refers to `enum`, which no visible cell defines.
"""plt.plot(human_collection.anchor_hist)
human_collection.plot_sse_hist(title=f"Receptor group: HUMAN_31 (Total: 31)",
                                   n_max=16,
                                   savename=f"hists/human_31.s2_newAnch.{enum}")"""
def find_pdb(name, pdb_folder):
    """Return the path of the PDB file that belongs to ``name``.

    The identifier is the part of ``name`` before the first "-". All
    ``*.pdb`` files in ``pdb_folder`` whose file name contains it are
    candidates; they are sorted so that the choice is reproducible (glob
    itself returns matches in arbitrary order).

    Parameters
    ----------
    name : str
        Sequence/domain name, e.g. "<accession>-<rest>".
    pdb_folder : str
        Folder that is searched (non-recursively).

    Returns
    -------
    str
        Path of the first candidate in sorted order.

    Raises
    ------
    IndexError
        If no file matches. The exception type is unchanged from the
        previous ``[0]`` lookup, but the message now names the identifier.
    """
    identifier = name.split("-")[0]
    matches = sorted(glob.glob(f"{pdb_folder}/*{identifier}*.pdb"))
    if not matches:
        raise IndexError(f"No PDB file matching '*{identifier}*.pdb' found in {pdb_folder}")
    return matches[0]
# Resolve the PDB file of every human GAIN domain and print them as one
# ready-to-paste "pymol <file> <file> ..." command line.
l = [find_pdb(gain.name, "../all_pdbs/") for gain in human_collection.collection]
print("pymol", " ".join(l))


In [None]:
# Per-alignment-column occupancy plus a (domain x alignment column) matrix of
# secondary structure: -1 marks residues of gain.sda_helices, +1 marks
# residues of gain.sdb_sheets, 0 is everything else.
# NOTE(review): aln_cutoff is not defined in any visible cell - confirm where
# it comes from.
occupancy = np.zeros([aln_cutoff],dtype=int)
sse_matrix = np.zeros([len(valid_collection.collection), aln_cutoff])
print(f"{occupancy.shape = }\n{sse_matrix.shape = }")
for i, gain in enumerate(valid_collection.collection):
    occupancy[gain.alignment_indices] += 1
    # Helices are written first and strands second, exactly as before, so a
    # strand wins if both ever map to the same column.
    for marker, elements in ((-1, gain.sda_helices), (1, gain.sdb_sheets)):
        for element in elements:
            # Residue 1 means that the index is zero.
            for res_id in range(element[0]-1, element[1]):
                sse_matrix[i, gain.alignment_indices[res_id]] = marker

anchors , anchor_occupation = valid_collection.find_anchors(cutoff=3000)
anchor_dict = sse_func.make_anchor_dict(anchors, valid_collection.alignment_subdomain_boundary)
#dir(human_collection.collection[0])
#human_collection.collection[0].create_indexing(anchors, anchor_occupation, anchor_dict)
print(anchor_dict)


With the `all_base` indexing object and the `valid_collection` collection, we can query the data for statistics on the anchors and on the composition of the individual secondary-structure elements (SSEs).

In [None]:
# Create the TSV file with the individual pLDDT values.
import re, json

jsons = glob.glob('/home/hildilab/projects/agpcr_nom/*_output/batch*/*rank_1*scores.json')

# `identifiers` keeps the first-seen order; the set gives O(1) duplicate
# checks instead of scanning the growing list for every file.
identifiers = []
seen_identifiers = set()
with open('all_plddt.tsv', 'w') as c:
    c.write('Identifier\tplddt_values\n')
    for j in jsons:
        # First "/<word characters>-" fragment of the path, without the
        # leading "/" and the trailing "-".
        identifier = re.findall(r'\/[\w]+-', j)[0][1:-1]
        if identifier in seen_identifiers:
            continue
        seen_identifiers.add(identifier)
        identifiers.append(identifier)
        with open(j) as jx:
            data = json.load(jx)
        c.write(f"{identifier}\t{','.join(['{:.2f}'.format(k) for k in data['plddt']])}\n")
#print(len(identifiers))

In [None]:
# Collect the pLDDT values per labelled position (identifier) and count how
# often each position occurs.
import json

# Score files of the best-ranked ("rank_1") model of every prediction batch.
jsons = glob.glob('/home/hildilab/projects/agpcr_nom/*_output/batch*/*rank_1*scores.json')

# Construct a dictionary in which, for each identifier (e.g. H6.49), the total occupancy is stored for plotting.

def construct_identifiers(indexing_dir, center_dir, plddt_values, max_id_dir, name, seq=None):
    """Label every residue of each SSE relative to its ".50" center residue.

    For an element spanning ``start``..``end`` (inclusive) the first residue
    gets the label ``50 - center + start`` and the following residues count
    upwards from there. The 'GPS' entry is ignored and elements with
    ``end - start > 45`` are skipped with a printed note.

    Parameters
    ----------
    indexing_dir : dict
        Maps an SSE name to an indexable whose items 0 and 1 are start and end.
    center_dir : dict
        Maps ``"<sse>.50"`` to the index of the center residue.
    plddt_values : sequence
        Per-residue values, indexed with the same residue indices.
    max_id_dir : dict
        Accumulator of all labels seen so far per SSE; updated in place.
    name : str
        Only used in the "too long" note.
    seq : sequence, optional
        Per-residue sequence, indexed like ``plddt_values``.

    Returns
    -------
    tuple
        ``(max_id_dir, id_dir, plddts, sse_seq)``; ``sse_seq`` is None when
        ``seq`` is None.
    """
    id_dir = {}
    plddts = {}
    sse_seq = {} if seq is not None else None
    for sse, bounds in indexing_dir.items():
        if sse == 'GPS':
            continue
        start, end = bounds[0], bounds[1]
        span = end - start
        if span > 45:
            print(f"NOTE: SKIPPDING TOO LONG SSE WITH LENGTH {span}\n{name}: {sse}")
            continue
        first_res = 50 - center_dir[f"{sse}.50"] + start
        labels = [first_res + offset for offset in range(span + 1)]
        if labels:
            # Register labels that have not been seen for this SSE yet.
            known = max_id_dir.setdefault(sse, [])
            for label in labels:
                if label not in known:
                    known.append(label)
        id_dir[sse] = labels
        residue_range = range(start, end + 1)
        plddts[sse] = [plddt_values[i] for i in residue_range]
        if seq is not None:
            sse_seq[sse] = [seq[i] for i in residue_range]
    return max_id_dir, id_dir, plddts, sse_seq

def get_plddt_dir(file='all_plddt.tsv'):
    """Read the pLDDT table into a dictionary.

    The first line is treated as a header and skipped. Every other line must
    consist of an identifier and a comma-separated list of numbers, separated
    by exactly one tab.

    Parameters
    ----------
    file : str
        Path of the TSV file.

    Returns
    -------
    dict
        Maps each identifier to its list of float values.
    """
    with open(file) as handle:
        rows = list(handle)[1:]
    plddt_dir = {}
    for row in rows:
        identifier, values = row.strip().split("\t")
        plddt_dir[identifier] = list(map(float, values.split(",")))
    return plddt_dir

def make_id_list(id_dir):
    """Flatten ``{sse: [positions]}`` into a list of "<sse>.<position>" labels.

    The labels follow the iteration order of the dictionary and of each
    position list.
    """
    return [f"{sse}.{res}" for sse, positions in id_dir.items() for res in positions]

def compact_label_positions(id_collection, plddt_collection, sse_keys):
    """Group per-domain values by SSE and labelled position.

    Parameters
    ----------
    id_collection : list of dict
        One ``{sse: [position labels]}`` dictionary per domain.
    plddt_collection : list of dict
        One ``{sse: [values]}`` dictionary per domain, parallel to
        ``id_collection`` (same SSE keys, same list lengths).
    sse_keys : iterable of str
        SSE names for which a result entry is created.

    Returns
    -------
    dict
        ``{sse: {position (int): [values of all domains]}}``.
    """
    label_plddts = {sse: {} for sse in sse_keys}

    for domain_index, gain_positions in enumerate(id_collection):
        domain_values = plddt_collection[domain_index]
        for sse, positions in gain_positions.items():
            for slot, raw_pos in enumerate(positions):
                # Positions are stored under their integer value.
                label_plddts[sse].setdefault(int(raw_pos), []).append(domain_values[sse][slot])

    return label_plddts

def construct_id_occupancy(indexing_dirs, center_dirs, length, plddt_dir, names, seqs):
    """Collect per-position pLDDT values, occupancy counts and residues.

    Parameters
    ----------
    indexing_dirs, center_dirs : sequences of dict
        Per-domain indexing and center dictionaries, passed on to
        ``construct_identifiers``.
    length : int
        Number of domains to process.
    plddt_dir : dict
        Maps the accession (name up to the first "-") to its pLDDT list.
    names : sequence of str
        Domain names, parallel to ``indexing_dirs``.
    seqs : sequence
        Per-domain sequences, parallel to ``indexing_dirs``.

    Returns
    -------
    tuple
        ``(label_plddts, label_occ, label_seq)``; each maps an SSE name to a
        dictionary keyed by the integer position label. ``label_occ`` holds
        the number of domains that contain the position.
    """
    newkeys = ['H1','H2','H3','H4','H5','H6','H7','H8','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13']
    id_collection = []
    plddt_collection = []
    seq_collection = []
    all_id_dir = {x:[] for x in newkeys}
    for k in range(length):
        identifier = names[k].split("-")[0]
        plddt_values = plddt_dir[identifier]
        all_id_dir, id_dir, plddts, sse_seq = construct_identifiers(indexing_dirs[k], center_dirs[k], plddt_values, all_id_dir, names[k], seqs[k])
        id_collection.append(id_dir)
        plddt_collection.append(plddts)
        seq_collection.append(sse_seq)
    print("Completed creating value collection.")
    # Guarded so that an empty input no longer raises IndexError here.
    if id_collection:
        print(id_collection[0])
        print(plddt_collection[0])

    # Count how often every "<sse>.<position>" label occurs over all domains.
    flat_id_list = np.array([label for id_dict in id_collection for label in make_id_list(id_dict)])
    print("Finished constructing flat_id_list.")
    labels, occ = np.unique(flat_id_list, return_counts=True)

    # Transform the counts to the same format as label_plddts (one dict per
    # SSE). The label is split at its last "." and the SSE name is compared
    # exactly: a substring test would also file "S10.xx" .. "S13.xx" under
    # "S1", and slicing the last two characters would break for positions
    # that are not two digits long.
    label_occ = {sse: {} for sse in newkeys}
    for label, count in zip(labels, occ):
        sse, _, position = str(label).rpartition(".")
        if sse in label_occ:
            label_occ[sse][int(position)] = count

    label_plddts = compact_label_positions(id_collection, plddt_collection, newkeys)
    label_seq = compact_label_positions(id_collection, seq_collection, newkeys)
    return label_plddts, label_occ, label_seq

# Load the pLDDT table written earlier and build the per-position statistics.
# NOTE(review): `all_base` and `seqs` are not defined in any visible cell -
# confirm where they come from before running this on a fresh kernel.
plddt_dir = get_plddt_dir()
#print(list(plddt_dir.keys())[:10])
plddt_values, occ_values, label_seq = construct_id_occupancy(all_base.indexing_dirs, all_base.center_dirs, all_base.length, plddt_dir, all_base.names, seqs)

In [None]:
## PLOT THE POSITION OCCUPANCY AND THE AVERAGE PLDDT PER POSITION. with plddt_values, occ_values
# One figure per SSE: grey bars = mean pLDDT per position (scaled to 0..1),
# blue line = occupancy, red bars = mean pLDDT multiplied by occupancy.
newkeys = ['H1','H2','H3','H4','H5','H6','H7','H8','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13']
for sse in newkeys:
    # Transform the values first
    pp = plddt_values[sse]
    #print(occ_values[sse])
    av_pp = {k:np.average(np.array(v))/100 for k,v in pp.items()}
    #print(av_pp)
    # NOTE(review): 14435 is a hard-coded divisor, presumably the total number
    # of domains - confirm; it is unrelated to the size of valid_collection.
    norm_occ = {k:v/14435 for k,v in occ_values[sse].items()}
    xax = sorted(av_pp.keys())
    y_pp = [av_pp[x] for x in xax]
    y_occ = [norm_occ[x] for x in xax]
    norm_pp = np.array(y_pp)*np.array(y_occ)

    fig, ax = plt.subplots(figsize=[5,2])
    fig.set_facecolor('w')
    ax.xaxis.set_minor_locator(MultipleLocator(1)) #AutoMinorLocator())
    # Major ticks at 2, 5, 8, ... so that position 50 carries a major tick.
    ax.xaxis.set_major_locator(FixedLocator([a for a in range(2,100,3)]))#MultipleLocator(3)))
    ax.tick_params(which='both', width=2)
    ax.tick_params(which='major', length=8)
    ax.tick_params(which='minor', length=6)
    plt.bar(xax,y_pp, color='silver', alpha=0.7)
    plt.plot(xax, y_occ, color='dodgerblue')
    plt.bar(xax, norm_pp, color='xkcd:lightish red', alpha=0.1)
    plt.title(f'Element Composition ({sse})')
    plt.yticks(ticks = [0, 0.2, 0.4, 0.6, 0.8, 1], labels = ['0%', '20%', '40%', '60%', '80%', '100%'])
    #plt.ylabel('')
    # Relabel the ticks as "<sse>.<position>".
    ax.set_xticklabels([f'{sse}.{str(int(v))}' for v in ax.get_xticks()], rotation=90)
    #plt.savefig(f'../fig/{sse}_stats.svg', bbox_inches='tight')
    plt.show()
    plt.close(fig)

In [None]:
# Get the occupancy of certain positions:
enriched_positions = ['H1.50','H1.54','H1.57','H1.61','H2.56','H2.57','H2.60','H2.61','H3.36','H3.43',
'H3.44','H3.51','H3.53','H3.56','H4.38','H4.41','H4.51','H5.37','H5.38','H5.42','H5.44',
'H5.48','H5.50','H5.59','H6.42','H6.54','H6.56','H7.40','H7.51','H8.46','H8.58','H8.60',
'S1.48','S2.47','S2.51','S2.53','S2.58','S3.53','S3.55','S4.56','S5.48','S5.52','S5.55',
'S6.50','S7.45','S7.52','S7.57','S9.53','S10.50','S11.54','S11.55','S12.47','S12.50','S12.52','S13.48','S13.50']
for sse in newkeys:
    # Positions of the list above that belong to this SSE.
    sub_positions = [k for k in enriched_positions if f'{sse}.' in k]
    # Transform the values first
    #pp = plddt_values[sse]
    #print(occ_values[sse])
    #av_pp = {k:np.average(np.array(v))/100 for k,v in pp.items()}
    #print(av_pp)
    # NOTE(review): 14435 is a hard-coded divisor, presumably the total number
    # of domains - confirm.
    norm_occ = {f'{sse}.{k}':v/14435 for k,v in occ_values[sse].items()}
    #print(norm_occ)
    for k in sub_positions:
        # Raises KeyError if an enriched position never occurs in occ_values.
        print(round(norm_occ[k],2),k)


In [None]:
# GENERATE A FULL DATAFRAME FOR THE LABELED POSITIONS AND THEIR RESPECTIVE AA FREQUENCIES FOR LOGOPLOTS
# One matrix per SSE: rows are the sorted position labels, columns follow aastr.
sse_aa_freqs = {}
aastr = 'ACDEFGHIKLMNPQRSTVWYX'
cols = {aa:i for i,aa in enumerate(aastr)}
for sse in newkeys:
    sse_dict = label_seq[sse]
    aafreqs = np.zeros(shape=(len(sse_dict.keys()), len(aastr)))
    for p_index, pos in enumerate(sorted(sse_dict.keys())):
        aas, freq = np.unique(np.array(sse_dict[pos]), return_counts=True)
        for i, aa in enumerate(aas):
            # Letters outside aastr (e.g. B, Z, U) used to raise KeyError;
            # they are now counted in the 'X' column. "+=" keeps the count
            # correct when several such letters occur at the same position.
            # NOTE(review): 14435 is a hard-coded divisor, presumably the
            # total number of domains - confirm.
            aafreqs[p_index, cols.get(aa, cols['X'])] += freq[i]/14435
    sse_aa_freqs[sse] = aafreqs


In [None]:
# Print the sequence composition for each cancer-enriched-position:
# the three most frequent residues with their share of the position's total.

for sse in newkeys:
    sub_positions = [k for k in enriched_positions if k.startswith(f'{sse}.')]
    lframe = pd.DataFrame(data=sse_aa_freqs[sse], columns=[c for c in aastr], index = sorted(plddt_values[sse].keys()))
    for pos in sub_positions:
        # Row label = number after the "." (not limited to two digits).
        idx = int(pos.rsplit('.', 1)[1])
        row = lframe.loc[[idx]].iloc[0]
        freqs = row.to_list()
        total_freqs = sum(freqs)
        if total_freqs == 0:
            print(pos, 'no residues recorded')
            continue

        # Normalize each frequency with the total sum and rank the residues.
        # Ranking (residue, share) pairs keeps residues with identical shares
        # apart; a dictionary keyed by the share silently dropped all but one
        # of them.
        shares = [(aa, round(freq/total_freqs, 5)) for aa, freq in zip(row.index, freqs)]
        ranked = sorted(shares, key=lambda item: item[1], reverse=True)
        xstring = ''.join(f'{aa}:{round(share*100)}%,' for aa, share in ranked[:3])
        print(pos, xstring)

# Disabled code: the loop below is a bare string literal and is not executed.
"""for sse in newkeys:
    for pos in enriched_positions:
        datarow = center_res[pos]
        occ, residues = parse_conservation(datarow, all_base.length)
        print(f"{sse}\t{occ}%\t{residues}\n")"""

In [None]:
# LOGOPLOTS FOR THE ELEMENTS
# NOTE(review): `logomaker` is used below, but its import is commented out in
# the dependency cell at the top of the notebook - re-enable it before running.
# NOTE(review): the figures are closed without being shown or saved (the
# savefig line is commented out) - confirm that this is intended.

from matplotlib.ticker import FixedLocator

for sse in newkeys:

    lframe = pd.DataFrame(data=sse_aa_freqs[sse], columns=[c for c in aastr], index = sorted(plddt_values[sse].keys()))

    # Note down the first and last row where the occupation threshold is met.
    # Both markers are reset for every SSE; otherwise lastval would be unbound
    # on the first SSE without a qualifying row, or silently carry over from
    # the previous SSE.
    firstval = None
    lastval = None
    for i, r in lframe.iterrows():
        if np.sum(r) > 0.05:
            if firstval is None:
                firstval = i
            lastval = i
    print(firstval, lastval)
    if firstval is None:
        # No position reaches the occupation threshold: nothing to draw.
        continue
    subframe = lframe.truncate(before=firstval, after=lastval)
    #x_offset = sorted(plddt_values[sse].keys())[0]

    fig, ax = plt.subplots(figsize=[5,2])
    cons_logo = logomaker.Logo(subframe,
                                ax=ax,
                                color_scheme='chemistry',
                                show_spines=False,
                                font_name='FreeSans')

    fig.set_facecolor('w')
    ax.xaxis.set_minor_locator(MultipleLocator(1)) #AutoMinorLocator())
    ax.xaxis.set_major_locator(FixedLocator([a for a in range(2,100,3)]))#MultipleLocator(3))
    ax.tick_params(which='both', width=2)
    ax.tick_params(which='major', length=8)
    ax.tick_params(which='minor', length=6)
    ax.set_xticklabels([f'{sse}.{str(int(v))}' for v in ax.get_xticks()], rotation=90)
    cons_logo.draw()
    fig.tight_layout()
    #plt.savefig(f"../fig/conslogo_{sse}.svg", bbox_inches='tight')
    plt.close(fig)

In [None]:
# Create the indexing for every human GAIN domain, writing the output to the
# given folder, and print the first two return values.
# NOTE(review): outdir is an absolute, machine-specific path.
for i, gain in enumerate(human_collection.collection):
    # The body keeps the extra indentation of the disabled filter below.
    #if "Q6QNK2" in gain.name:
        x1, x2,_,_ = gain.create_indexing(anchors, anchor_occupation, anchor_dict, 
                    outdir = "/home/hildilab/projects/agpcr_nom/human_31/indexing_files_s2_dsp",
                    #offset = fasta_offsets[i]-gain.start+1,
                    split_mode='double')
        
        print(x1, x2)

In [None]:
# A CELL FOR SPECIFICALLY EVALUATING INDIVIDUAL GAIN DOMAINS.
# MOSTLY FOR DEBUGGING OR CHECKING OUT ANOMALITIES
#[('G1SHT5-G1SHT5_RABIT-AGRG6-Oryctolagus_cuniculus', 'REANEVASEILNLTADGQNLTSANITSIVEQVKRIVNKEENIDVTLGSTLMNIFSNILSNSDSDLLESSSEALKTIDELAFKIDLNSTPHVNIATRNLALGVSSVSPGTNVISNFSIGLPSNNESYFQMDFESGQVDPLASVILPPNLLENLSQEDSILVRRAQFTFFNKTGLFQDVGPQRKTLVSYVMACSIGNITIQDLKDPVQIKIKHTRTQEVHHPICAFWDLNKNKSFGGWNTSGCIAHRDSDASETICLCNHFTHFGVLMD')
# ('G1SQ79-G1SQ79_RABIT-AGRG4-Oryctolagus_cuniculus', 'LQGLPDKILDLANITVSDENANDVAEHILNLLNESPPLDEEETKIIVSKVSDISLCEKISMNLTQLMLQIINAVLEKQNDSASGLHEVSNEILRLIERAGHKMEFWGRTANLMVARLALAMLRVDHKFEGVTFSIRSYTEGTDPEIYLGDVPAGKVLASIYLPKSLKKRLRVNNLQTILFNFFGQTSLFKVKNVSKALTTYVVSASISDLSIQNLADPVVITLQHVEGSQKYDQVHCAFWDFEKNNGLGGWNSSGCKVKETNVNYTICQCDHLTHFGVLMDL')
# ('G1T5U9-G1T5U9_RABIT-AGRL2-Oryctolagus_cuniculus'
# Select one sequence by accession and build a single GainDomain for it.
tar = ["A0A151X191"]
tar_seqs = filter_by_list(valid_seqs, tar)
print(tar_seqs)
Xalignment_file = "/home/hildilab/projects/agpcr_nom/big_mafft14792.fa" # This is a combined alignment of ALL sequences in ALL queries!
Xquality_file = "/home/hildilab/projects/agpcr_nom/big_mafft14792.jal"  # ^ corresponding quality file
Xquality = sse_func.read_quality(Xquality_file)
Xgps_minus_one = 19258 #appended_big_mafft: 21160  #big_mafft14792: 19258
Xaln_cutoff = 19822 #appended_big_mafft: 21813 #big_mafft14792: 19822
Xstride_files = glob.glob("/home/hildilab/projects/agpcr_nom/all_gps_stride/*")
# Read the alignment from the X-prefixed file/cutoff defined above, so that it
# matches what GainDomain receives below. The un-prefixed alignment_file and
# aln_cutoff used before are not defined in this cell.
Xalignment_dict = sse_func.read_alignment(Xalignment_file, Xaln_cutoff)
# Raises IndexError if the accession was not found in valid_seqs.
Xname, Xseq = tar_seqs[0]
# NOTE(review): this STRIDE file is named after A0A369RSM9, while `tar`
# selects A0A151X191 - confirm that the mismatch is intended.
tar_stride = "/home/hildilab/projects/agpcr_nom/all_gps_stride/A0A369RSM9-A0A369RSM9_9METZ-AGRL3-Trichoplax_sp._H2.stride"
tar_gain = GainDomain(alignment_file = Xalignment_file,
                                        aln_cutoff = Xaln_cutoff,
                                        quality = Xquality,
                                        gps_index = Xgps_minus_one,
                                        name=Xname,
                                        #stride_files = Xstride_files,
                                        sequence = Xseq,
                                        alignment_dict = Xalignment_dict,
                                        explicit_stride_file=tar_stride,
                                        is_truncated = True,
                                        coil_weight = 0.01, # TESTING
                                        stride_outlier_mode = True,
                                        without_anchors=True)
#print(tar_gain.isValid)
#if tar_gain.isValid: 
#    tar_gain.plot_helicality(savename=f"{tar_gain.name.split('-')[0]}_helicality.full.svg", debug=True)
#    tar_gain.plot_profile()
#print(tar_gain.subdomain_boundary, tar_gain.start, tar_gain.end)
    #print(gain.name, gain.start, gain.end, gain.sequence, gain.index, gain.subdomain_boundary)
    #pdb_out = root_path+"human_32/trunc_pdbs/"+gain.name+"_gain.pdb"
    #ac = gain.name.split("-")[0]
    #found_pdb = [pdb for pdb in pdb_list if ac in pdb]
    #target_pdb = found_pdb[0]
    #gain.write_gain_pdb(target_pdb, pdb_out) """


In [None]:
# Collect the alignment columns of all tryptophan residues inside the "H6"
# element of every domain.
# NOTE(review): `all_base` is not defined in any visible cell.
aln_col = []

for idx, idx_dir in enumerate(all_base.indexing_dirs):
    try:
        h6_bounds = idx_dir['H6']
    except KeyError:
        # Domain without an H6 element. Only the missing key is skipped now;
        # any other error is no longer swallowed.
        continue
    print(h6_bounds)

    domain = valid_collection.collection[idx]
    for i in range(h6_bounds[0], h6_bounds[1]+1):
        if domain.sequence[i] == 'W':
            aln_col.append(domain.alignment_indices[i])
            print(domain.sequence[i], domain.alignment_indices[i])

print(len(aln_col), np.unique(np.array(aln_col), return_counts=True))


In [None]:
# STATISTICS FOR ELEMENT-CONNECTING LOOPS
# Prints key, number of entries, mean and standard deviation per loop.
# NOTE(review): loop_lengths is only created in a later cell and is never
# filled by any visible code - this cell depends on execution order.
# A plain loop replaces the list comprehension that was used only for its
# side effects (it also made the cell display a list of None values).
for loop_name, lengths in loop_lengths.items():
    print(loop_name, len(lengths), round(np.average(lengths),2), round(np.std(lengths),2))

In [None]:
def get_loop_stats(indexing_dir, sequence):
    """Locate the loops between consecutive SSEs and cut out their sequences.

    Elements whose key contains "GPS" are ignored. The remaining elements are
    ordered by their start index, and for each neighbouring pair a loop named
    "<n-terminal label>-<c-terminal label>" is recorded.

    Parameters
    ----------
    indexing_dir : dict
        Maps an SSE label to an indexable whose items 0 and 1 are start and end.
    sequence : sequence
        Residues of the domain, sliced with the SSE indices.

    Returns
    -------
    tuple of dict
        ``loop_loc`` maps the loop name to ``(end of the preceding SSE,
        start of the following SSE - 1)``; ``loop_dir`` maps it to
        ``sequence[end of preceding SSE + 1 : start of following SSE]``.
        NOTE(review): the first item of ``loop_loc`` is the last residue of
        the preceding SSE, one before the first residue of the returned
        slice - confirm that this offset is intended.
    """
    # start index -> (end index, label) for every non-GPS element
    sse_by_start = {}
    for label, bounds in indexing_dir.items():
        if "GPS" in label:
            continue
        sse_by_start[bounds[0]] = (bounds[1], label)

    starts = sorted(sse_by_start)
    loop_loc, loop_dir = {}, {}
    # Walk over neighbouring elements, from the N- to the C-terminal side.
    for prev_start, next_start in zip(starts, starts[1:]):
        n_end, n_label = sse_by_start[prev_start]
        c_label = sse_by_start[next_start][1]
        loop_name = f"{n_label}-{c_label}"
        loop_loc[loop_name] = (n_end, next_start-1)
        loop_dir[loop_name] = sequence[n_end+1:next_start]
    return loop_loc, loop_dir

# Collect, per loop name, one {"name", "sequence"} entry for every domain.
# NOTE(review): loop_lengths, loop_seqs and loop_seq stay empty - the lines
# that would fill them are commented out below. `all_base` is not defined in
# any visible cell.
loop_lengths = {}
loop_seqs = {}
loop_seq = {}

loop_info = {}
#[loop_info[loop] = {} for loop in loop_seqs.keys()] # into each of these keys, any entry is composed of "name":$name, "sequence":$seq

for idx in range(all_base.length):
    curr_name = all_base.names[idx]
    start = valid_collection.collection[idx].start
    i_loc, i_dir = get_loop_stats(all_base.indexing_dirs[idx], valid_collection.collection[idx].sequence)
    for k, seq in i_dir.items():
        if k not in loop_info.keys():
            loop_info[k] = []
        # Entry name: "<domain name>_<from>-<to>", both bounds shifted by the
        # domain start.
        loop_info[k].append({'name':f'{all_base.names[idx]}_{i_loc[k][0]+start}-{i_loc[k][1]+start}', 'sequence':''.join(seq)})
    #print(i_len)
    #loop_lengths = match_dirs(i_len, loop_lengths)
    #loop_seqs = match_dirs(i_dir, loop_seqs)



In [None]:
# Write the collected loop sequences to a FASTA file for later alignment.
def loop2fasta(outfile, itemlist):
    """Write loop entries to a FASTA file.

    Parameters
    ----------
    outfile : str
        Path of the FASTA file; an existing file is overwritten.
    itemlist : iterable of dict
        Entries with the keys 'name' (header text) and 'sequence'.
    """
    records = [f">{entry['name']}\n{entry['sequence']}\n" for entry in itemlist]
    with open(outfile, 'w') as out:
        out.writelines(records)
    print("Done with", outfile)

# One FASTA file per loop name in ../loops/.
for loop, entries in loop_info.items():
    loop2fasta(f"../loops/{loop}.fa", entries)


In [None]:
# Get all structures containing at least 7 helices (labels containing "H")
# and write their names to 7helix.txt, one per line.

mystring = ''
no=0
for gain in valid_collection.collection:
    try:
        sse_labl, _, _, _ = gain.create_indexing(silent=True,anchors=anchors, anchor_occupation=anchor_occupation, anchor_dict=anchor_dict, debug=False)
    except Exception:
        # Best effort: domains that cannot be indexed are counted and skipped.
        # Unlike the bare "except:" used before, this does not swallow
        # KeyboardInterrupt or SystemExit.
        no += 1
        continue
    kk = len([k for k in sse_labl.keys() if "H" in k])
    if kk >= 7:
        mystring += gain.name+"\n"
print(no)
print(mystring)
with open("7helix.txt", 'w') as seven:
    seven.write(mystring)

In [None]:
# Number of selected domains. Every name in mystring is followed by "\n", so
# split("\n") would count one extra, empty item.
print(len(mystring.splitlines()))

In [None]:
# Strand statistics over gain.sdb_sheets: domains with 15 or 16 strands are
# written to 14plussheet.txt, and the distribution of strand counts is printed.
# NOTE(review): the original heading said "14+ Strands in Subdomain A", but
# the condition selects 15-16 strands from sdb_sheets - confirm the intent.

mystring = ''
no=0
# Size the arrays by the collection. The previously hard-coded 14432 slots
# added spurious zero counts for smaller collections and would overflow for
# larger ones.
n_domains = len(valid_collection.collection)
num_sheets = np.zeros(shape=(n_domains))
total_strand = np.zeros(shape=(n_domains))
for i, gain in enumerate(valid_collection.collection):
    num_sheets[i] = len(gain.sdb_sheets)
    if 14 < len(gain.sdb_sheets) < 17:
        mystring += gain.name+"\n"

    # Sum of (end - start) over all strands of the domain.
    total_strand[i] = np.sum([strand[1]-strand[0] for strand in gain.sdb_sheets])
print(no)
print(np.unique(num_sheets, return_counts=True))

#print(np.unique(total_strand, return_counts=True, return_index=True))
with open("14plussheet.txt", 'w') as seven:
    seven.write(mystring)

In [None]:
# Export start, subdomain boundary and end of every GAIN domain as strings,
# keyed by the name up to the first "-".
import json

extents = {
    # +1 makes the values compatible with ONE-indexed PDBs.
    gain.name.split("-")[0]: [str(gain.start+1), str(gain.subdomain_boundary+1), str(gain.end+1)]
    for gain in valid_collection.collection
}

with open('domain_extents.json', 'w') as j:
    json.dump(extents, j)
