In [None]:
# ANALYSIS FOR BRANCH PKD - polycystins without alignment
# DEPENDENCIES
import glob
from shutil import copyfile
import pickle
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import (MultipleLocator, FixedLocator)
#import logomaker
# LOCAL IMPORTS
from gaingrn.utils.indexing_classes import GPCRDBIndexing
from gaingrn.utils.gain_classes import GainDomain, GainCollection, GainDomainNoAln
import gaingrn.utils.io
import gaingrn.utils.alignment_utils
import gaingrn.utils.structure_utils

In [None]:
# Inputs for the polycystin (PKD) run: the multi-sequence FASTA and
# everything found in the STRIDE output folder.
gain_fasta = "../../data/pkd/all_pkd_gain.fa"
stride_pattern = "../../data/pkd/pkd_stride/*"

valid_seqs = gaingrn.utils.io.read_multi_seq(gain_fasta)

# NOTE(review): the original comment said this folder only contains the
# files for truncated PDBs, marked with a "(?)" — TODO confirm.
stride_files = glob.glob(stride_pattern)
n_stride_files = len(stride_files)
print(n_stride_files)

In [None]:
# Re-offset every sequence so that it matches the exact PDB indexing.
# Each short sequence is located inside its full-length counterpart and the
# corresponding slice of the full-length sequence is stored instead.
f_seqs = gaingrn.utils.io.read_alignment("../../../all_pkds.fa")
# Key the full-length sequences by the part of the header before the first "-".
full_seqs = {header.split("-")[0]: sequence for header, sequence in f_seqs.items()}
print(full_seqs.keys())

valid_adj_seqs = []
for entry in valid_seqs:
    # entry[0] is the header, entry[1] the sequence string.
    name = entry[0].split("-")[0]
    short_seq = entry[1]
    long_seq = full_seqs[name]
    n_res = len(short_seq)
    # presumably the start position of short_seq within long_seq — verify
    # against gaingrn.utils.alignment_utils.find_the_start
    x = gaingrn.utils.alignment_utils.find_the_start(longseq=long_seq, shortseq=short_seq)

    if x != 0:
        # Slice of n_res residues beginning one position before x.
        valid_adj_seqs.append((name, long_seq[x - 1:x + n_res - 1]))
        continue

    # x == 0: there is no preceding residue to shift onto, so the slice is
    # taken from the very beginning and is one residue shorter.
    print("already 1st res.\n", name, short_seq[:10], short_seq[-10:])
    leading_part = long_seq[:n_res - 1]
    valid_adj_seqs.append((name, leading_part))
    print(leading_part)


In [None]:
# Assemble the GainCollection from the STRIDE files and the re-offset
# sequences (valid_adj_seqs is used here instead of the raw valid_seqs).
collection_settings = dict(
    stride_files=stride_files,
    sequence_files=None,
    sequences=valid_adj_seqs,
    is_truncated=True,
    coil_weight=0.00,  # TESTING
    # domain_threshold=20 was also tried during TESTING and is left disabled
    stride_outlier_mode=True,
    no_alignment=True,
    debug=False,
)
valid_collection = GainCollection(**collection_settings)
# Alternative to rebuilding: valid_collection = pd.read_pickle("../valid_collection.pkl")

In [None]:
# Persist the collection so later sessions can reload it without rebuilding.
# The context manager guarantees the file handle is flushed and closed, even
# if pickling raises; HIGHEST_PROTOCOL is what the former "-1" selected.
with open('../../data/pkd/pkd_collection.pkl', 'wb') as pkl_file:
    pickle.dump(valid_collection, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# For every GAIN domain in the collection, pick its best-ranked (rank_1) PDB
# model and write a copy truncated to gain.start..gain.end.
pdbs = glob.glob("../pkd_pdbs/*_rank_1_*.pdb")
print(f"Found {len(pdbs)} best ranked models in target directories.")
valid_ct = 0

for gain in valid_collection.collection:
    name = gain.name
    # Models are matched by substring: the part of the name before the first
    # underscore must occur somewhere in the PDB path. The first hit is used.
    pdb_key = name.split("_")[0]
    tar_pdb = next((p for p in pdbs if pdb_key in p), None)
    if tar_pdb is None:
        # Same exception type as the former "[...][0]" lookup, but it now
        # says which domain has no model instead of "list index out of range".
        raise IndexError(
            f"No rank_1 PDB model matching {pdb_key!r} (GAIN domain {name!r}) "
            f"found in ../pkd_pdbs/"
        )
    new_pdb_path = f'../trunc_pkd_pdbs/{gain.name.split("-")[0]}.pdb'
    gaingrn.utils.structure_utils.truncate_pdb(gain.start, gain.end, tar_pdb, new_pdb_path)
    # Count only after the truncation succeeded, so the summary is accurate.
    valid_ct += 1
print("Copied and truncated", valid_ct, "GAIN domains.")
    

In [None]:
# Histogram of the secondary-structure elements across the collection; the
# title reports how many re-offset sequences went into it.
hist_title = f"Polycystins (Total: {len(valid_adj_seqs)})"
valid_collection.plot_sse_hist(
    title=hist_title,
    n_max=26,
    savename="../pkd_data/pkd_sse",
)