This is dssp_analysis.ipynb, a notebook for determining whether DSSP or STRIDE is better suited for assessing the segment / secondary structure composition of GAIN domains.

In [None]:
# build a database from a dataset for nomenclating
# INPUT: a collection of GAIN domain PDBs, their sequences as one large ".fa" file
from gaingrn.scripts.gain_classes import GainDomain, GainCollection, Anchors, GPS
import sse_func
import numpy as np
import glob
#import multiprocessing as mp
#from subprocess import Popen, PIPE
from tqdm import tqdm
from matplotlib import pyplot as plt
from shutil import copyfile
import math
import gaingrn.scripts.alignment_utils

In [None]:
def filter_by_receptor(sequences, selection):
    '''
    Keep the entries whose first item contains the given selection.
    Parameters:
        sequences : iterable, required
            Entries that can be indexed with [0]; only "selection in entry[0]" is evaluated.
        selection : str, required
            Value searched for in the first item of every entry.
    Returns:
        new_list : list
            The matching entries, unchanged and in their original order.
    '''
    return [entry for entry in sequences if selection in entry[0]]

def filter_by_list(sequences, selection):
    '''
    Keep the entries whose first item contains at least one of the selection items.

    Every entry is returned at most once, even if several selection items occur
    in its first item (e.g. a name that lists more than one accession).
    Parameters:
        sequences : iterable, required
            Entries that can be indexed with [0]; only "item in entry[0]" is evaluated.
        selection : list, required
            Items to search for in the first item of every entry.
    Returns:
        new_list : list
            The matching entries, unchanged and in their original order.
    '''
    return [seq_tup for seq_tup in sequences
            if any(item in seq_tup[0] for item in selection)]

def read_dssp(file):
    '''
    Parse a per-residue DSSP assignment file into a list of one-letter assignments.
    Every line is split on whitespace; the second token is taken as the assignment.
    Lines with fewer than two tokens (including blank lines) contribute an empty string,
    so the returned list has exactly one entry per line of the file.
    Parameters:
        file : str, required
            Path of the DSSP file to be read.
    Returns:
        sse_sequence : list
            LIST with one assignment string per line, in file order.
    '''
    with open(file) as handle:
        rows = [line.split() for line in handle]
    return [tokens[1] if len(tokens) > 1 else '' for tokens in rows]

def build_dssp_dict(file):
    '''
    Build a rudimentary SSE dictionary from a per-residue DSSP assignment file.

    Every line is split on whitespace. The second token (if present) is the one-letter
    assignment of that line; lines with fewer than two tokens count as unassigned.
    The residue number of the first line is derived from the first token of the file by
    dropping its first character and converting the rest to int. Line k (zero-based) is
    treated as residue first_index + k, i.e. the lines are taken as consecutive residues.

    Runs of identical assignments are collected as (first_res, last_res) tuples with BOTH
    bounds inclusive. A run reaching the last line of the file is included as well.
    Assignments that are not a key of the lookup table below are skipped.
    Parameters:
        file : str, required
            Path of the DSSP file to be read.
    Returns:
        sse_dict : dict
            {element name : [(first_res, last_res), ...]}, tuples in file order.
            Empty if the file contains no tokens at all.
    '''
    dssp_keys = { "H" : "AlphaHelix",
                  "B" : "Bridge",
                  "E" : "Strand",
                  "G" : "310Helix",
                  "I" : "5Helix",
                  "T" : "Turn",
                  "S" : "Bend"     }

    # Read the file once; both the first residue number and the assignments come from it.
    with open(file) as f:
        rows = [line.split() for line in f]

    # First whitespace-separated token of the whole file.
    first_label = next((tokens[0] for tokens in rows if tokens), None)
    if first_label is None:
        return {}
    first_index = int(first_label[1:])

    sse_list = [tokens[1] if len(tokens) > 1 else '' for tokens in rows]

    sse_dict = {}
    run_start = 0
    n_lines = len(sse_list)
    # idx runs one past the end so that the final run is closed as well.
    for idx in range(1, n_lines + 1):
        if idx < n_lines and sse_list[idx] == sse_list[run_start]:
            continue
        element = sse_list[run_start]
        if element in dssp_keys:
            # idx is the first line NOT belonging to the run, hence idx - 1 as the last residue.
            sse_tuple = (run_start + first_index, idx - 1 + first_index)
            sse_dict.setdefault(dssp_keys[element], []).append(sse_tuple)
        run_start = idx
    return sse_dict

In [None]:
# Compare the DSSP and the STRIDE segments for every STRIDE file of the data set.
# Single-file check of the DSSP parser:
#   print(build_dssp_dict('/home/hildilab/projects/agpcr_nom/dssp4/test.dssp'))
strides = glob.glob('/home/hildilab/projects/agpcr_nom/human_31_0/sse/*.stride')
for f in strides:
    print("\n", f)
    # The DSSP file name is derived from the STRIDE file name by swapping the suffix.
    dssp_path = f.replace(".stride", "_gain.dssp")
    dssp_list = read_dssp(dssp_path)
    with open(dssp_path) as g:
        data = g.read()
    # Residue labels carry one leading character in front of the residue number.
    first_index = int(data.split()[0][1:])
    # The last label is read from the second-to-last "\n"-separated chunk of the file,
    # i.e. this relies on the file ending with a newline.
    last_index = int(data.split("\n")[-2].split("\t")[0][1:])

    stride_list, _ = sse_func.read_sse_asg(f)
    dssp_dict = build_dssp_dict(dssp_path)
    stride_dict = sse_func.cut_sse_dict(first_index, last_index, sse_func.read_sse_loc(f))
    # Only element types present in both dictionaries are printed.
    for ki in dssp_dict:
        if ki not in stride_dict.keys():
            continue
        print(ki)
        print("DSSP\t", dssp_dict[ki])
        print("STRIDE\t",stride_dict[ki])

        

In [None]:
# Load the inputs used to build the GainCollection objects further down:
# sequences, alignment quality, the list of STRIDE files and the alignment itself.
valid_seqs = sse_func.read_multi_seq("/home/hildilab/projects/agpcr_nom/app_gain_gain.fa")
print(len(valid_seqs))
quality_file = "/home/hildilab/projects/agpcr_nom/app_gain_gain.mafft.jal"
alignment_file = "/home/hildilab/projects/agpcr_nom/app_gain_gain.mafft.fa"
# NOTE(review): stride_folder is not referenced again in the visible part of this notebook.
stride_folder = "/home/hildilab/projects/agpcr_nom/all_gps_stride"
#stride_files = glob.glob("/home/hildilab/projects/agpcr_nom/all_gps_stride/*")
stride_files = glob.glob("/home/hildilab/projects/agpcr_nom/sigmas/sigma_2/*")
quality = sse_func.read_quality(quality_file)
gps_minus_one = 6781 # Jalview column 6782 minus one, because Jalview columns are ONE-INDEXED
aln_cutoff = 6826 # passed to read_alignment and GainCollection; presumably the last alignment column considered - TODO confirm
alignment_dict = sse_func.read_alignment(alignment_file, aln_cutoff)

In [None]:
# Build the collection over all sequences in valid_seqs from the alignment, its
# quality data and the STRIDE files defined in the previous cell.
valid_collection = GainCollection(  alignment_file = alignment_file,
                                    aln_cutoff = aln_cutoff,
                                    quality = quality,
                                    gps_index = gps_minus_one,
                                    stride_files = stride_files,
                                    sequence_files=None,
                                    sequences=valid_seqs,
                                    alignment_dict = alignment_dict,
                                    is_truncated = True,
                                    stride_outlier_mode=True)
# Disabled: per-domain indexing call without arguments.
#for gain in valid_collection.collection:
#    gain.create_indexing()

In [None]:
# Copy the files belonging to a selection of sequences out of the overall data set
def grab_selection(parse_string, stride_path, pdb_list, sequences, profile_path, target_dir, seqs=None):
    '''
    Collect the PDB files, STRIDE files and profile images of a selection of sequences and
    copy them into sub-directories of target_dir; write one FASTA file per selected sequence.

    Files are matched by the part of the sequence name before the first "-", which has to
    occur as a substring in the file path. Paths are joined by plain string concatenation,
    so stride_path, profile_path and target_dir need a trailing "/". The sub-directories
    "profiles/", "strides/", "pdbs/" and "seqs/" of target_dir are not created here.
    Parameters:
        parse_string : str, required
            Case-insensitive substring selecting sequences by name; only evaluated when
            seqs is None, but always shown in the final message.
        stride_path : str, required
            Directory prefix searched for "*.stride" files.
        pdb_list : list, required
            Candidate PDB file paths.
        sequences : list, required
            (name, sequence) entries to select from when seqs is None.
        profile_path : str, required
            Directory prefix searched for "*.png" files.
        target_dir : str, required
            Directory prefix of the copy targets.
        seqs : list, optional
            Pre-selected (name, sequence) entries; if given, parse_string filtering is skipped.
    Returns:
        None
    '''
    if seqs is not None:
        sub_seqs = seqs
    else:
        sub_seqs = [seq for seq in sequences if parse_string.lower() in seq[0].lower()]
    print(f"Found {len(sub_seqs)} sequences.")
    all_strides = glob.glob(stride_path+"*.stride")
    all_profiles = glob.glob(profile_path+"*.png")

    # Identifier of every selected sequence: the name up to the first "-".
    accessions = [seq[0].split("-")[0] for seq in sub_seqs]

    def _matching(paths):
        # One hit per (identifier, path) pair, grouped by identifier in selection order.
        return [path for ac in accessions for path in paths if ac in path]

    sub_profiles = _matching(all_profiles)
    sub_strides = _matching(all_strides)
    sub_pdbs = _matching(pdb_list)

    copy_jobs = (("profiles/", sub_profiles),
                 ("strides/", sub_strides),
                 ("pdbs/", sub_pdbs))
    for subdir, paths in copy_jobs:
        for path in paths:
            copyfile(path, target_dir+subdir+path.split("/")[-1])

    for name, sequence in ((seq[0], seq[1]) for seq in sub_seqs):
        sse_func.write2fasta(sequence+"\n", name, target_dir+"seqs/"+name+".fa")

    print(f"Copied {len(sub_pdbs)} PDB files, {len(sub_strides)} STRIDE files,",
          f" {len(sub_profiles)} Profiles and {len(sub_seqs)} Sequences",
          f"for Selection {parse_string}")
    
# Project paths used by the following cells; all are built from root_path by string concatenation.
root_path = "/home/hildilab/projects/agpcr_nom/"
profile_path = root_path+"all_gps_profiles/"
# PDB files whose name contains "rank_1_", from every batch* folder inside the all_gps* folders.
pdb_list = glob.glob(f"{root_path}all_gps*/batch*/*rank_1_*.pdb")
print(len(pdb_list))
#valid_seqs
target_dir = root_path+"human/"

In [None]:
# The 32 identifiers used to select the human subset of valid_seqs.
human_seqs = ["Q9HBW9","O60241","Q6QNK2","Q9UHX3","Q5T601","Q96PE1","O60242","Q86SQ4",
                "O94910","Q8IWK6","Q8IZP9","Q8WXG9","Q86Y34","O95490","Q14246","Q9BY15",
                "Q8IZF2","Q86SQ3","Q8IZF6","Q96K78","Q8IZF3","Q8IZF7","Q8IZF5","Q7Z7M1",
                "Q8IZF4","Q9HCU4","Q9NYQ6","Q9NYQ7","Q9HAR2","O14514","P48960",
                "Q9Y653"]
# Relative pattern: resolved against the current working directory of the notebook.
sigma_2_strides = glob.glob("sigma_2*/*.stride")
list_32 = filter_by_list(valid_seqs, human_seqs)

# Same construction as valid_collection, restricted to the human sequences and the sigma_2 STRIDE files.
human_collection = GainCollection( alignment_file = alignment_file,
                                        aln_cutoff = aln_cutoff,
                                        quality = quality,
                                        gps_index = gps_minus_one,
                                        stride_files =  sigma_2_strides, #stride_files,
                                        sequence_files=None,
                                        sequences=list_32,
                                        alignment_dict = alignment_dict,
                                        is_truncated = True,
                                        stride_outlier_mode = True
                                         )
#print(len(human_collection.collection))
# The string literal below is disabled code (writing truncated GAIN PDB files). It is not
# executed; being the last expression of the cell, it is only evaluated as a string.
"""for gain in human_collection.collection:
    #print(gain.name, gain.start, gain.end, gain.sequence, gain.index, gain.subdomain_boundary)
    pdb_out = root_path+"human/trunc_pdbs/"+gain.name+"_gain.pdb"
    ac = gain.name.split("-")[0]
    found_pdb = [pdb for pdb in pdb_list if ac in pdb]
    target_pdb = found_pdb[0]
    gain.write_gain_pdb(target_pdb, pdb_out)"""

In [None]:
# Call plot_sse_hist on the human collection with a fixed title, n_max and save name.
human_collection.plot_sse_hist(
    title="Receptor group: HUMAN_31 (Total: 31)",
    n_max=16,
    savename="human_31.s2_newAnch",
)

In [None]:
#dir(valid_collection)
# occupancy: one counter per alignment column below aln_cutoff, incremented at the positions
#            listed in gain.alignment_indices (presumably the alignment column of every
#            residue of the domain - TODO confirm).
# sse_matrix: one row per GAIN domain, one column per alignment column; 0 by default.
occupancy = np.zeros([aln_cutoff],dtype=int)
sse_matrix = np.zeros([len(valid_collection.collection), aln_cutoff])
for i, gain in enumerate(valid_collection.collection):
    #print(gain.sda_helices, gain.sdb_sheets)
#    for res_id in range(gain.start,gain.end+1):
    occupancy[gain.alignment_indices] += 1
    
    # Entries of sda_helices are marked with -1; element bounds are treated as inclusive.
    for helix in gain.sda_helices:
        for res_id in range(helix[0],helix[1]+1):
            sse_matrix[i,gain.alignment_indices[res_id]] = -1
    # Entries of sdb_sheets are marked with +1; element bounds are treated as inclusive.
    for sheet in gain.sdb_sheets:
        for res_id in range(sheet[0],sheet[1]+1):
            sse_matrix[i,gain.alignment_indices[res_id]] = 1

In [None]:
# Determine the anchors of the full collection and mark each one in the current plot.
anchors, anchor_occupation = valid_collection.find_anchors(cutoff=3000)
print(anchors)
for i, anchor in enumerate(anchors):
    print(i, anchor)
    # Two colors: anchors below the alignment subdomain boundary vs. all others.
    color = '#1f77b4' if anchor < valid_collection.alignment_subdomain_boundary else '#ff7f0e'
    plt.scatter(
        anchor,
        valid_collection.anchor_hist[anchor]+1000,
        c=color,
        marker="1",
        s=60,
    )
print(anchors, anchor_occupation)
print(valid_collection.alignment_subdomain_boundary)
anchor_dict = sse_func.make_anchor_dict(
    anchors,
    valid_collection.alignment_subdomain_boundary,
)

In [None]:
class Indexing:
    '''
    A modified Indexing class similar to the indexing_classes.py Indexing, however specific
    for DSSP data integration.

    For every GAIN domain of a collection, gain.create_indexing() is called and its two
    result dictionaries (segment boundaries and segment centers) are stored. All results
    are then merged into one table with one row per domain and one column per key.

    Attributes:
        indexing_dirs, center_dirs : object arrays holding the two dictionaries per domain
        names      : object array of domain names (with the CELSR name patched to "AGRC")
        length     : number of domains
        offsets    : int array, gain.start of every domain
        fasta_offsets : int array, additional per-domain offset added to all residue numbers
        accessions : list, first identifier of every domain name
        sequences  : list, domain sequences joined to strings
        total_keys, center_keys : sorted keys found in the boundary / center dictionaries
        data_header : str, comma-joined column names
        data_matrix : object array (length x columns) of strings, '' where a key is absent
    '''
    def __init__(self, aGainCollection, fasta_offsets=None, split_mode='single', anchor_data=None):
        '''
        Parameters:
            aGainCollection : object with a "collection" list of GAIN domains, required
            fasta_offsets : sequence of int, optional
                Per-domain offsets; gain.start is subtracted from each value because the
                offsets do not account for the domain not starting at residue 0.
                If None, no additional offset is applied.
            split_mode : str, optional
                Passed through to gain.create_indexing().
            anchor_data : tuple, optional
                (anchors, anchor_occupation, anchor_dict) passed to gain.create_indexing().
                If None, the notebook-level variables of these names are used.
        '''
        gains = aGainCollection.collection
        length = len(gains)
        names = np.empty([length], dtype=object)
        indexing_dirs = np.empty([length], dtype=object)
        center_dirs = np.empty([length], dtype=object)
        offsets = np.zeros([length], dtype=int)
        total_keys = set()
        center_keys = set()

        if fasta_offsets is None:
            # Integer zeros: a float array would turn the segment centers into "13.0"-style strings.
            self.fasta_offsets = np.zeros([length], dtype=int)
        else:
            # The existing FASTA offsets do not account for the residue starting not at 0,
            # therefore the value of the starting res (gain.start) needs to be subtracted.
            self.fasta_offsets = np.array(
                [fasta_offsets[i] - gain.start for i, gain in enumerate(gains)],
                dtype=int)

        for gain_index, gain in enumerate(gains):
            if anchor_data is None:
                # Looked up lazily so that an empty collection does not need the globals.
                anchor_data = (anchors, anchor_occupation, anchor_dict)
            indexing_dir, indexing_centers = gain.create_indexing(anchor_data[0],
                                                                  anchor_data[1],
                                                                  anchor_data[2],
                                                                  split_mode=split_mode)
            print(indexing_dir, indexing_centers)
            total_keys.update(indexing_dir.keys())
            center_keys.update(indexing_centers.keys())

            indexing_dirs[gain_index] = indexing_dir
            center_dirs[gain_index] = indexing_centers
            # Patch ADGRC/CELSR naming
            names[gain_index] = gain.name.replace("CadherinEGFLAGseven-passG-typereceptor", "AGRC")
            offsets[gain_index] = gain.start

        self.indexing_dirs = indexing_dirs
        self.center_dirs = center_dirs
        self.names = names
        self.length = length
        self.offsets = offsets
        self.accessions = [gain.name.split("-")[0].split("_")[0] for gain in gains]
        self.sequences = ["".join(gain.sequence) for gain in gains]
        self.total_keys = sorted(total_keys)
        self.center_keys = sorted(center_keys)

        print("Total of keys found in the dictionaries:\n", self.total_keys, self.center_keys)
        if length:
            print("First entry", self.indexing_dirs[0], self.center_dirs[0])

        header_list = ["Receptor", "Accession"] + self.total_keys + self.center_keys
        header_dict = {}
        for idx, item in enumerate(header_list):
            header_dict[item] = idx

        data_matrix = np.full([self.length, len(header_dict)], fill_value='', dtype=object)
        # Go through each of the sub-dictionaries and populate the table:
        for row in range(self.length):
            # Q5T601_Q5KU15_..._Q9H615-AGRF1_HUMAN-AGRF1-Homo_sapiens.fa
            # 0                        1           2     3
            name_parts = self.names[row].split("-")
            data_matrix[row, header_dict["Receptor"]] = name_parts[2]
            data_matrix[row, header_dict["Accession"]] = name_parts[0].split("_")[0]
            offset = self.offsets[row]
            fa_offset = self.fasta_offsets[row]

            for key in self.indexing_dirs[row].keys():
                if key == "GPS":
                    # GPS values only get the FASTA offset (no gain.start) and are written
                    # as first-last of the whole list.
                    sse = [int(x+fa_offset) for x in self.indexing_dirs[row][key]]
                    data_matrix[row, header_dict[key]] = f"{sse[0]}-{sse[-1]}"
                else:
                    sse = [int(x+offset+fa_offset) for x in self.indexing_dirs[row][key]]
                    data_matrix[row, header_dict[key]] = f"{sse[0]}-{sse[1]}"

            for key in self.center_dirs[row].keys():
                data_matrix[row, header_dict[key]] = str(self.center_dirs[row][key]+offset+fa_offset)

        # Set once after the loop so that both attributes also exist for an empty collection.
        self.data_header = ",".join(header_list)
        self.data_matrix = data_matrix

    def data2csv(self, outfile):
        '''
        Write the header and one comma-joined line per domain to outfile.
        Parameters:
            outfile : str, required
                Path of the CSV file; an existing file is overwritten.
        '''
        with open(outfile, "w") as f:
            f.write(self.data_header+"\n")
            for row in range(self.length):
                f.write(",".join(self.data_matrix[row,:])+"\n")
        print("Completed file", outfile, ".")

In [None]:
# Indexing of the human collection without additional FASTA offsets.
human_base = Indexing(human_collection, split_mode='double')

# Offsets of the domain sequences with respect to the sequences in big_seq_file.
big_seq_file = "agpcr_celsr.fasta"
fasta_offsets = gaingrn.scripts.alignment_utils.find_offsets(
    big_seq_file,
    human_base.accessions,
    human_base.sequences,
)

# Same indexing again, now shifted by the FASTA offsets.
fa_human_base = Indexing(human_collection, fasta_offsets=fasta_offsets, split_mode='double')

human_base.data2csv("default_indexed_s2_s2a_re_double_split.csv")
fa_human_base.data2csv("uniprot_indexed_s2_s2a_re_double_split.csv")

In [None]:
def offset_pdb(in_pdb, out_pdb, offset):
    '''
    Copy a PDB file and shift the residue number of every ATOM record by a fixed offset.

    Only lines starting with "ATOM" are changed: characters 22-25 (zero-based) are read as
    the residue number and replaced by the shifted number, right-justified to four
    characters. All other lines are copied unchanged. The input is read completely before
    the output is opened, so in_pdb and out_pdb may be the same path.
    Parameters:
        in_pdb : str, required
            Path of the PDB file to read.
        out_pdb : str, required
            Path of the PDB file to write; an existing file is overwritten.
        offset : int, required
            Value added to every residue number.
    Returns:
        None
    '''
    with open(in_pdb,"r") as p:
        data = p.readlines()

    last_res_id = None
    # The context manager guarantees that the output is flushed and closed.
    with open(out_pdb, "w") as out:
        for line in data:
            if line.startswith("ATOM"):
                last_res_id = int(line[22:26]) + offset
                out.write(f"{line[:22]}{str(last_res_id).rjust(4)}{line[26:]}")
            else:
                out.write(line)

    if last_res_id is None:
        # No ATOM record was found, so there is no last residue to report.
        print(f"Created PDB {out_pdb} without any ATOM records. Total offset {offset} .")
    else:
        print(f"Created PDB {out_pdb} with last residue {last_res_id}. Total offset {offset} .")

In [None]:
# Run create_indexing for every human GAIN domain with an output directory and print both results.
# To restrict the run to one receptor, guard the body with:  if "Q6QNK2" in gain.name:
for i, gain in enumerate(human_collection.collection):
    x1, x2 = gain.create_indexing(
        anchors,
        anchor_occupation,
        anchor_dict,
        outdir="/home/hildilab/projects/agpcr_nom/human_31/indexing_files_s2_dsp",
        # disabled argument: offset = fasta_offsets[i]-gain.start+1,
        split_mode='double',
    )

    print(x1, x2)

In [None]:
# For every human GAIN domain: print the GPS info and the position(s) within
# gain.alignment_indices that equal the GPS alignment column gps_minus_one.
for gain in human_collection.collection:
    gain.GPS.info()
    #for res in gain.GPS.indices:
    # np.where on a boolean array returns a tuple of index arrays; an empty array means no match.
    print(np.where(gain.alignment_indices == gps_minus_one))
    #for i in range(len(gain.alignment_indices)):
    #    print(gain.alignment_indices[i], gain.sequence[i])
    