In [None]:
# INPUT: a collection of GAIN domain PDBs, their sequences as one large ".fa" file
from gain_classes import GainDomain, GainCollection, Anchors, GPS
import sse_func
import execute
import numpy as np
import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from shutil import copyfile
import math
import re

# --- Input data of the base GAIN dataset -------------------------------------
# NOTE(review): all paths are absolute and machine-specific; adjust them before
# running this notebook on another machine.
# Sequences of the GAIN domains, read from the single large ".fa" file.
# NOTE(review): `valid_seqs` is not referenced again in the visible cells.
valid_seqs = sse_func.read_multi_seq("/home/hildilab/projects/agpcr_nom/app_gain_gain.fa")
# Quality annotation belonging to the MAFFT alignment (parsed below via sse_func.read_quality).
quality_file = "/home/hildilab/projects/agpcr_nom/app_gain_gain.mafft.jal"
# The MAFFT alignment of the base dataset.
alignment_file = "/home/hildilab/projects/agpcr_nom/app_gain_gain.mafft.fa"
# NOTE(review): `stride_files` is not referenced again in the visible cells.
stride_files = glob.glob("/home/hildilab/projects/agpcr_nom/sigmas/sigma_2/*")
# This only contains the sigma files for truncated (?) PDBs.
quality = sse_func.read_quality(quality_file)
# Alignment column of the GPS-1 residue (N-terminal residue before the cleavage site).
gps_minus_one = 6781 
# Cutoff handed to the alignment reader — presumably the last alignment column taken into account; TODO confirm.
aln_cutoff = 6826 
alignment_dict = sse_func.read_alignment(alignment_file, aln_cutoff)

# Pre-calculated anchor data.
# Original (unadjusted) set, kept for reference only. It still contains the
# S7 anchor @ 5341 (occupation 4562.0) that is removed from the adjusted lists below:
# anchors = [ 662, 1194, 1912, 2490, 2848, 3011, 3073, 3260, 3455, 3607, 3998, 4279, 4850, 5339, 5341, 5413, 5813, 6337, 6659, 6696, 6765, 6808 ]
# anchor_occupation = [ 4594.0,  6539.0, 11392.0, 13658.0,  8862.0,  5092.0,  3228.0, 14189.0,
#                       9413.0, 12760.0,  9420.0, 11201.0, 12283.0,  3676.0,  4562.0, 13992.0,
#                       12575.0, 13999.0, 14051.0, 14353.0,  9760.0, 14215.0 ]

# Subdomain boundary column in the GAIN alignment.
sdb = 3425

# ADJUSTED ANCHORS!
# Alignment columns of the anchor residues.
anchors = [ 662, 1194, 1912, 2490, 2848, 3011, 3073, 3260, 3455, 3607, 3998, 4279, 4850, 5339,
 5413, 5813, 6337, 6659, 6696, 6765, 6808] # removed S7 @  5341
# Occupation value of each anchor column, in the same order as `anchors`.
anchor_occupation = [ 4594.,  6539., 11392., 13658.,  8862., 5092.,  3228., 14189.,  9413., 12760.,
  9420., 11201., 12283.,  3676.,   13992., 12575., 13999., 14051., 14353., 9760., 14215.] # removed S7 @ 4562.,

# The two lists are edited by hand and must stay parallel; fail early if an
# entry was removed from only one of them.
assert len(anchors) == len(anchor_occupation), \
    f"anchors ({len(anchors)}) and anchor_occupation ({len(anchor_occupation)}) must have the same length"

anchor_dict = sse_func.make_anchor_dict(anchors, sdb)
print(anchor_dict)

This section integrates invertebrate receptors into the existing GAIN GRN. These receptors (and other low-homology GAIN-containing proteins) have homology too low to be handled by MAFFT; we therefore need a separate workflow for their integration.

In [None]:
def read_pw_alignment(alnfile):
    """Read a pairwise *.aln file (as written by AlignMe) into a dict of aligned strings.

    The file is expected to consist of a header line followed by one or more
    blocks of ``<sequence name> <aligned block>`` rows. The blocks of each
    sequence are concatenated in file order.

    Parameters:
        alnfile: path to the alignment file.

    Returns:
        dict mapping each sequence name to its full aligned string, e.g.
        ``{"template": "XX-XX", "target": "XXXX-"}``. Empty if the file holds
        no sequence rows.

    Notes:
        - Lines with 4 or fewer non-blank characters are dropped.
        - The first remaining line is treated as the header and skipped.
        - Rows made only of Clustal consensus symbols (``*``, ``:``, ``.``)
          are ignored.
        - Only the first two whitespace-separated fields of a row are used, so
          a trailing residue count does not break parsing.
    """
    with open(alnfile) as f:
        data = [l.strip() for l in f if len(l.strip()) > 4]
    print(f"[DEBUG] {len(data) = }")

    consensus_chars = set("*:. ")
    seq_dict = {}
    for l in data[1:]:  # omit header.
        # A consensus row carries no sequence name; it must not be parsed as a sequence.
        if not set(l) - consensus_chars:
            continue

        fields = l.split()
        if len(fields) < 2:
            # Name without an aligned block (or stray text): nothing to append.
            continue
        seq_name, string_block = fields[0], fields[1]

        seq_dict[seq_name] = seq_dict.get(seq_name, "") + string_block

    return seq_dict

def map_pw_alignment(aln_dict, template_seq, target_seq, template_id):
    """Map every residue of the target sequence onto the template residue it is aligned with.

    Parameters:
        aln_dict: dict of aligned strings as returned by read_pw_alignment().
            The first key containing `template_id` is used as the template
            row, the first key NOT containing it as the target row.
        template_seq: unused; kept so existing callers keep working.
        target_seq: ungapped target sequence.
        template_id: substring identifying the template row in `aln_dict`.

    Returns:
        1-D float array of length ``len(target_seq)``. Entry ``i`` holds the
        zero-based template residue number aligned with target residue ``i``.
        It is NaN when the target residue faces a gap in the template, when
        its character does not match the alignment row, or when the alignment
        ends before the target sequence does.

    Raises:
        IndexError: if no template row or no target row is found in `aln_dict`.
    """
    template_keys = [k for k in aln_dict if template_id in k]
    target_keys = [k for k in aln_dict if template_id not in k]
    if not template_keys or not target_keys:
        raise IndexError(
            f"Need one alignment row containing '{template_id}' and one without it; found rows: {list(aln_dict)}"
        )
    template_aln_string = aln_dict[template_keys[0]]
    target_aln_string = aln_dict[target_keys[0]]

    # NaN marks "no template residue assigned".
    mapper = np.full(len(target_seq), np.nan)

    # Transform the template row into template residue numbers per alignment column
    # (NaN for gap columns).
    template_num = np.full(len(template_aln_string), np.nan)
    resnum = 0
    for idx, char in enumerate(template_aln_string):
        if char == "-":
            continue
        template_num[idx] = resnum
        resnum += 1

    # Only columns present in both rows can be mapped.
    n_cols = min(len(target_aln_string), len(template_aln_string))

    # Walk the target sequence and the aligned target row in parallel.
    current_col = 0
    for idx, char in enumerate(target_seq):

        while current_col < n_cols and target_aln_string[current_col] == "-":   # Skip "-" characters
            current_col += 1

        if current_col >= n_cols:
            # The alignment holds fewer residues than target_seq: leave the rest unmapped
            # instead of running past the end of the string.
            print(f"[WARNING] map_pw_alignment : alignment exhausted after {idx} of {len(target_seq)} target residues; "
                  "remaining residues stay unmapped.")
            break

        if target_aln_string[current_col] == char:
            mapper[idx] = template_num[current_col] # Write in the residue number from the mapped template_num
        else:
            print("Character MISMATCH!", target_aln_string[current_col],"vs.", char)
        current_col += 1
    return mapper

ADGRD1 is our reference Domain. Any Anchors present in this domain will be assigned to profile-matched GAINs.

In [None]:
# Reference domain (human ADGRD1): its name, one-letter sequence and STRIDE output file.
name = 'Q6QNK2_B2CKK9_B7ZLF7_Q2M1L3_Q6ZMQ1_Q7Z7M2_Q86SM4-AGRD1_HUMAN-AGRD1-Homo_sapiens' 
seq = 'TPDEIAMYFTAAIGKHALLSSTLPSLFMTSTASPVMPTDAYHPIITNLTEERKTFQSPGVILSYLQNVSLSLPSKSLSEQTALNLTKTFLKAVGEILLLPGWIALSEDSAVVLSLIDTIDTVMGHVSSNLHGSTPQVTVEGSSAMAEFSVAKILPKTVNSSHYRFPAHGQSFIQIPHEAFHRHAWSTVVGLLYHSMHYYLNNIWPAHTKIAEAMHHQDCLLFATSHLISLEVSPPPTLSQNLSGSPLITVHLKHRLTRKQHSEATNSSNRVFVYCAFLDFSSGEGVWSNHGCALTRGNLTYSVCRCTHLTNFAILMQVVPL'
# NOTE(review): absolute, machine-specific path.
d1_stride = "/home/hildilab/projects/agpcr_nom/all_gps_stride/Q6QNK2_B2CKK9_B7ZLF7_Q2M1L3_Q6ZMQ1_Q7Z7M2_Q86SM4-AGRD1_HUMAN-AGRD1-Homo_sapiens.stride"
# Build the reference GainDomain against the base alignment. Its
# `alignment_indices` are used further down to translate template residue
# numbers into alignment columns for the pairwise-aligned targets.
d1_gain = GainDomain(alignment_file = alignment_file,
                                        aln_cutoff = aln_cutoff,
                                        quality = quality,
                                        gps_index = gps_minus_one,
                                        name = name,
                                        sequence = seq,
                                        alignment_dict = alignment_dict,
                                        explicit_stride_file=d1_stride,
                                        is_truncated = True,
                                        stride_outlier_mode = True,
                                        without_anchors=False)
#d1_gain.plot_helicality()
# Quick check of the detected domain boundaries.
print(d1_gain.subdomain_boundary, d1_gain.start, d1_gain.end)


In [None]:
# Worked example on a single invertebrate target: read its pairwise alignment
# against the ADGRD1 template and translate it into columns of the base alignment.
invertebrate_proteins = [f for f in glob.glob("/home/hildilab/projects/GPS_massif/invert_truncs/alignme/aln/*") if "zip" not in f]

xdir = invertebrate_proteins[0]

alnfile = glob.glob(f"{xdir}/*aln")[0]
# Several files may start with "fasta2"; take the one whose path contains no ".",
# the same selection the batch cell further down uses. glob returns files in
# arbitrary order, so a plain [0] could pick a derived file instead.
target_seq_file = [f for f in glob.glob(f"{xdir}/fasta2*") if "." not in f][0]
target_name, target_seq = sse_func.read_seq(target_seq_file, return_name=True)
print(target_seq_file)
print(target_name, target_seq)
aln_dict = read_pw_alignment(alnfile)
mapper = map_pw_alignment(aln_dict, template_seq=seq, target_seq=target_seq, template_id="Q6QNK2")
# this mapper is zero_indexed.
#print(mapper)

# Build the target alignment columns from mapping the pairwise matrix onto the respective template alignment indices
# (unmapped residues, i.e. NaN entries, are left out here).
tar_aln_cols = []
for val in mapper:
    if not math.isnan(val):
        tar_aln_cols.append(d1_gain.alignment_indices[int(val)])

#print(tar_aln_cols)


With the extracted alignment columns, we can subclass the GAIN domain class (`GainDomain`), patch in the newly generated alignment indices, and generate the indexing.

In [None]:
class PrealignedGain(GainDomain):
    """GainDomain whose alignment indices are handed in instead of being looked up.

    The initializer is overridden completely (the parent initializer is not
    called) so that the alignment-index generation is skipped. The caller
    supplies `alignment_indices` for the full input sequence, and they are cut
    down to the detected domain here.

    After construction, check `isValid` (and `hasSubdomain`) before using the
    object. Construction stops early, with `isValid = False`, when no name can
    be derived, no strand is found, the sequence is shorter than the detected
    boundaries, or no Subdomain A helices are available.

    Parameters:
        alignment_file, aln_cutoff, alignment_dict, truncation_map, aln_start_res:
            accepted for signature compatibility with the calling code; not
            used by this initializer.
        quality: quality signal per alignment column (any list-like signal).
        gps_index: alignment column of the GPS-1 residue.
        alignment_indices: alignment column of every residue of the input
            sequence; sliced as ``[start:end+1]``.
        fasta_file: used for the name (if `name` is None) and, when
            `is_truncated` is False, for reading the sequence.
        name: name of the domain.
        sequence: one-letter sequence. Required when `is_truncated` is True.
        subdomain_bracket_size, domain_threshold, coil_weight:
            forwarded to sse_func.find_boundaries().
        explicit_stride_file: STRIDE file to read the SSE data from.
        is_truncated: whether `sequence` already is the truncated domain sequence.
        skip_naming: skip building `sse_name_map`.
        stride_outlier_mode: forwarded to sse_func.get_subdomain_sse().
    """
    # Overwrite the init function to incorporate the skipping of alignment-index generation
    def __init__(self, 
                 alignment_file, 
                 aln_cutoff,
                 quality,
                 gps_index, 
                 alignment_indices,
                 alignment_dict=None,
                 fasta_file=None,
                 name=None,
                 sequence=None,
                 subdomain_bracket_size=20,
                 domain_threshold=20,
                 coil_weight=0,
                 explicit_stride_file=None,
                 #without_anchors=False,
                 is_truncated=True, # False
                 skip_naming=False,
                 stride_outlier_mode=False,
                 truncation_map=None,
                 aln_start_res=None):   
        
        # Initialize self.name for finding the corresponding alignment row!
        if name is not None:
            self.name = name
        elif fasta_file:
            self.name = fasta_file.split("/")[-1] # This is how the name would be in the pre-calculated alignment
        else:
            print("No name specified. Exiting")
            # Flag the half-built object so that callers testing `isValid`
            # get a clean False instead of an AttributeError.
            self.isValid = False
            self.hasSubdomain = False
            return

        # Initialize SSE Dict (LOC) and the SSE sequence (ASG) from STRIDE outfiles.
        # Either from the standard folder (base dataset) or from an explicitly stated STRIDE file. (new GAIN)
        if explicit_stride_file:
            # Read directly from the explicitly stated STRIDE file. (new GAIN)
            self.complete_sse_dict = sse_func.read_sse_loc(explicit_stride_file) 
            self.sse_sequence = sse_func.read_sse_asg(explicit_stride_file)
        else:
            # Find SSE data in corresponding STRIDE files (base data). Extract corresponding STRIDE files from the list and read SSE from that. (base dataset)
            self.complete_sse_dict = sse_func.find_stride_file(self.name.replace(".fa","")) # previously: fasta_file.split("/")[-1][:-3]
            self.sse_sequence = sse_func.read_sse_asg(self.name.replace(".fa",""))

        # Try to detect GAIN-like order of SSE. First criterion is a C-terminal strand being present (= Stachel/TA)
        try: 
            self.end = self.complete_sse_dict['Strand'][-1][1]
            self.isValid = True
        except (KeyError, IndexError, TypeError):
            # No 'Strand' entry, an empty strand list, or no SSE dict at all.
            print("No Strands detected. This is not a GAIN domain.")
            self.isValid = False
            self.hasSubdomain = False
            return      

        # Find the domain boundaries (this includes a check whether the sequence is in fact a GAIN)
        # Will return (None, None) if checks fail. 
        self.start, self.subdomain_boundary = sse_func.find_boundaries(self.complete_sse_dict, 
                                                                       self.end, 
                                                                       bracket_size=subdomain_bracket_size, 
                                                                       domain_threshold=domain_threshold,
                                                                       coil_weight=coil_weight)
        if self.start is not None:
            self.hasSubdomain = True
        else:
            print("No Subdomain boundaries detected. Possible Fragment found.")
            self.hasSubdomain = False
            # For possible Fragment detection (i.e. Subdomain B only sequences), set start as the N-terminal res. of the first beta sheet    
            self.start = np.amin(np.array(self.complete_sse_dict["Strand"]))
        
        # Initialize residue indices as list, starting from zero, indexing EXISTING residues including "X" etc.
        self.index = list(range(0, self.end-self.start+1))
        # Initialize one-letter GAIN sequence as list
        #print(f"[DEBUG] gain_classes.GainDomain :\n\t{self.start = }\n\t{self.end = }\n\t{len(sequence) = }\n\t{self.end-self.start+1 = }")
        if is_truncated:
            # SANITY CHECK: There might occur a case where the GAIN domain is detected anew (i.e. when different parameters are used). There might be a truncation therefore.
            #               If that is the case, truncate the sequence N-terminally so that only ((self.end-self.start+1)) residues are included
            if len(sequence) > (self.end-self.start+1):
                print(f"[DEBUG] gain_classes.GainDomain : {self.name}\nDETECTED ALTERED GAIN DOMAIN DETECTION. TRUNCATING @ RESIDUE : {len(sequence)-self.end+self.start}"
                    f"\n\t{self.start = }\t{self.end = }\n\t{len(sequence) = }\n\t{self.end-self.start+1 = }")
                self.sequence = np.asarray(list(sequence[len(sequence)-self.end+self.start-1:])) # Begin with the new first residue, end normally
                print(f"[DEBUG]: gain_classes.GainDomain : \n\t {len(sequence) = }, {len(self.sequence) = }\n{sequence}\n{''.join(self.sequence)}")

            elif len(sequence) < (self.end-self.start+1): 
                # This is an edge case where the signal detection identifies a Sheet-segment in Subdomain A. Therefore, non-cons. GAIN domain.
                # The message reports the detected domain length (end-start+1), the quantity the sequence length is compared against.
                print(f"[DEBUG] gain_classes.GainDomain : {self.name}\nSEQUENCE LENGTH SHORTER THAN DETECTED GAIN BOUNDARIES.!\n"
                    f"IT WILL BE DECLARED INVALID.\n{len(sequence) =}\n{self.end-self.start+1 = }")
                self.sse_dict = sse_func.cut_sse_dict(self.start, self.end, self.complete_sse_dict)
                print(f"[DEBUG] gain_classes.GainDomain.__init__():\n {self.subdomain_boundary = }, {type(self.subdomain_boundary) = }")
                if self.subdomain_boundary is None :
                    self.subdomain_boundary = 0
                self.plot_helicality(savename=f"{self.name}_SEQSHORT_SKIP.png")
                self.isValid = False
                self.hasSubdomain = False
                return

            else:
                self.sequence = np.asarray(list(sequence))
        if sequence and not is_truncated: 
            self.sequence = np.asarray(list(sequence[self.start:self.end+1]))
        if fasta_file and not is_truncated:
            self.sequence = np.asarray(list(sse_func.read_seq(fasta_file)))[self.start:self.end+1]

        # Here the alignment indices get overridden by the input. Since these incorporate the full sequence, truncate between start and end.
        # NOTE(review): the truncated sequence above keeps the LAST (end-start+1) residues, while this
        # slice starts at `start`; both agree only if len(sequence) == end+1 — confirm for new inputs.
        self.alignment_indices = alignment_indices[self.start:self.end+1]
        print(f"[DEBUG] {len(self.alignment_indices) = }")
        # Cut down the SSE dictionary down to the GAIN only
        self.sse_dict = sse_func.cut_sse_dict(self.start, self.end, self.complete_sse_dict)

        # get a name map based on enumerating the SSE segments,
        # THIS IS NOT THE ACTUAL NOMENCLATURE BUT A SELF-CONSISTENT METHOD FOR OVERVIEW PURPOSES
        self.sse_name_map = None
        if not skip_naming:
            self.sse_name_map = sse_func.name_sse(self.sse_dict, 
                                              self.subdomain_boundary, 
                                              self.start, 
                                              self.end,
                                              self.sse_sequence)
        
        # Find the GPS residues (triad) based on the alignment column of gps-minus-one (GPS-1 N-terminal residue before cleavage site)
        self.GPS = GPS(self.alignment_indices, 
                       self.sse_dict, 
                       self.index, 
                       self.sequence, 
                       self.start,
                       gps_minus_one=gps_index)
        
        # parse the Quality from the input quality LIST, not the quality file
        # The input as a list is deliberate to make the quality parameter more flexible,
        # You could input any kind of quality signal here
        self.residue_quality = sse_func.get_quality(self.alignment_indices, quality)

        if self.hasSubdomain:
            # enumeration + evaluation of subdomain SSE composition
            alpha, beta, a_breaks, b_breaks = sse_func.get_subdomain_sse(self.sse_dict, 
                                                                         self.subdomain_boundary, 
                                                                         self.start, 
                                                                         self.end,
                                                                         self.sse_sequence,
                                                                         stride_outlier_mode=stride_outlier_mode)
            # Shift the segment boundaries so that they are relative to the domain start.
            self.sda_helices = np.subtract(alpha, self.start)
            #print(f"[DEBUG] gain_classes.GainDomain : {alpha = } ,{self.sda_helices = }")
            self.sdb_sheets = np.subtract(beta, self.start)
            self.a_breaks = a_breaks
            self.b_breaks = b_breaks
            #print(f"{a_breaks = }, \n {self.a_breaks = }")
        # Gather the respective anchors for this GainDomain
        if not hasattr(self, 'sda_helices'):
            # Only reached when no subdomain was detected above.
            if self.subdomain_boundary is None :
                self.subdomain_boundary = 0
            self.plot_helicality(savename=f"{self.name}_NO_HELICES.png")
            print(f"[ERROR] gain_classes.__init()__ : NO SDA HELICES DETECTED\n{self.name}")
            self.isValid = False 
            self.hasSubdomain = False
            return
        print("This Domain has notations for:\nHelices:\t",len(self.sda_helices),"\nSheets:\t", len(self.sdb_sheets),"\nLength:\t",len(self.sequence))
        print(self.residue_quality[-30:], len(self.residue_quality))
        
        self.Anchors = PrealignedAnchors(self)

class PrealignedAnchors(Anchors):
    """Anchor residues of a pre-aligned GAIN domain.

    For every SSE segment of the domain (Subdomain A helices first, then
    Subdomain B sheets), the residue with the highest quality value inside the
    segment is taken as that segment's anchor. The parent initializer is not called.

    Parameters:
        a_gain_domain: domain object providing `sda_helices`, `sdb_sheets`
            (rows of segment start/end indices), `residue_quality`,
            `alignment_indices` and `sequence`.

    Attributes (one entry per segment, in segment order):
        sse_names: always empty here (the naming step is disabled).
        quality_values: quality value of the anchor residue.
        alignment_indices: alignment column of the anchor residue.
        gain_residues: residue letter of the anchor residue.
        relative_positions: index of the anchor residue within the domain.
        count: number of segments.
    """
    def __init__(self, a_gain_domain):
        sse_names = []
        quality_values = []
        alignment_indices = []
        gain_residues = []
        relative_positions = []
        # Mush together the smoothened SSE from both subdomains.
        # Force both tables into integer (n, 2) shape first: a subdomain without any
        # segment yields an empty 1-D array, which cannot be concatenated with an (n, 2) one.
        helices = np.asarray(a_gain_domain.sda_helices, dtype=int).reshape(-1, 2)
        sheets = np.asarray(a_gain_domain.sdb_sheets, dtype=int).reshape(-1, 2)
        all_sse = np.concatenate((helices, sheets), axis=0)

        # print(f"{a_gain.domain.sdb_sheets = }")
        #print(f"[DEBUG] gain_classes.Anchors : {a_gain_domain.residue_quality}, \n {all_sse = } ")

        # Get the residue within each SSE of the highest value of the quality metric
        for i in range(all_sse.shape[0]):
            print(f"[DEBUG] gain_classes.Anchors :\n\t{i = }\n\t{all_sse[i,:] = }\n\t{a_gain_domain.residue_quality[all_sse[i,0]:all_sse[i,1]] = }")
            seg_start = all_sse[i,0]
            # A segment with start == end would give an empty slice (argmax fails on it);
            # always look at least at the start residue itself.
            seg_stop = max(all_sse[i,1], seg_start + 1)
            best_index = seg_start + np.argmax(a_gain_domain.residue_quality[seg_start:seg_stop])

            # For which (self-consistently enumerated) SSE is this
            #sse_names.append(a_gain_domain.sse_name_map[best_index])

            # What is the associated quality value
            quality_values.append(a_gain_domain.residue_quality[best_index])   
            # In which alignment column is it
            alignment_indices.append(a_gain_domain.alignment_indices[best_index]) 
            # What is the residue name
            gain_residues.append(a_gain_domain.sequence[best_index])
            # Domain-relative index of the best-quality residue of this SSE
            relative_positions.append(best_index)

        # Feed into class attributes
        self.sse_names = np.array(sse_names)
        self.quality_values = np.array(quality_values)
        self.alignment_indices = np.array(alignment_indices)
        self.gain_residues = np.array(gain_residues)
        self.relative_positions = np.array(relative_positions)
        self.count = all_sse.shape[0]

With the overridden initializing function (`__init__`) of PrealignedGain, create this domain for each invertebrate sequence and subsequently create the indexing, if possible.

In [None]:
# One directory per invertebrate target (pairwise AlignMe alignment against the template),
# plus the STRIDE files of the corresponding models.
invertebrate_proteins = [f for f in glob.glob("/home/hildilab/projects/GPS_massif/invert_truncs/alignme/aln/*") if "zip" not in f]
invertebrate_strides = glob.glob("/home/hildilab/projects/GPS_massif/invert_truncs/*/best_model.stride")

for xdir in invertebrate_proteins: # This is so far based on directory names.
    #xdir = invertebrate_proteins[0]

    # Resolve the input files first; an incomplete directory is skipped with a
    # message instead of aborting the whole batch with an IndexError.
    aln_files = glob.glob(f"{xdir}/*aln")
    target_seq_files = [f for f in glob.glob(f"{xdir}/fasta2*") if "." not in f]
    if not aln_files or not target_seq_files:
        print(f"[WARNING] Skipping {xdir}: no alignment (*aln) or no target sequence (fasta2) file found.")
        continue
    alnfile = aln_files[0]
    target_seq_file = target_seq_files[0]

    target_name, target_seq = sse_func.read_seq(target_seq_file, return_name=True)
    #print(glob.glob(f"{xdir}/fasta2*")[0])
    print(target_name, target_seq)
    aln_dict = read_pw_alignment(alnfile)
    mapper = map_pw_alignment(aln_dict, template_seq=seq, target_seq=target_seq, template_id="Q6QNK2")
    # this mapper is zero_indexed.
    print(f"[DEBUG] {len(mapper) = }")

    # Build the target alignment columns from mapping the pairwise matrix onto the respective template alignment indices.
    # Residues without a template partner (NaN in mapper) keep the initial value 0.
    tar_aln_cols = np.zeros((len(mapper)), dtype=int)
    for i,val in enumerate(mapper):
        if not math.isnan(val):
            tar_aln_cols[i]=d1_gain.alignment_indices[int(val)]
    #print(np.unique(tar_aln_cols[~np.isnan(mapper)]))

    # Find the corresponding stride file with the identifier
    # (the part of the target name before the first "-" or "_").
    identifier = re.split(r'[-_]', target_name)[0]
    print("identifier:", identifier)
    matching_strides = [s for s in invertebrate_strides if identifier in s]
    if not matching_strides:
        print(f"[WARNING] Skipping {target_name}: no STRIDE file found for identifier '{identifier}'.")
        continue
    target_stride_file = matching_strides[0]

    gain = PrealignedGain(alignment_file=alignment_file, 
                    aln_cutoff=aln_cutoff,
                    quality=quality,
                    gps_index=gps_minus_one, 
                    alignment_indices=tar_aln_cols,
                    name=target_name,
                    sequence=target_seq,
                    explicit_stride_file=target_stride_file
                    )
    # The initializer can bail out early; treat a missing flag like an invalid domain.
    if getattr(gain, "isValid", False):
        gain.create_indexing(anchors, anchor_occupation, anchor_dict, outdir="/home/hildilab/projects/GPS_massif/invert_truncs/indexing")
    #gain.plot_profile(outdir='/home/hildilab/projects/GPS_massif/invert_truncs/profiles')
    #gain.plot_helicality(savename=f'/home/hildilab/projects/GPS_massif/invert_truncs/profiles/{gain.name}.png')

In [None]:
# Position labels of the form "<element>.<number>", helices (H) first, then strands (S).
# NOTE(review): presumably GAIN GRN labels; the enrichment criterion behind
# this selection is not shown in this notebook — confirm the source.
enriched_positions = [
    # helices, one element per row
    'H1.50', 'H1.54', 'H1.57', 'H1.61',
    'H2.56', 'H2.57', 'H2.60', 'H2.61',
    'H3.36', 'H3.43', 'H3.44', 'H3.51', 'H3.53', 'H3.56',
    'H4.38', 'H4.41', 'H4.51',
    'H5.37', 'H5.38', 'H5.42', 'H5.44', 'H5.48', 'H5.50', 'H5.59',
    'H6.42', 'H6.54', 'H6.56',
    'H7.40', 'H7.51',
    'H8.46', 'H8.58', 'H8.60',
    # strands, one element per row
    'S1.48',
    'S2.47', 'S2.51', 'S2.53', 'S2.58',
    'S3.53', 'S3.55',
    'S4.56',
    'S5.48', 'S5.52', 'S5.55',
    'S6.50',
    'S7.45', 'S7.52', 'S7.57',
    'S9.53',
    'S10.50',
    'S11.54', 'S11.55',
    'S12.47', 'S12.50', 'S12.52',
    'S13.48', 'S13.50',
]

In [None]:
# Show all alignment directories collected for the invertebrate targets.
print(invertebrate_proteins)
# Alignment directory of one specific target, stored under the name `dcirl`.
# NOTE(review): not used in the visible part of the notebook; absolute, machine-specific path.
dcirl = '/home/hildilab/projects/GPS_massif/invert_truncs/alignme/aln/e1jh11'