# COLABFOLD STAGE of GAIN-GRN
In this stage, we start with the collection of GAIN domain PDBs and their sequences collected in a single large FASTA file.
The goal here is to retain only valid GAIN domains, i.e. those that have a GPS triad (or residues at the corresponding positions) and a helical subdomain A. This filtering is necessary because the N-terminal GAIN boundaries are not annotated in current databases.

**The corresponding RAW PDB files are not deposited in the ZENODO repository. Please contact www.researchgate.net/profile/Florian-Seufert-4 for the raw data.**

In [1]:
import numpy as np
import glob
import multiprocessing as mp
from subprocess import Popen, PIPE

from gaingrn.scripts.gain_classes import GainDomain, FilterCollection
import gaingrn.scripts.io

#### 1) Run STRIDE on all PDB files of best RANK (1) in the folded dataset

In [None]:

# Collect the best-ranked (rank 1) model of every folding run below the project root.
# NOTE(review): glob is called without recursive=True, so "**" behaves like a single "*"
# (exactly one directory level) — confirm this matches the layout of the output folders.
pdbs = glob.glob("/home/hildilab/projects/agpcr_nom/*output/**/*_rank_1_*.pdb")
print(f"Found {len(pdbs)} best ranked models in target directories.")

# Path of the STRIDE executable and the folder receiving one .stride file per model.
stride_bin = "/home/hildilab/lib/stride/stride"
stride_folder = "/home/hildilab/projects/agpcr_nom/all_gps_stride"
           
def compile_stride_mp_list(pdbs, stride_folder, stride_bin):
    """Build the list of STRIDE jobs, one per PDB file.

    Parameters:
        pdbs:          iterable of PDB file paths ("/"-separated)
        stride_folder: folder in which the STRIDE output files are placed
        stride_bin:    path of the STRIDE executable

    Returns:
        A list with one [pdb, out_file, stride_bin] item per PDB file, where out_file is
        "{stride_folder}/{name}.stride" and name is the part of the PDB file name that
        precedes "_unrelaxed_".
    """
    def _job_for(pdb_path):
        # File name without directories, truncated at the "_unrelaxed_" marker
        model_name = pdb_path.split("/")[-1].split("_unrelaxed_")[0]
        return [pdb_path, f"{stride_folder}/{model_name}.stride", stride_bin]

    return [_job_for(pdb_path) for pdb_path in pdbs]

def run_stride(arg):
    """Run STRIDE for one job item.

    Parameters:
        arg: a [pdb_file, out_file, stride_bin] item as built by compile_stride_mp_list()
    """
    pdb_path, stride_out, executable = arg
    # The output path is appended directly to the "-f" flag (no separating space).
    gaingrn.scripts.io.run_command(f"{executable} {pdb_path} -f{stride_out}")

def execute_stride_mp(stride_mp_list, n_threads=10):
    """Run STRIDE for all job items in a pool of worker processes.

    Parameters:
        stride_mp_list: list of [pdb_file, out_file, stride_bin] items, each passed to run_stride()
        n_threads:      number of worker processes in the pool
    """
    # The context manager releases the worker processes once the (blocking) map() call
    # has returned; previously the pool was never closed.
    with mp.Pool(n_threads) as stride_pool:
        stride_pool.map(run_stride, stride_mp_list)
    print("Completed mutithreaded creation of STRIDE files!")

# Assemble one STRIDE job per model, report the number of jobs ...
stride_mp_list = compile_stride_mp_list(pdbs=pdbs,
                                        stride_folder=stride_folder,
                                        stride_bin=stride_bin)
print(len(stride_mp_list))
# ... and process them in parallel (MP execution of STRIDE)
execute_stride_mp(stride_mp_list=stride_mp_list, n_threads=10)

#### 2) Eliminate duplicate entries (we are dealing with multiple folding runs) from the PDBs, by UniProtKB identifier.

In [None]:
# Eliminate duplicate entries (both in the original run and the added small runs)
# from the "pdbs" list, so that every accession is represented by exactly one PDB file.

stride_files = glob.glob("/home/hildilab/projects/agpcr_nom/all_gps_stride/*")
print(len(stride_files))
# Accession = file name up to the ".strid" / "_unrelaxed_" marker, cut at the first "-"
accessions = [f.split(".strid")[0].split("/")[-1].split("-")[0] for f in stride_files]
pdb_accessions = np.array([p.split("_unrelaxed_")[0].split("/")[-1].split("-")[0] for p in pdbs])

# Find duplicates in the original pdbs list and indicate them via > is_duplicate = True <
is_duplicate = np.zeros([len(pdbs)], dtype=bool)
sort_pdb_ac = np.sort(pdb_accessions)
duplicate_list = []

# After sorting, identical accessions are neighbours, so comparing each entry with its
# successor finds every accession that occurs more than once.
for pdb, next_pdb in zip(sort_pdb_ac[:-1], sort_pdb_ac[1:]):
    if pdb == next_pdb:
        duplicate_list.append(pdb)
        multi_indices = np.where(pdb == pdb_accessions)[0]
        # Flag ALL occurrences but the last one. Flagging only the first occurrence would
        # leave several entries behind for accessions that are present three or more times.
        is_duplicate[multi_indices[:-1]] = True

np_pdbs = np.array(pdbs)
singlet_pdbs = np_pdbs[~is_duplicate] # This is the reduced list with ONLY UNIQUE PDBs
print(f"Reduced the initial set of {len(pdbs)} PDB files down to {len(singlet_pdbs)} files.")

# This is a check routine if there are PDBs in the reduced list which do NOT have a STRIDE file
singlet_pdb_accessions = np.array([p.split("_unrelaxed_")[0].split("/")[-1].split("-")[0] for p in singlet_pdbs])

accession_set = set(accessions)  # constant-time membership test instead of scanning the list
counter = 0
for ac in singlet_pdb_accessions:
    if ac not in accession_set:
        print(ac)  # accession without a STRIDE file
    else:
        counter += 1
print(f"Found {counter}/{len(singlet_pdb_accessions)} accessions in the accession list.")

#### 3) Create a FilterCollection object for assessing all GAIN domain models
This is done by transforming the initial MAFFT alignment + quality + FASTA to a collection of GainDomain objects to detect their C- and N-terminal boundaries and check their validity via:
- assessing presence of a helical Subdomain A
- checking the presence of two C-terminal strands and a loop in between (either in the alignment as GPS triad or alternatively via structure segment search)

We write all valid GAIN domain model sequences to a FASTA file. This will be the basis, along with a MAFFT alignment of this, for the template search and the statistical analysis of the GAIN domain dataset.

In [None]:
# SINGLE THREADED VERSION, this is very slow
fasta_file = "/home/hildilab/projects/GPS_massif/uniprot_query/all_celsr_trunc.fa"
alignment_file = "/home/hildilab/projects/agpcr_nom/appended_big_mafft.fa" # This is a combined alignment of ALL sequences in ALL queries!
quality_file = "/home/hildilab/projects/agpcr_nom/appended_big_mafft.jal"  # ^ corresponding quality file
stride_folder = "/home/hildilab/projects/agpcr_nom/all_gps_stride" 
quality = gaingrn.scripts.io.read_quality(quality_file)                              # extract BLOSUM62-based scores from the JAL file
gps_minus_one = 21160  # 19258
aln_cutoff = 21813 # 19822
stride_files = glob.glob("/home/hildilab/projects/agpcr_nom/all_gps_stride/*")
alignment_dict = gaingrn.scripts.io.read_alignment(alignment_file, aln_cutoff)

sequences = gaingrn.scripts.io.read_multi_seq(fasta_file)
print(len(sequences))
print(len(stride_files))

# Use the aln_cutoff defined above (the one the alignment was read with) instead of a
# hard-coded 19822, which is smaller than gps_minus_one and does not match this alignment.
# The already parsed alignment is passed on, as in the multiprocessing variant.
filterCollection = FilterCollection(alignment_file,
                                   aln_cutoff = aln_cutoff,
                                   quality = quality,
                                   gps_index = gps_minus_one,
                                   stride_files = stride_files,
                                   sequences = sequences,
                                   alignment_dict = alignment_dict)

# Write the sequences of all models flagged in valid_gps to a single FASTA file
filterCollection.write_filtered(savename="../data/all_valid_gain.fa", bool_mask = filterCollection.valid_gps, write_mode='w')

In [None]:
# MULTIPROCESSING VERSION FOR SPEED UP, we collect individual valid models and cat them into a single file.
# FUNCTION BLOCK

def batch_filter_seqs(arg_item):
    """Filter one batch of sequences and models via FilterCollection (parallelizable worker).

    arg_item is a list with the following items, in this order:
        sequences      : the sequences of this batch; the batch size is arbitrary
        stride_folder  : a folder containing ALL stride files
        output_prefix  : a prefix for individual file identification
        alignment_file : the big (initial) alignment file
        quality        : the corresponding parsed quality (BLOSUM62 score)
        aln_cutoff     : alignment cutoff column, passed to FilterCollection
        gps_minus_one  : the column index of GPS-1 (zero-indexed), passed as gps_index
        alignment_dict : the parsed alignment, passed to FilterCollection

    Output: a profile plot for every model in the batch (plus a helicality plot for models
    with a subdomain) and four FASTA files, one per criterion (gain, fragments, noncons_gps,
    invalid), all written into the hard-coded profile folder. These per-batch files are
    meant to be grouped together afterwards.

    Returns None.
    """
    (sequences, stride_folder, output_prefix, alignment_file,
     quality, aln_cutoff, gps_minus_one, alignment_dict) = arg_item

    # Take the STRIDE files from the folder handed over with the batch instead of relying on
    # a module-level "stride_files" variable happening to exist in the worker process.
    stride_files = glob.glob(f"{stride_folder}/*")

    filteredBatch = FilterCollection(alignment_file,
                                   aln_cutoff = aln_cutoff,
                                   quality = quality,
                                   gps_index = gps_minus_one,
                                   stride_files = stride_files,
                                   sequences = sequences,
                                   alignment_dict = alignment_dict)
    outpath = "/home/hildilab/projects/agpcr_nom/all_gps_profiles_001" # autoproduce the GPS profiles as image here

    for Gain in filteredBatch.collection:
        if not Gain:
            # skip falsy collection entries
            continue
        Gain.plot_profile(outdir=outpath, noshow=True)
        if Gain.hasSubdomain:
            Gain.plot_helicality(coil_weight=0.01, savename=f'{outpath}/{Gain.name}.hel.png', debug=False, noshow=True)

    is_gain = np.logical_and(filteredBatch.valid_gps, filteredBatch.valid_subdomain)
    # NOTE(review): "invalid" is the complement of "gain" and therefore overlaps with the
    # "fragments" and "noncons_gps" groups — confirm that this is intended.
    masks_by_suffix = {
        "gain": is_gain,
        "fragments": np.logical_not(filteredBatch.valid_subdomain),
        "noncons_gps": np.logical_not(filteredBatch.valid_gps),
        "invalid": np.logical_not(is_gain),
    }
    # write four separate files, matching each criterion
    for suffix, mask in masks_by_suffix.items():
        filteredBatch.write_filtered(savename=f"{outpath}/{output_prefix}_{suffix}.fa",
                                     bool_mask = mask,
                                     write_mode = 'w')
    return None

def run_mp_collection(arg_list, n_threads=10):
    """Run batch_filter_seqs() on every item of arg_list in a pool of worker processes.

    Parameters:
        arg_list:  list of argument items as built by construct_arg_list()
        n_threads: number of worker processes in the pool
    """
    # The context manager releases the worker processes once the (blocking) map() call
    # has returned; previously the pool was never closed.
    with mp.Pool(n_threads) as pool:
        pool.map(batch_filter_seqs, arg_list)
    print("Completed mutithreaded filtering.")

def construct_arg_list(batch_sequence_files, 
                       output_folder,
                       stride_folder, 
                       quality, 
                       alignment_file, 
                       aln_cutoff, 
                       gps_minus_one,
                       alignment_dict = None):
    """Compile the list of argument items for the multiprocessed filtering.

    One item is created per batch sequence file. Each item is a list of the form

        [sequences, stride_folder, output_prefix, alignment_file,
         quality, aln_cutoff, gps_minus_one, alignment_dict]

    Static (identical in every item): stride_folder, alignment_file, quality, aln_cutoff,
    gps_minus_one, alignment_dict.
    Flexible (differs per item): sequences, which are read from the batch file, and
    output_prefix, which is "{output_folder}_{idx}" with the batch index zero-padded to
    three digits.

    Returns the list of items.
    """
    arg_list = []
    for idx, sequence_file in enumerate(batch_sequence_files):
        sequences = gaingrn.scripts.io.read_multi_seq(sequence_file)
        output_prefix = f"{output_folder}_{idx:03d}"
        arg_list.append([sequences,
                         stride_folder,
                         output_prefix,
                         alignment_file,
                         quality,
                         aln_cutoff,
                         gps_minus_one,
                         alignment_dict])

    print(f"[NOTE] : Compiled list of arguments for multithreaded filtering"
          f" containing {len(arg_list)} items.")
    return arg_list


def compile_fastas(prefix, out_prefix):
    """Compile the per-batch FASTA files into one large FASTA file per criterion.

    All files matching "{prefix}*fa" are gathered and assigned to the categories "gain",
    "fragments", "noncons" and "invalid" by the end of their file name. For every category
    the sequences of all matching files are written to "{out_prefix}_{category}.fa". An
    entry that has already been written for the category is reported as "doublet" and
    skipped.

    Parameters:
        prefix:     common path prefix of the per-batch FASTA files
        out_prefix: path prefix of the compiled output files
    """
    # Gather all files:
    suffixes = ["gain", 
                "fragments", 
                "noncons", 
                "invalid"]
    all_files = glob.glob(f"{prefix}*fa")
    print(len(all_files))
    for suffix in suffixes:
        # The category is looked up in the last "_"-separated token of the path.
        # batch_filter_seqs() writes the non-conserved GPS group as "*_noncons_gps.fa", whose
        # last token is "gps.fa"; the "_{suffix}_gps.fa" ending is therefore accepted as well,
        # otherwise the "noncons" category would always stay empty.
        sub_list = sorted(f for f in all_files
                          if suffix in f.split("_")[-1] or f.endswith(f"_{suffix}_gps.fa"))
        print(f"Sublist constructed for {suffix = } containing {len(sub_list)} files.")
        seen = set()  # entries already written for this category
        with open(f"{out_prefix}_{suffix}.fa", "w") as all_file:
            for fasta in sub_list:
                # read with the same reader as the rest of this file (sse_func is not defined here)
                for entry in gaingrn.scripts.io.read_multi_seq(fasta):
                    key = tuple(entry)
                    if key in seen:
                        print(entry[0], "doublet")
                        continue
                    seen.add(key)
                    all_file.write(f">{entry[0]}\n{entry[1]}\n")

In [None]:
# RUN THE MULTIPROCESSED VARIANT OF FILTER_COLLECTION

# Batch FASTA files to be filtered and the name stem of the per-batch output files
batch_sequence_files = glob.glob("/home/hildilab/projects/agpcr_nom/*output/batch_*.fa")
print(len(batch_sequence_files))
output_folder = "app_gain_domains_001"

# Static inputs shared by all batches
alignment_file = "/home/hildilab/projects/agpcr_nom/appended_big_mafft.fa" # This is a combined alignment of ALL sequences in ALL queries!
quality_file = "/home/hildilab/projects/agpcr_nom/appended_big_mafft.jal"  # ^ corresponding quality file
stride_folder = "/home/hildilab/projects/agpcr_nom/all_gps_stride" 
gps_minus_one = 21160  # 19258
aln_cutoff = 21813 # 19822

# Parse quality values and alignment once; they are handed to every batch
quality = gaingrn.scripts.io.read_quality(quality_file)
stride_files = glob.glob("/home/hildilab/projects/agpcr_nom/all_gps_stride/*")
alignment_dict = gaingrn.scripts.io.read_alignment(alignment_file, aln_cutoff)
print(len(stride_files))
print(len(batch_sequence_files))

# One argument item per batch file, processed by 16 worker processes
arg_list = construct_arg_list(batch_sequence_files=batch_sequence_files,
                              output_folder=output_folder,
                              stride_folder=stride_folder,
                              quality=quality,
                              alignment_file=alignment_file,
                              aln_cutoff=aln_cutoff,
                              gps_minus_one=gps_minus_one,
                              alignment_dict=alignment_dict)

run_mp_collection(arg_list=arg_list, n_threads=16)


In [None]:
# Merge the per-batch FASTA files into one file per criterion.
# NOTE(review): batch_filter_seqs() writes its files to ".../all_gps_profiles_001", whereas
# this prefix points into ".../all_gps_profiles" — confirm that the directory is the intended one.
compile_fastas(
    prefix="/home/hildilab/projects/agpcr_nom/all_gps_profiles/app_gain_domains",
    out_prefix="/home/hildilab/projects/agpcr_nom/app_gain",
)

#### We proceed with the compiled FASTA file containing only VALID GAIN domain sequences and their respective accessions.