# PKD_GAIN_PROCESSING
A Jupyter notebook for assigning the created GAIN-GRN to the set of PKD1/PKD1L1 protein structures that have been generated and are partially shown in Figure 2 and Supp. Fig. 3. For this, we generated another 2824 PKD1/PKD1L1 GAIN domain models and ran STRIDE on them. We proceed analogously to aGPCR GAIN domains, skipping the template stage and going directly from validation to assigning the indexing.

In [1]:
# build a database from a dataset for nomenclating
# INPUT: a collection of GAIN domain PDBs, their sequences as one large ".fa" file
import multiprocessing as mp
from subprocess import Popen, PIPE
import glob
from shutil import copyfile
import pickle
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import (MultipleLocator, FixedLocator)
#import logomaker
# LOCAL IMPORTS
import gaingrn.utils.structure_utils
from gaingrn.utils.indexing_classes import GPCRDBIndexing
from gaingrn.utils.gain_classes import GainDomain, GainCollection, GainDomainNoAln, FilterCollection
import gaingrn.utils.io

#### 1) Filter the folded PKD proteins analogously to **colabfold_stage.ipynb**
- run **STRIDE** on all models

In [None]:
# Collect the "rank_1" model PDB of every target in the PKD model directory.
pkd_pdb_dir = "/home/hildilab/agpcr_nom/pkd_pdbs"
pdbs = glob.glob(f"{pkd_pdb_dir}/*_rank_1_*.pdb")
print(f"Found {len(pdbs)} best ranked models in target directories.")

# Destination folder for the generated STRIDE files and path to the STRIDE executable.
stride_folder = f"/home/hildilab/agpcr_nom/pkds_stride"
stride_bin = "/home/hildilab/lib/stride/stride"
           
def compile_stride_mp_list(pdbs, stride_folder,stride_bin):
    """Build the per-model argument lists consumed by the STRIDE worker.

    For every PDB path, the output name is the file name up to (excluding)
    the "_unrelaxed_" marker, with ".stride" appended, placed in
    ``stride_folder``.

    Parameters:
        pdbs: iterable of PDB file paths (using "/" as separator).
        stride_folder: directory the STRIDE output files are written to.
        stride_bin: path of the STRIDE executable.

    Returns:
        list of ``[pdb_path, stride_output_path, stride_bin]`` lists, one per
        entry in ``pdbs`` and in the same order.
    """
    def _stride_target(pdb_path):
        # Strip the directory part, then everything from "_unrelaxed_" onwards.
        file_name = pdb_path.rsplit("/", 1)[-1]
        stem = file_name.partition("_unrelaxed_")[0]
        return f"{stride_folder}/{stem}.stride"

    return [[pdb_path, _stride_target(pdb_path), stride_bin] for pdb_path in pdbs]

def run_stride(arg):
    """Run STRIDE for a single model.

    Parameters:
        arg: a three-item sequence ``(pdb_file, out_file, stride_bin)``; it is
            passed as one object so the function can be used with ``Pool.map``.

    The command line is handed to ``gaingrn.utils.io.run_command``; nothing is
    returned.
    """
    model_path, result_path, executable = arg
    gaingrn.utils.io.run_command(f"{executable} {model_path} -f{result_path}")

def execute_stride_mp(stride_mp_list, n_threads=10):
    """Run STRIDE on all queued models using a pool of worker processes.

    Parameters:
        stride_mp_list: list of ``[pdb_file, out_file, stride_bin]`` entries,
            each passed unchanged to ``run_stride``.
        n_threads: number of worker processes in the pool.

    The call blocks until every entry has been processed.
    """
    # The context manager tears the pool down once map() has returned, so the
    # worker processes do not linger for the rest of the notebook session.
    with mp.Pool(n_threads) as stride_pool:
        stride_pool.map(run_stride, stride_mp_list)
    print("Completed mutithreaded creation of STRIDE files!")
    
# Build one [pdb, out_file, stride_bin] job per model and report the job count.
stride_mp_list = compile_stride_mp_list(pdbs, stride_folder, stride_bin)
print(len(stride_mp_list))
# MP execution of STRIDE
execute_stride_mp(stride_mp_list, n_threads=3)

In [None]:
# Eliminate double entries (both in the original run and the added small runs)
# Form the "pdbs" list

stride_files = glob.glob(f"{stride_folder}/*")
print(len(stride_files))
# Accession = file name up to the first "-", with the STRIDE / "_unrelaxed_" suffix removed.
accessions = [f.split(".strid")[0].split("/")[-1].split("-")[0] for f in stride_files]
pdb_accessions = np.array([p.split("_unrelaxed_")[0].split("/")[-1].split("-")[0] for p in pdbs])

# Find duplicates in the original pdbs list and indicate them via > is_duplicate = True <
is_duplicate = np.zeros([len(pdbs)], dtype=bool)
sort_pdb_ac = np.sort(pdb_accessions)
duplicate_list = []
# Equal accessions are adjacent after sorting, so comparing neighbours finds every repeat.
for i, pdb in enumerate(sort_pdb_ac[:-1]):
    if pdb == sort_pdb_ac[i+1]:
        duplicate_list.append(pdb)
        multi_indices = np.where(pdb == pdb_accessions)[0]
        # Flag every occurrence except the last one. Flagging only the first
        # occurrence would leave several copies behind whenever an accession
        # is present three or more times.
        is_duplicate[multi_indices[:-1]] = True

np_pdbs = np.array(pdbs)
singlet_pdbs = np_pdbs[~is_duplicate] # This is the reduced list with ONLY UNIQUE PDBs
print(f"Reduced the initial set of {len(pdbs)} PDB files down to {len(singlet_pdbs)} files.")

# This is a check routine if there are PDBs in the reduced list which have NOT a STRIDE file
singlet_pdb_accessions = np.array([p.split("_unrelaxed_")[0].split("/")[-1].split("-")[0] for p in singlet_pdbs])

# Set lookup keeps the membership test cheap for thousands of accessions.
accession_set = set(accessions)
counter = 0
for ac in singlet_pdb_accessions:
    if ac not in accession_set:
        # Accession without a STRIDE file: print it for manual inspection.
        print(ac)
    else:
        counter += 1
print(f"Found {counter}/{len(singlet_pdb_accessions)} accessions in the accession list.")

In [None]:
# Load all PKD sequences and the STRIDE files, then filter them via FilterCollection.
fasta_file = "../all_pkds.fa"
# NOTE(review): stride_folder is re-pointed to a relative path here but the glob
# below hard-codes the same path instead of using the variable - confirm intended.
stride_folder = "../pkds_stride"
sequences = gaingrn.utils.io.read_multi_seq(fasta_file)
print(len(sequences))
stride_files = glob.glob("../pkds_stride/*")
print(len(stride_files))

filtered_pkds = FilterCollection(
                                   stride_files = stride_files,
                                   sequences = sequences
                                   )
# NOTE(review): written two directory levels up ("../../"), while the inputs above
# sit one level up ("../") - verify this is the file read back in later cells.
filtered_pkds.write_all_seq("../../all_pkd_gain.fa")

In [2]:
def filter_by_receptor(sequences, selection):
    """Return the sequence tuples whose first element contains ``selection``.

    Parameters:
        sequences: iterable of tuples; item 0 of each tuple is tested.
        selection: value looked up with ``in`` inside item 0 (a substring when
            item 0 is a string).

    Returns:
        a new list with the matching tuples in their original order.
    """
    return [entry for entry in sequences if selection in entry[0]]

def filter_by_list(sequences, selection): # selection list
    """Return the sequence tuples whose first element contains any selection item.

    Parameters:
        sequences: iterable of tuples; item 0 of each tuple is tested.
        selection: iterable of values, each looked up with ``in`` inside item 0.

    Returns:
        a new list with the matching tuples in their original order. Each tuple
        appears at most once per occurrence in ``sequences``, even when several
        selection items match it (e.g. "PKD1" and "PKD1L1" both match a
        "PKD1L1" header).
    """
    # Materialize once so a one-shot iterator can be re-scanned for every tuple.
    wanted = list(selection)
    return [seq_tup for seq_tup in sequences if any(it in seq_tup[0] for it in wanted)]

In [13]:
# Load the filtered GAIN sequences (name, sequence tuples) and the STRIDE files.
valid_seqs = gaingrn.utils.io.read_multi_seq("/home/hildilab/agpcr_nom/all_pkd_gain.fa")
# NOTE(review): this folder is "pkd_stride", whereas the STRIDE output folder defined
# in the first cell is "pkds_stride" - confirm which one is correct.
stride_files = glob.glob("/home/hildilab/agpcr_nom/pkd_stride/*")

# re-offset the sequences to match the exact PDB indexing
f_seqs = gaingrn.utils.io.read_alignment("/home/hildilab/agpcr_nom/all_pkds.fa")
# Key the full-length sequences by the part of the name before the first "-".
full_seqs = {k.split("-")[0]:v for k,v in f_seqs.items()}

valid_adj_seqs = []
for tup in valid_seqs:
    name = tup[0].split("-")[0]
    # x locates the filtered sequence inside the full-length one
    # (exact return convention of find_the_start is not visible here - TODO confirm).
    x = gaingrn.utils.structure_utils.find_the_start(longseq=full_seqs[name], shortseq=tup[1])
    if x == 0:
        # NOTE(review): this slice holds len(tup[1])-1 residues, one fewer than the
        # slice taken in the else branch - verify that this is intended.
        valid_adj_seqs.append( (name,full_seqs[name][:len(tup[1])-1]) )
    else:
        # Shift the window one residue towards the N-terminus; length stays len(tup[1]).
        valid_adj_seqs.append( (name,full_seqs[name][x-1:x+len(tup[1])-1]) )


2823


In [None]:
# Build the GainCollection from the STRIDE files and the re-offset sequences;
# sequences are passed directly (sequence_files=None) and no alignment is used.
valid_collection = GainCollection(
                                    stride_files = stride_files,
                                    sequence_files=None,
                                    sequences=valid_adj_seqs,
                                    is_truncated = True,
                                    coil_weight=0.00,
                                    stride_outlier_mode=True,
                                    no_alignment=True,
                                    debug=False)
# Persist the collection; the context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('../pkd_collection.pkl', 'wb') as pkl_file:
    pickle.dump(valid_collection, pkl_file)

In [None]:
def truncate_pdb(start, end, oldpdb, newpdb):
    """Write a copy of a PDB file restricted to a residue range.

    Lines starting with 'ATOM' are kept only when the residue number read from
    columns 22-25 (``line[22:26]``) lies within ``start``..``end`` inclusive.
    All other lines are copied unchanged.

    Parameters:
        start: first residue number to keep.
        end: last residue number to keep.
        oldpdb: path of the PDB file to read.
        newpdb: path of the truncated PDB file to write.
    """
    kept_records = []
    with open(oldpdb) as source:
        for record in source:
            # Drop ATOM records that fall outside the requested residue window.
            if record.startswith('ATOM') and not (start <= int(record[22:26]) <= end):
                continue
            kept_records.append(record)

    with open(newpdb, 'w') as target:
        target.write(''.join(kept_records))

# Re-collect the "rank_1" model PDBs (relative path) and truncate each one to
# the start/end residues stored on the corresponding GAIN domain.
pdbs = glob.glob("../pkd_pdbs/*_rank_1_*.pdb")
print(f"Found {len(pdbs)} best ranked models in target directories.")
valid_ct = 0

for gain in valid_collection.collection:
    valid_ct +=1
    name = gain.name
    # NOTE(review): substring match on the part of the name before the first "_";
    # the first hit is taken, and [0] raises IndexError if no PDB path matches.
    tar_pdb = [p for p in pdbs if name.split("_")[0] in p][0]
    # Output file is named after the part of the GAIN name before the first "-".
    new_pdb_path = f'../trunc_pkd_pdbs/{gain.name.split("-")[0]}.pdb'
    truncate_pdb(gain.start,gain.end, tar_pdb, new_pdb_path)
print("Copied and truncated", valid_ct, "GAIN domains.")
    

In [None]:
# PLOT SEGMENT STATISTICS
# The title reports the number of re-offset sequences; output goes to the given savename.
valid_collection.plot_sse_hist(title=f"Polycystins (Total: {len(valid_adj_seqs)})",
                               n_max=26,
                               savename="../pkd_data/pkd_sse")

#### We proceed with **run_indexing_pkd.py** for generating the GAIN-GRN indexing of PKD1/PKD1L1 proteins.