In [1]:
import os
import time
import subprocess
from tqdm import tqdm

In [2]:
# E-value threshold; it is passed to hmmsearch through its -E option
evalue_cutoff = 1e-15

# Folder containing the proteome files (*.faa) that will be searched
proteome_dir = "/Users/akshayonly/Work/Sequence-Data/Remaining-Proteomes"

# Folder containing the profile HMM files (*.hmm) used as queries
profiles_dir = "/Users/akshayonly/Work/Sequence-Data/Subunits-Sequences/Profiles"

# Name of the results sub-directory, derived from the PARENT folder of
# profiles_dir (here: "Search_Subunits-Sequences")
subdir = "Search_" + os.path.basename(os.path.dirname(profiles_dir))

# Final path component of proteome_dir (here: "Remaining-Proteomes")
suffix = os.path.basename(proteome_dir)

In [3]:
# File names (not full paths) of the profile HMMs found in profiles_dir, sorted
profiles = sorted(name for name in os.listdir(profiles_dir) if name.endswith('.hmm'))

# File names (not full paths) of the proteomes found in proteome_dir, sorted
proteomes = sorted(name for name in os.listdir(proteome_dir) if name.endswith('.faa'))

In [4]:
# Run hmmsearch for every (profile HMM, proteome) pair. Each pair produces one
# tabular (--tblout) result file, stored in a per-profile directory under
# /Users/akshayonly/Work/Remaining-Search-Results.

# Loop-invariant, so it is computed once instead of once per profile. It is not
# used anywhere else in this cell; it is kept in case later cells read it.
hmm_search_subdir = os.path.join(os.getcwd(), subdir)

# One (profile, proteome, reason) entry per search that did not finish cleanly.
# Failures are collected rather than raised so the remaining searches still run.
failed_searches = []

for profile in profiles:
    # Full path of the profile HMM file
    profile_fp = os.path.join(profiles_dir, profile)

    # Search name taken from the profile file name.
    # NOTE(review): the recorded progress-bar output shows names such as
    # "combined_cds_interpro_nuoa.hmm", which contain neither "_profile.hmm"
    # nor "_clustered_mmseq_75_profile", so both replace() calls in this cell
    # look like no-ops for the current profile set (the ".hmm" extension stays
    # in the directory name) - confirm whether that is intended.
    hmm_search = profile.replace('_profile.hmm', '')

    # Directory that receives the result files for this profile
    hmm_search_dir = os.path.join("/Users/akshayonly/Work/Remaining-Search-Results", f"Remaining-{hmm_search}")
    os.makedirs(hmm_search_dir, exist_ok=True)

    # Label shown in the progress bar
    profile_name = profile.replace('_clustered_mmseq_75_profile', '')

    for proteome in tqdm(proteomes, desc=f"Searching with {profile_name}"):
        # Full path of the proteome file
        proteome_fp = os.path.join(proteome_dir, proteome)

        # Result file: same name as the proteome, with '.faa' replaced by '.txt'
        result = os.path.join(hmm_search_dir, proteome.replace('.faa', '.txt'))

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to hmmsearch verbatim.
        hmmer_command = [
            "hmmsearch",
            "--cpu", "8",
            "-E", str(evalue_cutoff),
            "--noali",
            "--tblout", result,
            profile_fp,
            proteome_fp,
        ]

        try:
            # stdout is still discarded (the hits go to the --tblout file);
            # stderr is captured so a failure can be reported.
            completed = subprocess.run(
                hmmer_command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
            )
        except OSError as error:
            # e.g. the hmmsearch executable is not on PATH
            failed_searches.append((profile, proteome, str(error)))
            continue

        if completed.returncode != 0:
            stderr_text = completed.stderr.decode(errors="replace").strip()
            failed_searches.append(
                (profile, proteome, f"exit code {completed.returncode}: {stderr_text}")
            )

# Report problems once, after all progress bars have finished
if failed_searches:
    print(f"{len(failed_searches)} hmmsearch run(s) failed; showing up to 5:")
    for failed_profile, failed_proteome, reason in failed_searches[:5]:
        print(f"  {failed_profile} vs {failed_proteome}: {reason}")

Searching with combined_cds_interpro_nuoa.hmm: 100%|████████████████████████████████████| 44/44 [00:00<00:00, 79.67it/s]
Searching with combined_cds_interpro_nuob.hmm: 100%|████████████████████████████████████| 44/44 [00:00<00:00, 87.05it/s]
Searching with combined_cds_interpro_nuobcd.hmm: 100%|██████████████████████████████████| 44/44 [00:00<00:00, 56.92it/s]
Searching with combined_cds_interpro_nuoc.hmm: 100%|████████████████████████████████████| 44/44 [00:00<00:00, 82.16it/s]
Searching with combined_cds_interpro_nuocd.hmm: 100%|███████████████████████████████████| 44/44 [00:00<00:00, 63.52it/s]
Searching with combined_cds_interpro_nuod.hmm: 100%|████████████████████████████████████| 44/44 [00:00<00:00, 75.19it/s]
Searching with combined_cds_interpro_nuoe.hmm: 100%|████████████████████████████████████| 44/44 [00:00<00:00, 83.05it/s]
Searching with combined_cds_interpro_nuof.hmm: 100%|████████████████████████████████████| 44/44 [00:00<00:00, 69.51it/s]
Searching with combined_cds_inte