In [1]:
import os
import shutil
from tqdm import tqdm
import subprocess

In [2]:
# Source folders whose files are later copied into combined_dir.
cds_sequences_dir = "/Users/akshayonly/Work/Sequence-Data/Nuo-Sequences"
interpro_sequences_dir = "/Users/akshayonly/Work/Sequence-Data/InterPro"

# Output folders.
# combined_dir: one sub-folder per subunit is created here by a later cell.
combined_dir = "/Users/akshayonly/Work/Sequence-Data/Combined"
# seq_dir: receives one concatenated .faa file per subunit folder.
seq_dir = "/Users/akshayonly/Work/Sequence-Data/Sequences"

# exist_ok=True creates missing folders and is a no-op for existing ones, so the
# cell can be re-run without a separate (and racy) os.path.exists() check.
os.makedirs(combined_dir, exist_ok=True)
os.makedirs(seq_dir, exist_ok=True)

In [3]:
# Subunit names, stored lower-cased.
subunits = list(map(str.lower, (
    'NuoN', 'NuoM', 'NuoL', 'NuoK', 'NuoJ', 'NuoI', 'NuoH', 'NuoG', 'NuoF',
    'NuoE', 'NuoC', 'NuoA', 'NuoB', 'NuoD', 'NuoCD', 'NuoBCD', 'NuoEF', 'NuoHI',
)))

In [4]:
def _subunit_fasta_pairs(directory):
    """Return sorted ``(subunit, filename)`` pairs for the regular files in ``directory``.

    The subunit key is the lower-cased part of the file name before its first
    underscore. Hidden entries (names starting with ".", such as macOS
    ``.DS_Store``) and sub-directories are skipped: they are not sequence files
    and would otherwise yield bogus subunit keys or fail when copied. Sorting
    makes the result independent of the platform's directory listing order.
    """
    return [
        (name.split('_')[0].lower(), name)
        for name in sorted(os.listdir(directory))
        if not name.startswith('.') and os.path.isfile(os.path.join(directory, name))
    ]


interpro_sequences = _subunit_fasta_pairs(interpro_sequences_dir)
cds_sequences = _subunit_fasta_pairs(cds_sequences_dir)

In [5]:
def combining_sequences(sequences, combined_dir, source_directory):
    """Copy each listed file into a per-subunit folder below ``combined_dir``.

    Args:
        sequences: Iterable of ``(subunit, fasta)`` pairs, where ``fasta`` is a
            file name inside ``source_directory``.
        combined_dir: Root folder; each file lands in
            ``combined_dir/<SUBUNIT>/`` (subunit name upper-cased). The folder
            is created when it does not exist yet.
        source_directory: Folder the files are copied from.

    Returns:
        None. An existing destination file with the same name is overwritten.
    """
    for subunit_key, file_name in sequences:
        target_dir = os.path.join(combined_dir, subunit_key.upper())
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        shutil.copy(
            os.path.join(source_directory, file_name),
            os.path.join(target_dir, file_name),
        )

In [6]:
# Gather both sources under combined_dir. InterPro is copied first and CDS second,
# so a CDS file with the same name in the same subunit folder replaces the
# InterPro copy.
for _pairs, _origin_dir in (
    (interpro_sequences, interpro_sequences_dir),
    (cds_sequences, cds_sequences_dir),
):
    combining_sequences(_pairs, combined_dir, _origin_dir)

In [7]:

# Concatenate the .faa files of every subunit folder into a single file in seq_dir.
# This is done in Python instead of a shell `cat ... > ...` so that paths with
# spaces or shell metacharacters work and I/O errors raise instead of being ignored.
for sequences in tqdm(os.listdir(combined_dir)):
    sequences_dir = os.path.join(combined_dir, sequences)
    if not os.path.isdir(sequences_dir):
        # Stray files directly inside combined_dir are not subunit folders.
        continue

    # Like the shell pattern `*.faa`, hidden files are not picked up; sorting
    # makes the concatenation order deterministic.
    fasta_parts = sorted(
        part_name
        for part_name in os.listdir(sequences_dir)
        if part_name.endswith(".faa") and not part_name.startswith(".")
    )
    if not fasta_parts:
        # Nothing to merge: do not leave an empty FASTA file behind.
        continue

    fasta = f"combined_cds_interpro_{sequences.lower()}.faa"
    concat_sequences = f"{seq_dir}/{fasta}"
    with open(concat_sequences, "wb") as merged:
        for part_name in fasta_parts:
            with open(os.path.join(sequences_dir, part_name), "rb") as part:
                content = part.read()
            merged.write(content)
            # A part without a trailing newline would otherwise have its last
            # sequence line fused with the next file's header line.
            if content and not content.endswith(b"\n"):
                merged.write(b"\n")

100%|██████████| 16/16 [00:00<00:00, 268.47it/s]


In [8]:
def run_mmseqs_commands(fasta_file, basename, output_dir, threshold=0.85):
    """Cluster ``fasta_file`` with MMseqs2 and export the resulting subset as FASTA.

    Runs, in order, ``mmseqs createdb``, ``cluster``, ``createsubdb`` and
    ``convert2fasta`` with their output silenced. Intermediate databases are
    stored in ``<output_dir>/temp``, which is also passed to ``cluster`` as its
    temporary directory. The final file is
    ``<output_dir>/<basename>_clustered_mmseq_<pct>.fasta``, where ``pct`` is
    ``threshold`` expressed as a whole percentage.

    Args:
        fasta_file: Path of the input FASTA file.
        basename: Prefix used for the database names and the output file name.
        output_dir: Folder for the output FASTA; created if missing.
        threshold: Value passed to ``--min-seq-id`` (fraction between 0 and 1).

    Raises:
        subprocess.CalledProcessError: If any ``mmseqs`` command exits non-zero.
        FileNotFoundError: If the ``mmseqs`` executable cannot be found.
    """
    temp_dir = os.path.join(output_dir, "temp")
    # Creates output_dir as well, since temp_dir lives inside it.
    os.makedirs(temp_dir, exist_ok=True)

    db_name = os.path.join(temp_dir, f"{basename}_db")
    cluster_db_name = f"{db_name}_clu"
    subset_db_name = f"{cluster_db_name}_rep"
    # round(), not int(): 100 * 0.58 is 57.99999999999999 in floating point and
    # int() would truncate it to 57, mislabelling the output file.
    clustering_threshold = round(100 * threshold)
    output_fasta = os.path.join(output_dir, f"{basename}_clustered_mmseq_{clustering_threshold}.fasta")

    commands = (
        ["mmseqs", "createdb", fasta_file, db_name],
        ["mmseqs", "cluster", db_name, cluster_db_name, temp_dir, "--min-seq-id", str(threshold)],
        ["mmseqs", "createsubdb", cluster_db_name, db_name, subset_db_name],
        ["mmseqs", "convert2fasta", subset_db_name, output_fasta],
    )
    # Each step consumes the previous step's output, so stop at the first failure.
    for command in commands:
        subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

output_dir = "/Users/akshayonly/Work/Sequence-Data/Clustered-Sequences"
os.makedirs(output_dir, exist_ok=True)

# Cluster every concatenated .faa file in seq_dir with --min-seq-id 0.75.
# Filtering before tqdm keeps the progress total equal to the files processed.
faa_files = sorted(entry for entry in os.listdir(seq_dir) if entry.endswith(".faa"))
for fasta_file in tqdm(faa_files):
    # splitext strips only the final extension; split('.')[0] would cut the
    # name at its first dot and could make different files share a basename.
    basename = os.path.splitext(fasta_file)[0]
    fasta_file_path = os.path.join(seq_dir, fasta_file)
    run_mmseqs_commands(fasta_file_path, basename, output_dir, threshold=0.75)

100%|██████████| 16/16 [00:26<00:00,  1.65s/it]
