In [1]:
import os
import warnings
from pathlib import Path

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm

warnings.filterwarnings('ignore')  # Ignore all warnings

from config import HMM_RESULTS_DIR, PROTEOMES_DIR

In [3]:
# Load results_unfil.csv from the HMM analysis directory into a DataFrame.
# NOTE(review): HMM_ANALYSIS_DIR is not one of the names the first cell imports
# from `config` (that cell brings in HMM_RESULTS_DIR and PROTEOMES_DIR) — confirm
# where it is defined so this cell works on a freshly restarted kernel.
hmmer = pd.read_csv(HMM_ANALYSIS_DIR.joinpath("results_unfil.csv"))

In [4]:
# Quick sanity check of the loaded table: prints column names, non-null counts and dtypes.
hmmer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1078643 entries, 0 to 1078642
Data columns (total 19 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Accession           1078643 non-null  object 
 1   Replicon            1078643 non-null  object 
 2   GenomeFile          1078643 non-null  object 
 3   SequenceLength(Mb)  1078643 non-null  float64
 4   ProteomeFile        1078643 non-null  object 
 5   ProteinAccession    1078643 non-null  object 
 6   evalue              1078643 non-null  float64
 7   BitScore            1078643 non-null  float64
 8   Bias                1078643 non-null  float64
 9   SequenceDesc        1078643 non-null  object 
 10  Subunit             1078643 non-null  object 
 11  SeqsClustThreshold  1078643 non-null  int64  
 12  HMMParameter        1078643 non-null  object 
 13  Start               1078643 non-null  int64  
 14  End                 1078643 non-null  int64  
 15  log10evalue    

In [5]:
# Show the distinct values of the 'Subunit' column; the extraction loop iterates over these labels.
hmmer['Subunit'].unique()

array(['NuoF', 'NuoI', 'NuoG', 'NuoA', 'NuoB', 'NuoC', 'NuoD', 'NuoE',
       'NuoH', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN', 'NuoCD', 'NuoBCD'],
      dtype=object)

In [6]:
# Write one FASTA file per subunit containing the sequence of every hit in `hmmer`.
# NOTE(review): HMM_ANALYSIS_DIR is not among the names the import cell pulls from
# `config` (only HMM_RESULTS_DIR and PROTEOMES_DIR are) — confirm where it is defined
# so this cell survives Restart & Run All.
output_dir = HMM_ANALYSIS_DIR / "hits_sequences"
output_dir.mkdir(parents=True, exist_ok=True)

# Upper bound on how many missing accession IDs are echoed per proteome file,
# so a badly mismatched proteome cannot flood the cell output.
MAX_MISSING_SHOWN = 5


def _write_hits_from_proteome(proteome_path, accessions, output_handle):
    """Copy the records whose ID is in `accessions` from one proteome FASTA file.

    Parameters
    ----------
    proteome_path : str or path-like
        FASTA file to scan.
    accessions : iterable of str
        Record IDs to extract.
    output_handle : text file handle
        Open handle that matching records are written to in FASTA format.

    Returns
    -------
    set of str
        Requested accessions that were never seen in the file.

    Raises
    ------
    FileNotFoundError
        If `proteome_path` does not exist.
    """
    pending = set(accessions)
    for record in SeqIO.parse(proteome_path, 'fasta'):
        if record.id not in pending:
            continue

        # Strip '*' characters from the sequence and wrap the result in a Seq:
        # SeqRecord is documented to hold a Seq object rather than a plain str.
        cleaned_record = SeqRecord(
            seq=Seq(str(record.seq).replace('*', '')),
            id=record.id,
            description=record.description,
        )
        SeqIO.write(cleaned_record, output_handle, 'fasta')

        pending.discard(record.id)
        if not pending:
            # Everything requested from this file has been written; skip the rest.
            break
    return pending


for subunit in sorted(hmmer['Subunit'].unique()):
    selected_hmmer = hmmer[hmmer['Subunit'] == subunit]

    # All hits for this subunit are concatenated into a single FASTA file.
    output_fasta = output_dir / f"{subunit}_unfiltered_hits.fasta"

    # Map each proteome file to the accessions wanted from it, so that every
    # proteome is parsed at most once per subunit.
    proteome_accessions = (
        selected_hmmer.groupby('ProteomeFile')['ProteinAccession'].unique().to_dict()
    )

    with open(output_fasta, 'w') as output_handle:
        for proteome_file, accessions in tqdm(proteome_accessions.items(), desc=f"Processing {subunit}"):
            proteome_file_fp = os.path.join(PROTEOMES_DIR, proteome_file)
            try:
                missing = _write_hits_from_proteome(proteome_file_fp, accessions, output_handle)
            except FileNotFoundError:
                print(f"File not found: {proteome_file_fp}")
            except Exception as e:
                # Best effort: report the problem and carry on with the next proteome.
                print(f"An error occurred with file {proteome_file_fp}: {e}")
            else:
                if len(missing) == len(accessions):
                    print(f"No sequences found in {proteome_file_fp} for the specified accessions.")
                elif missing:
                    # Some, but not all, hits were absent from the proteome; report
                    # them so the output FASTA is not silently incomplete.
                    shown = ", ".join(sorted(missing)[:MAX_MISSING_SHOWN])
                    suffix = ", ..." if len(missing) > MAX_MISSING_SHOWN else ""
                    print(
                        f"{len(missing)} of {len(accessions)} accessions not found in "
                        f"{proteome_file_fp}: {shown}{suffix}"
                    )

Processing NuoA: 100%|██████████████████████████████████████████████████████████████████████████| 31048/31048 [03:08<00:00, 164.91it/s]
Processing NuoB: 100%|██████████████████████████████████████████████████████████████████████████| 31669/31669 [03:21<00:00, 157.03it/s]
Processing NuoBCD: 100%|██████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 165.07it/s]
Processing NuoC: 100%|██████████████████████████████████████████████████████████████████████████| 14621/14621 [01:27<00:00, 167.11it/s]
Processing NuoCD: 100%|█████████████████████████████████████████████████████████████████████████| 16881/16881 [01:44<00:00, 161.57it/s]
Processing NuoD: 100%|██████████████████████████████████████████████████████████████████████████| 23987/23987 [02:46<00:00, 143.66it/s]
Processing NuoE: 100%|██████████████████████████████████████████████████████████████████████████| 30016/30016 [03:07<00:00, 160.29it/s]
Processing NuoF: 100%|██████████████████████████