In [1]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm
from glob import glob

import re

In [2]:
def parse_hmmer_tblout_output(file_fullpath):
    """
    Parse HMMER tblout output to extract key information and store it in a pandas DataFrame.
    
    Args:
        file_fullpath (str): Full path to the file containing HMMER tblout output.
    
    Returns:
        pd.DataFrame: DataFrame containing parsed hit data with cleaned and formatted columns.
    """
    # Precompile the regex pattern to remove accession version numbers.
    accession_pattern = re.compile(r'_[0-9]+$')

    # Initialize an empty list to store hit data.
    hits = []

    # Read the file, skipping the first three header lines.
    with open(file_fullpath, 'r') as handle:
        lines = handle.readlines()[3:]

    # Process each line in the file.
    for line in lines:
        if not line.strip() or line.startswith('#'):
            continue  # Skip empty lines and comments.
        
        # Split the line into columns and remove empty strings.
        cols = [x for x in line.strip().split() if x]

        # Extract the subunit name and format it.
        subunit_text = "Nuo" + cols[2].split('_')[3].replace('nuo', '').upper()

        # Create a dictionary for the current hit.
        hit = {
            'Accession': accession_pattern.sub('', cols[0]),  # Clean the accession number.
            'ProteinAccession' : cols[0],
            'Subunit': subunit_text,
            'evalue': np.float64(cols[4]),
            'BitScore': np.float64(cols[5]),
            'Bias': np.float64(cols[6]),
            # 'evalueDomain': float(cols[7]),
            # 'BitScoreDomain': float(cols[8]),
            # 'BiasDomain': float(cols[9]),
            'SequenceDesc': " ".join(cols[18:])  # Combine remaining columns for description.
        }

        # Append the hit to the list.
        hits.append(hit)

    # Convert the list of hits to a DataFrame and return.
    return pd.DataFrame(hits)


In [3]:
def process_directory(main_directory):
    """
    Traverse through all subdirectories of the main directory and process all txt files.
    
    Args:
        main_directory (str): Path to the main directory containing subdirectories of HMMER result files.
    
    Returns:
        pd.DataFrame: A single DataFrame containing all parsed data from all files.
    """
    all_dataframes = []  # List to hold all dataframes

    # Traverse each subdirectory and each file within the subdirectory
    for subdir in glob(os.path.join(main_directory, '*/')):  # Path pattern to get all subdirectories
        for file in tqdm(glob(os.path.join(subdir, '*.txt'))):  # Path pattern to get all txt files
            df = parse_hmmer_tblout_output(file)
            all_dataframes.append(df)

    # Concatenate all individual dataframes into one
    final_dataframe = pd.concat(all_dataframes, ignore_index=True)
    return final_dataframe

In [4]:
hmmer_results = "/Users/akshayonly/Work/Search-Results"
df_hmmer_results = process_directory(hmmer_results)

100%|█████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 5969.38it/s]
100%|███████████████████████████████████████████████████████████████████████████| 42549/42549 [00:05<00:00, 7155.33it/s]
100%|███████████████████████████████████████████████████████████████████████████| 42549/42549 [00:06<00:00, 6155.59it/s]
100%|███████████████████████████████████████████████████████████████████████████| 42593/42593 [00:06<00:00, 7051.63it/s]
100%|███████████████████████████████████████████████████████████████████████████| 42593/42593 [00:06<00:00, 7019.79it/s]
100%|███████████████████████████████████████████████████████████████████████████| 42549/42549 [00:06<00:00, 6495.73it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 6806.93it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 8073.38it/s]
100%|███████████████████████████

In [5]:
df_hmmer_results.reset_index(drop=True)

Unnamed: 0,Accession,ProteinAccession,Subunit,evalue,BitScore,Bias,SequenceDesc
0,CP016684.1,CP016684.1_1005,NuoM,8.700000e-194,642.4,45.7,# 1087039 # 1088535 # 1 # ID=1_1005;partial=00...
1,CP016684.1,CP016684.1_1004,NuoM,9.300000e-45,150.7,37.5,# 1084926 # 1086950 # 1 # ID=1_1004;partial=00...
2,CP016684.1,CP016684.1_1006,NuoM,3.300000e-44,148.9,37.6,# 1088545 # 1089990 # 1 # ID=1_1006;partial=00...
3,NZ_CP047242.1,NZ_CP047242.1_3932,NuoM,2.400000e-177,589.7,24.8,# 4762872 # 4764485 # 1 # ID=1_3932;partial=00...
4,NZ_CP047242.1,NZ_CP047242.1_2798,NuoM,1.700000e-171,570.4,23.2,# 3453876 # 3455453 # -1 # ID=1_2798;partial=0...
...,...,...,...,...,...,...,...
1223074,NZ_CP016569.1,NZ_CP016569.1_1466,NuoF,1.800000e-18,65.3,4.8,# 1532421 # 1534628 # -1 # ID=1_1466;partial=0...
1223075,NZ_CP016569.1,NZ_CP016569.1_2050,NuoF,6.600000e-16,56.8,0.0,# 2129822 # 2131177 # 1 # ID=1_2050;partial=00...
1223076,AP014565.1,AP014565.1_2516,NuoF,1.400000e-198,659.8,0.0,# 2601185 # 2602522 # -1 # ID=1_2516;partial=0...
1223077,AP014565.1,AP014565.1_1604,NuoF,1.100000e-18,66.0,4.2,# 1656470 # 1658677 # -1 # ID=1_1604;partial=0...


In [6]:
# Extracting numbers using regular expressions
for i, row in tqdm(df_hmmer_results.iterrows()):
    description = row['SequenceDesc']  # Accessing row data using 'row'
    match = re.search(r'#\s*(\d+)\s*#\s*(\d+)\s*', description)
    if match:
        start, end = map(int, match.groups())
        df_hmmer_results.at[i, 'Start'] = start
        df_hmmer_results.at[i, 'End'] = end

df_hmmer_results['Start'] = df_hmmer_results['Start'].astype(int)
df_hmmer_results['End'] = df_hmmer_results['End'].astype(int) 

df_hmmer_results.sort_values(by='evalue', inplace=True)
df_hmmer_results.drop_duplicates(subset=['Accession', 'ProteinAccession'], keep='first', inplace=True)
df_hmmer_results.reset_index(drop=True, inplace=True)

1223079it [00:23, 51185.30it/s]


In [7]:
df_hmmer_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 757211 entries, 0 to 757210
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Accession         757211 non-null  object 
 1   ProteinAccession  757211 non-null  object 
 2   Subunit           757211 non-null  object 
 3   evalue            757211 non-null  float64
 4   BitScore          757211 non-null  float64
 5   Bias              757211 non-null  float64
 6   SequenceDesc      757211 non-null  object 
 7   Start             757211 non-null  int64  
 8   End               757211 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 52.0+ MB


In [11]:
df_hmmer_results = df_hmmer_results.sort_values(['Accession', 'Start'])

In [12]:
df_hmmer_results.to_csv('/Users/akshayonly/Work/Updated/Data/hmmer_search_results_processed.csv', index=False)