In [1]:
import pandas as pd
import numpy as np
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from tqdm import tqdm
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from matplotlib.ticker import ScalarFormatter

import re

import warnings

# NOTE: silences every warning category for the whole kernel session, not just
# one noisy library. Deprecation and numerical RuntimeWarnings raised by later
# cells will therefore not be shown.
warnings.filterwarnings('ignore')

In [2]:
def parse_results_tblout_output(file_fullpath):
    """
    Parse one tblout-style results file into a pandas DataFrame.

    Every non-blank line that is not a ``#`` comment is split on whitespace and
    read positionally: field 0 is the target (protein) name, field 2 the
    profile name, fields 4-6 the E-value, bit score and bias, and fields 18
    onward are re-joined with single spaces as the free-text description.
    (This matches the per-target table layout written by HMMER's ``--tblout``.)

    Header and footer lines are recognised by their leading ``#`` instead of by
    position, so a file with a missing, shorter or longer header is parsed
    without silently dropping data rows.

    Args:
        file_fullpath (str): Full path to the file containing results tblout output.

    Returns:
        pd.DataFrame: One row per hit with the columns ``Accession``,
        ``ProteinAccession``, ``Profile``, ``evalue``, ``BitScore``, ``Bias``
        and ``SequenceDesc``. A file without any data rows yields an empty
        DataFrame that has no columns.

    Raises:
        IndexError: If a data line has fewer than seven whitespace-separated fields.
        ValueError: If the E-value, score or bias field is not numeric.
    """
    # Strips a trailing "_<digits>" suffix from the target name, e.g.
    # "ABC_123.1_45" -> "ABC_123.1". Presumably this is the per-sequence gene
    # index appended by the gene caller -- confirm against the input FASTA.
    trailing_index = re.compile(r'_[0-9]+$')

    hits = []

    # Stream the file instead of loading it whole; only '#' decides what is a
    # header/footer line, so no fixed number of leading lines is discarded.
    with open(file_fullpath, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue  # Blank line or comment/header/footer.

            # str.split() without arguments never yields empty strings.
            cols = line.split()

            hits.append({
                'Accession': trailing_index.sub('', cols[0]),
                'ProteinAccession': cols[0],
                'Profile': cols[2],
                'evalue': np.float64(cols[4]),
                'BitScore': np.float64(cols[5]),
                'Bias': np.float64(cols[6]),
                # Everything after the 18 fixed fields is the description.
                'SequenceDesc': " ".join(cols[18:]),
            })

    return pd.DataFrame(hits)

In [3]:
def process_hmmer_results(result_dir, pattern_str=r'#\s*(\d+)\s*#\s*(\d+)\s*'):
    """
    Load every ``.txt`` HMMER result table in a directory into one hit table.

    Each file is parsed with ``parse_results_tblout_output``; the hits are
    pooled, reduced to the best hit per protein, and annotated.

    Processing steps:
      1. Files are read in sorted path order so the outcome does not depend on
         the filesystem's listing order.
      2. Hits are ordered by ascending ``evalue``; ties (for example several
         hits whose E-value is 0) are broken by descending ``BitScore``.
      3. Only the first row per (``Accession``, ``ProteinAccession``) is kept.
      4. ``Subunit`` is taken from the first ``_``-separated part of ``Profile``,
         which must split into exactly three parts
         (subunit, sequence-clustering threshold, HMM parameter).
      5. ``Start`` / ``End`` are captured from ``SequenceDesc`` with
         ``pattern_str`` (groups 1 and 2); rows without a match get 0.
      6. ``log10evalue`` is ``log10(evalue)``; an E-value of exactly 0 becomes
         ``-inf``, so clip or filter before plotting on that column.

    Parameters:
    result_dir (str): The directory containing the .txt HMMER output files.
    pattern_str (str): The regex pattern for extracting 'Start' and 'End' from 'SequenceDesc'.
                       Defaults to '#\\s*(\\d+)\\s*#\\s*(\\d+)\\s*'.

    Returns:
    pd.DataFrame: Columns ``Accession``, ``ProteinAccession``, ``evalue``,
                  ``BitScore``, ``Bias``, ``SequenceDesc``, ``Subunit``,
                  ``Start``, ``End`` and ``log10evalue`` with a fresh 0..n-1 index.

    Raises:
    ValueError: If the directory holds no ``.txt`` file with at least one hit,
                or if the profile names do not split into three ``_`` parts.
    """
    # Sorted for reproducibility: glob() returns paths in arbitrary order.
    file_paths = sorted(glob(os.path.join(result_dir, '*.txt')))
    parsed = [parse_results_tblout_output(path) for path in tqdm(file_paths)]

    # Files without hits parse to column-less empty frames; they carry no
    # information, so leave them out and fail clearly when nothing remains.
    frames = [frame for frame in parsed if not frame.empty]
    if not frames:
        raise ValueError(
            "No HMMER hits found in any '*.txt' file under {!r}".format(result_dir)
        )

    # Best hit first: lowest E-value, then highest bit score on E-value ties,
    # so that keep='first' retains a well-defined row per protein.
    results = (
        pd.concat(frames, ignore_index=True)
        .sort_values(by=['evalue', 'BitScore'], ascending=[True, False])
        .drop_duplicates(subset=['Accession', 'ProteinAccession'], keep='first')
        .reset_index(drop=True)
    )

    # Only the subunit part of the profile name is kept in the output.
    profile_parts = results['Profile'].str.split('_', expand=True)
    if profile_parts.shape[1] != 3:
        raise ValueError(
            "Expected 'Profile' names of the form "
            "<Subunit>_<SeqsClustThreshold>_<HMMParameter>, but splitting on "
            "'_' produced {} part(s)".format(profile_parts.shape[1])
        )
    results['Subunit'] = profile_parts[0]

    # Extract 'Start' and 'End' from 'SequenceDesc' in a single regex pass.
    pattern = re.compile(pattern_str)
    starts, ends = [], []
    for desc in results['SequenceDesc']:
        match = pattern.search(desc) if pd.notnull(desc) else None
        starts.append(int(match.group(1)) if match else 0)
        ends.append(int(match.group(2)) if match else 0)
    results['Start'] = starts
    results['End'] = ends

    # log10(0) is -inf by design here; silence only that divide-by-zero notice.
    with np.errstate(divide='ignore'):
        results['log10evalue'] = np.log10(results['evalue'])

    # 'Profile' has been reduced to 'Subunit' and is no longer needed.
    return results.drop(columns=['Profile'])

In [4]:
# Parse all raw HMMER search tables from the results directory into one hit table.
raw_search_dir = "/Users/akshayonly/Work/04-Complex-I/Data/04-HMM-Analysis-Data/01-HMM-Subunits-Search-Raws"
results = process_hmmer_results(raw_search_dir)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47278/47278 [00:17<00:00, 2723.56it/s]


In [5]:
# Save the unfiltered hit table as CSV, without the DataFrame index column.
unfiltered_csv_path = '/Users/akshayonly/Work/04-Complex-I/Data/07-Figures/regenerate/results_unfil.csv'
results.to_csv(unfiltered_csv_path, index=False)