In [1]:
import os
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm
import psa
from pathlib import Path
from config import HMM_RESULTS_DIR

def read_fasta_sequence(filepath):
    """Read one record from a FASTA file and return its sequence as a string.

    Args:
        filepath: Location of the FASTA file; passed to ``SeqIO.read`` as-is.

    Returns:
        The record's sequence converted with ``str``, or ``None`` if the file
        does not exist (a notice is printed in that case).
    """
    # NOTE: a function with this same name is defined again further down in
    # this cell, and that later definition is the one in effect at runtime.
    # This copy handles a missing file the same way so both agree with the
    # caller, which checks for a ``None`` return.
    try:
        record = SeqIO.read(filepath, "fasta")
        return str(record.seq)
    except FileNotFoundError:
        print(f"Reference file {filepath} not found.")
        return None

def align_and_calculate_scores(reference_seq, query_sequence):
    """Align a reference protein against one other sequence with ``psa.water``.

    Args:
        reference_seq: Reference sequence; handed to ``psa.water`` as ``qseq``.
        query_sequence: Sequence to compare; converted with ``str`` and handed
            to ``psa.water`` as ``sseq``.

    Returns:
        A tuple ``(identity, similarity, query_coverage)`` taken from the
        alignment's ``pidentity``, ``psimilarity`` and ``query_coverage()``.
    """
    # NOTE(review): the reference is supplied as psa's qseq, so
    # query_coverage() presumably describes coverage of the reference rather
    # than of the hit - confirm against the psa documentation.
    alignment = psa.water(
        moltype='prot',
        qseq=reference_seq,
        sseq=str(query_sequence),  # psa is given a plain string
    )
    return (
        alignment.pidentity,
        alignment.psimilarity,
        alignment.query_coverage(),
    )

def read_fasta_sequence(filepath):
    """Return the sequence stored in a FASTA file as a plain string.

    Args:
        filepath: Location of the FASTA file; passed to ``SeqIO.read`` as-is.

    Returns:
        The record's sequence converted with ``str``. If the file cannot be
        found, a notice is printed and ``None`` is returned instead.
    """
    sequence_text = None
    try:
        sequence_text = str(SeqIO.read(filepath, "fasta").seq)
    except FileNotFoundError:
        # Callers treat None as "no reference available".
        print(f"Reference file {filepath} not found.")
    return sequence_text

In [3]:
# Directory containing the hit FASTA files that the later cells list and align.
# NOTE(review): HMM_ANALYSIS_DIR is not defined in the visible cells (the import
# cell brings in HMM_RESULTS_DIR from config, and no In [2] cell is shown) -
# confirm where it comes from so the notebook survives Restart & Run All.
hitssequences = HMM_ANALYSIS_DIR / "hits_sequences"

In [4]:
# Names of all entries in the hits directory. os.listdir already returns a
# list, so no wrapping comprehension is needed.
hitssequences_lists = os.listdir(hitssequences)

In [5]:
# Show the entries found in the hits directory.
hitssequences_lists

['NuoB_unfiltered_hits.fasta']

In [6]:
# For every "<Subunit>_*.fasta" file of hits, align each hit against that
# subunit's reference protein and write one CSV of scores per subunit.
#
# NOTE(review): this cell uses names that are not defined in the visible cells:
# HMM_ANALYSIS_DIR, pairwise_seqs_align, send_pushover_notification, user_key
# and api_token. Confirm they are created earlier in the notebook so it
# survives Restart & Run All.
output_dir = HMM_ANALYSIS_DIR / "pairwise_seqs_align"
output_dir.mkdir(parents=True, exist_ok=True)

# Fixed column order, so a hits file that yields no successful alignment still
# produces a CSV with a header row instead of an empty file.
result_columns = ['ReferenceSubunit', 'ProteinAccession', 'Identity', 'Similarity', 'QueryCoverage']

# Iterate through each sequence file in the hits directory (sorted for a
# deterministic processing order).
for sequence in sorted(hitssequences_lists):
    if not sequence.endswith('.fasta'):
        continue

    # The subunit name is the part of the file name before the first underscore.
    reference_subunit = sequence.split('_')[0]

    # Define file paths
    query_fasta = os.path.join(hitssequences, sequence)
    reference_fasta = os.path.join(pairwise_seqs_align, f"Reference_Proteins/{reference_subunit}/{reference_subunit}_protein.fasta")

    # Read the reference sequence; None means the reference file was not found.
    reference_sequence = read_fasta_sequence(reference_fasta)
    if reference_sequence is None:
        print(f"Skipping {reference_subunit} due to missing reference sequence.")
        continue

    # One dictionary of scores per successfully aligned hit.
    results = []

    # Iterate through query FASTA file
    for record in tqdm(SeqIO.parse(query_fasta, 'fasta'), desc=f"Alignment on {reference_subunit} hits"):
        query_sequence = record.seq

        # Best-effort: a hit whose alignment fails is reported and skipped so
        # one bad record does not abort the whole run.
        try:
            identity, similarity, query_coverage = align_and_calculate_scores(reference_sequence, query_sequence)
        except Exception as e:
            print(f"Error in alignment for {record.id}: {e}")
            continue

        results.append({
            'ReferenceSubunit': reference_subunit,
            'ProteinAccession': record.id,  # Accession ID from the query record
            'Identity': identity,
            'Similarity': similarity,
            'QueryCoverage': query_coverage
        })

    # Convert the results into a pandas DataFrame
    hmm_alignment_results = pd.DataFrame(results, columns=result_columns)

    # Save the DataFrame as a CSV file
    output_csv = os.path.join(output_dir, f"{reference_subunit}_alignment.csv")
    hmm_alignment_results.to_csv(output_csv, index=False)

    send_pushover_notification(f"Alignment results for {reference_subunit} saved to {output_csv}", user_key, api_token)
       

Alignment on NuoB hits: 52851it [11:02, 79.80it/s]
