In [54]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple, List
import blosum as bl
from numba import jit
import multiprocessing as mp
from functools import partial
import time

In [55]:
def align_sequences(seq1_encoded, seq2_encoded, blosum_array, gap_open, gap_extend):
    """Semi-global alignment - returns matrices and score."""
    n = len(seq1_encoded)
    m = len(seq2_encoded)

    M = np.full((n + 1, m + 1), -np.inf, dtype=np.float32)
    X = np.full((n + 1, m + 1), -np.inf, dtype=np.float32)
    Y = np.full((n + 1, m + 1), -np.inf, dtype=np.float32)

    M[0, 0] = 0.0
    X[0, 0] = 0.0
    Y[0, 0] = 0.0

    for j in range(1, m + 1):
        M[0, j] = 0.0
        X[0, j] = -np.inf
        Y[0, j] = 0.0

    for i in range(1, n + 1):
        M[i, 0] = 0.0
        X[i, 0] = 0.0
        Y[i, 0] = -np.inf

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            char_score = blosum_array[seq1_encoded[i-1], seq2_encoded[j-1]]
            M[i, j] = char_score + max(M[i-1, j-1], X[i-1, j-1], Y[i-1, j-1])
            X[i, j] = max(M[i-1, j] + gap_open, X[i-1, j] + gap_extend)
            Y[i, j] = max(M[i, j-1] + gap_open, Y[i, j-1] + gap_extend)

    max_row = max(M[n, :].max(), X[n, :].max(), Y[n, :].max())
    max_col = max(M[:, m].max(), X[:, m].max(), Y[:, m].max())
    score = max(max_row, max_col)

    return M, X, Y, score


In [56]:
class SemiGlobalAligner:
    """Semi-global alignment with statistics collection."""

    def __init__(self, gap_open: float = -11, gap_extend: float = -1):
        self.gap_open = gap_open
        self.gap_extend = gap_extend
        self.blosum_array, self.aa_to_idx = self._create_blosum_matrix()

    def _create_blosum_matrix(self):
        """Create BLOSUM62 substitution matrix."""
        blosum62 = bl.BLOSUM(62)
        amino_acids = 'ARNDCQEGHILKMFPSTWYV'
        blosum_array = np.zeros((20, 20), dtype=np.int8)

        for i, aa1 in enumerate(amino_acids):
            for j, aa2 in enumerate(amino_acids):
                blosum_array[i, j] = blosum62[aa1][aa2]

        aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}
        return blosum_array.astype(np.float32), aa_to_idx

    def _encode_sequence(self, seq: str) -> np.ndarray:
        """Encode amino acids to indices."""
        seq = seq.upper()
        encoded = np.zeros(len(seq), dtype=np.int8)
        for i, aa in enumerate(seq):
            if aa in self.aa_to_idx:
                encoded[i] = self.aa_to_idx[aa]
            else:
                encoded[i] = 0
        return encoded

    def _get_score(self, aa1: str, aa2: str) -> float:
        """Get BLOSUM62 score."""
        try:
            i = self.aa_to_idx[aa1.upper()]
            j = self.aa_to_idx[aa2.upper()]
            return float(self.blosum_array[i, j])
        except:
            return 0.0

    def align(self, seq1: str, seq2: str) -> Dict:
        """Perform semi-global alignment and return all statistics."""
        seq1 = seq1.upper()
        seq2 = seq2.upper()
        n, m = len(seq1), len(seq2)

        seq1_encoded = self._encode_sequence(seq1)
        seq2_encoded = self._encode_sequence(seq2)

        M, X, Y, score = align_sequences(
            seq1_encoded, seq2_encoded, self.blosum_array,
            self.gap_open, self.gap_extend
        )

        # Find end position
        max_row = max(M[n, :].max(), X[n, :].max(), Y[n, :].max())
        max_col = max(M[:, m].max(), X[:, m].max(), Y[:, m].max())

        if max_row >= max_col:
            end_i = n
            scores = np.maximum(np.maximum(M[n, :], X[n, :]), Y[n, :])
            end_j = np.argmax(scores)
        else:
            end_j = m
            scores = np.maximum(np.maximum(M[:, m], X[:, m]), Y[:, m])
            end_i = np.argmax(scores)

        # Traceback
        aln1, aln2 = self._traceback(seq1, seq2, M, X, Y, end_i, end_j)

        # Calculate statistics
        matches = 0
        mismatches = 0
        gaps_seq1 = 0
        gaps_seq2 = 0

        for i in range(len(aln1)):
            if aln1[i] == '-':
                gaps_seq1 += 1
            elif aln2[i] == '-':
                gaps_seq2 += 1
            elif aln1[i] == aln2[i]:
                matches += 1
            else:
                mismatches += 1

        aligned_length = len(aln1)
        aligned_positions = matches + mismatches

        if aligned_positions > 0:
            identity = matches / aligned_positions
            distance = 1.0 - identity
        else:
            identity = 0.0
            distance = 1.0

        return {
            'alignment_score': float(score),
            'seq1_length': n,
            'seq2_length': m,
            'aligned_length': aligned_length,
            'aligned_positions': aligned_positions,
            'matches': matches,
            'mismatches': mismatches,
            'gaps_seq1': gaps_seq1,
            'gaps_seq2': gaps_seq2,
            'identity': identity,
            'distance': distance,
            'alignment1': aln1,
            'alignment2': aln2
        }

    def _traceback(self, seq1: str, seq2: str, M: np.ndarray, X: np.ndarray,
                   Y: np.ndarray, end_i: int, end_j: int) -> Tuple[str, str]:
        """Traceback from end position to get alignment."""
        aln1, aln2 = "", ""
        i, j = end_i, end_j

        if abs(M[i, j] - max(M[i, j], X[i, j], Y[i, j])) < 1e-9:
            current = 'M'
        elif abs(X[i, j] - max(M[i, j], X[i, j], Y[i, j])) < 1e-9:
            current = 'X'
        else:
            current = 'Y'

        while i > 0 or j > 0:
            if i == 0 and j == 0:
                break

            if i == 0 or j == 0:
                break

            if current == 'M':
                aln1 = seq1[i-1] + aln1
                aln2 = seq2[j-1] + aln2
                char_score = self._get_score(seq1[i-1], seq2[j-1])

                if abs(M[i, j] - (char_score + M[i-1, j-1])) < 1e-9:
                    current = 'M'
                elif abs(M[i, j] - (char_score + X[i-1, j-1])) < 1e-9:
                    current = 'X'
                else:
                    current = 'Y'

                i -= 1
                j -= 1

            elif current == 'X':
                aln1 = seq1[i-1] + aln1
                aln2 = "-" + aln2

                if abs(X[i, j] - (M[i-1, j] + self.gap_open)) < 1e-9:
                    current = 'M'
                elif abs(X[i, j] - (X[i-1, j] + self.gap_extend)) < 1e-9:
                    current = 'X'
                i -= 1

            elif current == 'Y':
                aln1 = "-" + aln1
                aln2 = seq2[j-1] + aln2

                if abs(Y[i, j] - (M[i, j-1] + self.gap_open)) < 1e-9:
                    current = 'M'
                elif abs(Y[i, j] - (Y[i, j-1] + self.gap_extend)) < 1e-9:
                    current = 'Y'
                j -= 1

        return aln1, aln2


In [57]:
def read_fasta(filename: str) -> List[Tuple[str, str]]:
    """Read FASTA file."""
    sequences = []
    current_header = None
    current_seq = ""

    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_header is not None:
                    sequences.append((current_header, current_seq))
                current_header = line[1:]
                current_seq = ""
            else:
                current_seq += line

        if current_header is not None:
            sequences.append((current_header, current_seq))

    return sequences

In [58]:
def align_pair(pair_data, aligner):
    """Align pair of sequences."""
    idx_i, idx_j, seq_i, seq_j, header_i, header_j = pair_data
    result = aligner.align(seq_i, seq_j)
    return idx_i, idx_j, header_i, header_j, result

In [59]:
def calculate_all_alignments(fasta_file: str, output_csv: str, num_processes: int = None):
    """Calculate all pairwise alignments and save detailed statistics."""

    sequences = read_fasta(fasta_file)
    n = len(sequences)
    print(f"Loaded {n} sequences\n")

    aligner = SemiGlobalAligner(gap_open=-11, gap_extend=-1)

    # Prepare alignment pairs
    pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            pairs.append((
                i, j,
                sequences[i][1], sequences[j][1],
                sequences[i][0], sequences[j][0]
            ))

    print(f"Total alignments: {len(pairs):,}")

    if num_processes is None:
        num_processes = mp.cpu_count()

    print(f"Using {num_processes} processes\n")

    # Collect all results
    all_results = []
    align_func = partial(align_pair, aligner=aligner)

    start_time = time.time()
    with mp.Pool(processes=num_processes) as pool:
        results = pool.imap_unordered(align_func, pairs, chunksize=100)

        completed = 0
        for idx_i, idx_j, header_i, header_j, result in results:
            all_results.append({
                'seq1_index': idx_i,
                'seq2_index': idx_j,
                'seq1_header': header_i,
                'seq2_header': header_j,
                'alignment_score': result['alignment_score'],
                'seq1_length': result['seq1_length'],
                'seq2_length': result['seq2_length'],
                'aligned_length': result['aligned_length'],
                'aligned_positions': result['aligned_positions'],
                'matches': result['matches'],
                'mismatches': result['mismatches'],
                'gaps_seq1': result['gaps_seq1'],
                'gaps_seq2': result['gaps_seq2'],
                'identity': result['identity'],
                'distance': result['distance']
            })

            completed += 1
            if completed % 1000 == 0:
                elapsed = time.time() - start_time
                rate = completed / elapsed
                remaining = (len(pairs) - completed) / rate
                print(f"  {completed:,}/{len(pairs):,} ({remaining:.0f}s remaining)...")

    elapsed = time.time() - start_time
    print(f"\nCompleted in {elapsed:.1f} seconds\n")

    # Create DataFrame and save
    df = pd.DataFrame(all_results)
    df.to_csv(output_csv, index=False)

    print("="*80)
    print(f"Results saved to: {output_csv}")
    print(f"Total rows: {len(df):,}")
    print(f"Columns: {list(df.columns)}")
    print("="*80)

    return df

In [60]:
df = calculate_all_alignments(
    'hemoglobin_209_species_final_fasta.fasta',
    'alignment_statistics_detailed.csv',
    num_processes=4
)

print("\nStatistics Summary:")
print(df[['distance', 'identity', 'matches', 'gaps_seq1', 'gaps_seq2']].describe())

Loaded 209 sequences

Total alignments: 21,736
Using 4 processes

  1,000/21,736 (1614s remaining)...
  2,000/21,736 (1336s remaining)...
  3,000/21,736 (1340s remaining)...
  4,000/21,736 (1202s remaining)...
  5,000/21,736 (1180s remaining)...
  6,000/21,736 (1080s remaining)...
  7,000/21,736 (1043s remaining)...
  8,000/21,736 (950s remaining)...
  9,000/21,736 (895s remaining)...
  10,000/21,736 (811s remaining)...
  11,000/21,736 (750s remaining)...
  12,000/21,736 (675s remaining)...
  13,000/21,736 (605s remaining)...
  14,000/21,736 (530s remaining)...
  15,000/21,736 (466s remaining)...
  16,000/21,736 (393s remaining)...
  17,000/21,736 (326s remaining)...
  18,000/21,736 (255s remaining)...
  19,000/21,736 (187s remaining)...
  20,000/21,736 (118s remaining)...
  21,000/21,736 (50s remaining)...

Completed in 1457.2 seconds

Results saved to: alignment_statistics_detailed.csv
Total rows: 21,736
Columns: ['seq1_index', 'seq2_index', 'seq1_header', 'seq2_header', 'alignment_s

In [63]:
df = pd.read_csv('alignment_statistics_detailed.csv')

df.head()

Unnamed: 0,seq1_index,seq2_index,seq1_header,seq2_header,alignment_score,seq1_length,seq2_length,aligned_length,aligned_positions,matches,mismatches,gaps_seq1,gaps_seq2,identity,distance
0,0,101,2328020901 Antechinus flavipes | Dasyuromorphi...,2591575512 Erinaceus europaeus | Eulipotyphla ...,13.0,113,142,20,20,6,14,0,0,0.3,0.7
1,0,102,2328020901 Antechinus flavipes | Dasyuromorphi...,2076549982 Galemys pyrenaicus | Eulipotyphla |...,14.0,113,142,20,20,6,14,0,0,0.3,0.7
2,0,103,2328020901 Antechinus flavipes | Dasyuromorphi...,358002578 Neurotrichus gibbsii | Eulipotyphla ...,14.0,113,142,20,20,6,14,0,0,0.3,0.7
3,0,104,2328020901 Antechinus flavipes | Dasyuromorphi...,115502088 Ceratotherium simum | Perissodactyla...,19.0,113,142,20,20,7,13,0,0,0.35,0.65
4,0,105,2328020901 Antechinus flavipes | Dasyuromorphi...,2555152565 Diceros bicornis minor | Perissodac...,19.0,113,142,20,20,7,13,0,0,0.35,0.65
