In [13]:
!pip install blosum



In [14]:
!pip install Bio

Collecting Bio
  Downloading bio-1.8.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.8.1-py3-none-any.whl (321 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.3/321.3 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading g

In [15]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple, List
import blosum as bl
from numba import jit
import multiprocessing as mp
from functools import partial
import time

In [16]:
@jit(nopython=True)

def align_sequences(seq1_encoded, seq2_encoded, blosum_array, gap_open, gap_extend):
    """Semi-global alignment - returns matrices and score."""
    n = len(seq1_encoded)
    m = len(seq2_encoded)

    M = np.full((n + 1, m + 1), -np.inf, dtype=np.float32)
    X = np.full((n + 1, m + 1), -np.inf, dtype=np.float32)
    Y = np.full((n + 1, m + 1), -np.inf, dtype=np.float32)

    M[0, 0] = 0.0
    X[0, 0] = 0.0
    Y[0, 0] = 0.0

    for j in range(1, m + 1):
        M[0, j] = 0.0
        X[0, j] = -np.inf
        Y[0, j] = 0.0

    for i in range(1, n + 1):
        M[i, 0] = 0.0
        X[i, 0] = 0.0
        Y[i, 0] = -np.inf

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            char_score = blosum_array[seq1_encoded[i-1], seq2_encoded[j-1]]
            M[i, j] = char_score + max(M[i-1, j-1], X[i-1, j-1], Y[i-1, j-1])
            X[i, j] = max(M[i-1, j] + gap_open, X[i-1, j] + gap_extend)
            Y[i, j] = max(M[i, j-1] + gap_open, Y[i, j-1] + gap_extend)

    max_row = max(M[n, :].max(), X[n, :].max(), Y[n, :].max())
    max_col = max(M[:, m].max(), X[:, m].max(), Y[:, m].max())
    score = max(max_row, max_col)

    return M, X, Y, score


In [17]:
from Bio.Align import substitution_matrices
import numpy as np


class SemiGlobalAligner:
    """Semi-global alignment with normalized distance statistics."""

    def __init__(self, gap_open: float = -11, gap_extend: float = -1):
        self.gap_open = gap_open
        self.gap_extend = gap_extend
        self.blosum_array, self.aa_to_idx = self._create_blosum_matrix()

    def _create_blosum_matrix(self):
          """Create BLOSUM62 substitution matrix."""

          # --- FIX STARTS HERE ---
          # Load directly from the imported module
          blosum62 = substitution_matrices.load("BLOSUM62")
          # --- FIX ENDS HERE ---

          amino_acids = 'ARNDCQEGHILKMFPSTWYV'
          blosum_array = np.zeros((20, 20), dtype=np.float32)

          for i, aa1 in enumerate(amino_acids):
              for j, aa2 in enumerate(amino_acids):
                  # Modern Biopython matrices handle [aa1][aa2] access natively
                  # so this try/except block will work, though it is less necessary now.
                  try:
                      val = blosum62[aa1][aa2]
                  except (KeyError, IndexError, TypeError):
                      # Fallback for older Biopython versions or specific matrix formats
                      try:
                          val = blosum62[(aa1, aa2)]
                      except KeyError:
                          # Handle symmetry if key is missing (e.g. ('A', 'W') vs ('W', 'A'))
                          val = blosum62.get((aa2, aa1), -4.0)

                  blosum_array[i, j] = val

          aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}
          return blosum_array, aa_to_idx

    def _encode_sequence(self, seq: str) -> np.ndarray:
        """Encode amino acids to indices."""
        seq = seq.upper()
        encoded = np.zeros(len(seq), dtype=np.int8)
        for i, aa in enumerate(seq):
            if aa in self.aa_to_idx:
                encoded[i] = self.aa_to_idx[aa]
            else:
                # Handle unknown amino acids (e.g., 'X', 'Z') gracefully
                # Default to 0 or -1 depending on your C implementation preference
                encoded[i] = 0
        return encoded

    def _get_score(self, aa1: str, aa2: str) -> float:
        """Get BLOSUM62 score."""
        try:
            i = self.aa_to_idx[aa1.upper()]
            j = self.aa_to_idx[aa2.upper()]
            return float(self.blosum_array[i, j])
        except:
            return 0.0

    def align(self, seq1: str, seq2: str) -> Dict:
        """Perform semi-global alignment and return all statistics."""
        seq1 = seq1.upper()
        seq2 = seq2.upper()
        n, m = len(seq1), len(seq2)

        seq1_encoded = self._encode_sequence(seq1)
        seq2_encoded = self._encode_sequence(seq2)

        # NOTE: Ensure your 'align_sequences' function is imported and available!
        M, X, Y, score = align_sequences(
            seq1_encoded, seq2_encoded, self.blosum_array,
            self.gap_open, self.gap_extend
        )

        # Find end position
        # For Semi-Global, we look for max in the last row (n, :) or last col (:, m)
        max_row_val = max(M[n, :].max(), X[n, :].max(), Y[n, :].max())
        max_col_val = max(M[:, m].max(), X[:, m].max(), Y[:, m].max())

        if max_row_val >= max_col_val:
            end_i = n
            # Find best j in the last row
            scores = np.maximum(np.maximum(M[n, :], X[n, :]), Y[n, :])
            end_j = np.argmax(scores)
        else:
            end_j = m
            # Find best i in the last column
            scores = np.maximum(np.maximum(M[:, m], X[:, m]), Y[:, m])
            end_i = np.argmax(scores)

        # Traceback
        aln1, aln2 = self._traceback(seq1, seq2, M, X, Y, end_i, end_j)

        # ---------------------------------------------------------
        # NEW METRIC LOGIC START
        # ---------------------------------------------------------

        matches = 0
        mismatches = 0
        gaps_seq1 = 0
        gaps_seq2 = 0

        for i in range(len(aln1)):
            if aln1[i] == '-':
                gaps_seq1 += 1
            elif aln2[i] == '-':
                gaps_seq2 += 1
            elif aln1[i] == aln2[i]:
                matches += 1
            else:
                mismatches += 1

        # Calculate Normalization Factor (Min Original Length)
        # This penalizes short, perfect local alignments.
        # aln1/aln2 contain gaps, so we strip them to get rough original count,
        # OR better, use n and m passed in at the start.
        min_original_len = min(n, m)

        aligned_length = len(aln1)

        # Standard "Aligned Positions" (Matches + Mismatches only)
        # Kept for reference, but NOT used for distance anymore.
        aligned_positions = matches + mismatches

        if min_original_len > 0:
            # Identity relative to the SHORTER sequence
            identity = matches / min_original_len
            distance = 1.0 - identity
        else:
            identity = 0.0
            distance = 1.0

        # Sanity check: Distance cannot be negative
        # (possible if matches > min_len due to weird overlap logic, though unlikely here)
        distance = max(0.0, distance)

        # ---------------------------------------------------------
        # NEW METRIC LOGIC END
        # ---------------------------------------------------------

        return {
            'alignment_score': float(score),
            'seq1_length': n,
            'seq2_length': m,
            'aligned_length': aligned_length,
            'aligned_positions': aligned_positions,
            'matches': matches,
            'mismatches': mismatches,
            'gaps_seq1': gaps_seq1,
            'gaps_seq2': gaps_seq2,
            'identity': identity,
            'distance': distance,
            'alignment1': aln1,
            'alignment2': aln2
        }

    def _traceback(self, seq1: str, seq2: str, M: np.ndarray, X: np.ndarray,
                   Y: np.ndarray, end_i: int, end_j: int) -> Tuple[str, str]:
        """Traceback from end position to get alignment."""
        aln1, aln2 = "", ""
        i, j = end_i, end_j

        # Initial state determination
        if abs(M[i, j] - max(M[i, j], X[i, j], Y[i, j])) < 1e-9:
            current = 'M'
        elif abs(X[i, j] - max(M[i, j], X[i, j], Y[i, j])) < 1e-9:
            current = 'X'
        else:
            current = 'Y'

        # MODIFIED LOOP CONDITION for Semi-Global
        # Continue as long as we haven't hit the boundary (0 index)
        # For global, we go until i=0 AND j=0.
        # For semi-global, we might hit i=0 while j>0 (gap at start).
        while i > 0 and j > 0:

            if current == 'M':
                aln1 = seq1[i-1] + aln1
                aln2 = seq2[j-1] + aln2
                char_score = self._get_score(seq1[i-1], seq2[j-1])

                # Floating point comparison
                if abs(M[i, j] - (char_score + M[i-1, j-1])) < 1e-9:
                    current = 'M'
                elif abs(M[i, j] - (char_score + X[i-1, j-1])) < 1e-9:
                    current = 'X'
                else:
                    current = 'Y'
                i -= 1
                j -= 1

            elif current == 'X':
                aln1 = seq1[i-1] + aln1
                aln2 = "-" + aln2
                if abs(X[i, j] - (M[i-1, j] + self.gap_open)) < 1e-9:
                    current = 'M'
                elif abs(X[i, j] - (X[i-1, j] + self.gap_extend)) < 1e-9:
                    current = 'X'
                i -= 1

            elif current == 'Y':
                aln1 = "-" + aln1
                aln2 = seq2[j-1] + aln2
                if abs(Y[i, j] - (M[i, j-1] + self.gap_open)) < 1e-9:
                    current = 'M'
                elif abs(Y[i, j] - (Y[i, j-1] + self.gap_extend)) < 1e-9:
                    current = 'Y'
                j -= 1

        # Semi-Global Finish: If we hit a wall (i=0 or j=0), fill the rest with gaps
        # This handles the "unaligned" start parts
        while i > 0:
            aln1 = seq1[i-1] + aln1
            aln2 = "-" + aln2
            i -= 1
        while j > 0:
            aln1 = "-" + aln1
            aln2 = seq2[j-1] + aln2
            j -= 1

        return aln1, aln2

In [18]:
def read_fasta(filename: str) -> List[Tuple[str, str]]:
    """Read FASTA file."""
    sequences = []
    current_header = None
    current_seq = ""

    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_header is not None:
                    sequences.append((current_header, current_seq))
                current_header = line[1:]
                current_seq = ""
            else:
                current_seq += line

        if current_header is not None:
            sequences.append((current_header, current_seq))

    return sequences

In [19]:
def align_pair(pair_data, aligner):
    """Align pair of sequences."""
    idx_i, idx_j, seq_i, seq_j, header_i, header_j = pair_data
    result = aligner.align(seq_i, seq_j)
    return idx_i, idx_j, header_i, header_j, result

In [20]:
def parse_header(header: str):
    """
    Parse a header like:
    '2328020901 Antechinus flavipes | Dasyuromorphia | Marsupials'
    into (protein_id, species, order, superorder).
    """
    if header is None:
        return "", "", "", ""

    parts = [p.strip() for p in header.split("|")]
    if len(parts) != 3:
        # Fallback if format is unexpected
        return header.strip(), "", "", ""

    left, order, superorder = parts
    left = left.strip()

    # First token = protein_id, everything after = species name
    first_space = left.find(" ")
    if first_space == -1:
        protein_id = left
        species = ""
    else:
        protein_id = left[:first_space].strip()
        species = left[first_space + 1:].strip()

    return protein_id, species, order, superorder

def calculate_all_alignments(fasta_file: str, output_csv: str, num_processes: int = None):
    """Calculate all pairwise alignments and save detailed statistics."""

    sequences = read_fasta(fasta_file)
    n = len(sequences)
    print(f"Loaded {n} sequences\n")

    # Your semi-global aligner
    aligner = SemiGlobalAligner(gap_open=-11, gap_extend=-1)

    # --- Load metadata to get species/order/superorder like NW_Global ---
    meta = pd.read_csv("hemoglobin_209_species_final.csv")
    meta["protein_id"] = meta["protein_id"].astype(str)
    meta = meta.set_index("protein_id")

    # Prepare alignment pairs
    pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            pairs.append((
                i, j,
                sequences[i][1], sequences[j][1],
                sequences[i][0], sequences[j][0]
            ))

    print(f"Total alignments: {len(pairs):,}")

    if num_processes is None:
        num_processes = mp.cpu_count()

    print(f"Using {num_processes} processes\n")

    # Collect all results
    all_results = []
    align_func = partial(align_pair, aligner=aligner)

    start_time = time.time()
    with mp.Pool(processes=num_processes) as pool:
        results = pool.imap_unordered(align_func, pairs, chunksize=100)

        completed = 0
        for idx_i, idx_j, header_i, header_j, result in results:
            # header_i / header_j are full FASTA headers.
            # Parse them into (protein_id, species, order, superorder)
            pid1, species1_h, order1_h, superorder1_h = parse_header(str(header_i))
            pid2, species2_h, order2_h, superorder2_h = parse_header(str(header_j))

            # Start with values from header
            species_1, order_1, superorder_1 = species1_h, order1_h, superorder1_h
            species_2, order_2, superorder_2 = species2_h, order2_h, superorder2_h

            # If metadata CSV has more authoritative info, override from meta
            if pid1 in meta.index:
                row1 = meta.loc[pid1]
                species_1 = row1["species"]
                order_1 = row1["order"]
                superorder_1 = row1["superorder"]

            if pid2 in meta.index:
                row2 = meta.loc[pid2]
                species_2 = row2["species"]
                order_2 = row2["order"]
                superorder_2 = row2["superorder"]

            # --- Only code1-style fields here ---
            all_results.append({
                  "protein_id_1": pid1,
                  "protein_id_2": pid2,
                  "species_1": species_1,
                  "species_2": species_2,
                  "order_1": order_1,
                  "order_2": order_2,
                  "superorder_1": superorder_1,
                  "superorder_2": superorder_2,
                  "matrix": "BLOSUM62",
                  "gap_open": abs(aligner.gap_open),
                  "gap_extend": abs(aligner.gap_extend),
                  "score": float(result["alignment_score"]),
                  "aln1": result["alignment1"],
                  "aln2": result["alignment2"],
                  "identity": result["identity"],
                  "distance": result["distance"],
                  "alignment_length": result["aligned_length"],
                  "aligned_non_gap_positions": result["aligned_positions"],
                  "matches": result["matches"],
                  "gaps_seq1": result["gaps_seq1"],
                  "gaps_seq2": result["gaps_seq2"],
                  "mismatches": result["mismatches"],
              })

            completed += 1
            if completed % 1000 == 0:
                elapsed = time.time() - start_time
                rate = completed / elapsed
                remaining = (len(pairs) - completed) / rate
                print(f"  {completed:,}/{len(pairs):,} ({remaining:.0f}s remaining)...")

    elapsed = time.time() - start_time
    print(f"\nCompleted in {elapsed:.1f} seconds\n")

    # Create DataFrame and enforce column order to match code1 exactly
    df = pd.DataFrame(all_results)

    code1_cols = [
        "protein_id_1", "protein_id_2",
        "species_1", "species_2",
        "order_1", "order_2",
        "superorder_1", "superorder_2",
        "matrix", "gap_open", "gap_extend",
        "score", "aln1", "aln2",
        "identity", "distance",
        "alignment_length",
        "aligned_non_gap_positions",
        "matches",
        "gaps_seq1", "gaps_seq2", "mismatches"
    ]
    df = df[code1_cols]

    df.to_csv(output_csv, index=False)

    print("="*80)
    print(f"Results saved to: {output_csv}")
    print(f"Total rows: {len(df):,}")
    print(f"Columns: {list(df.columns)}")
    print("="*80)

    return df


In [21]:
df = calculate_all_alignments(
    'hemoglobin_209_species_final_fasta.fasta',
    'alignment_statistics_detailed.csv',
    num_processes=4
)

print("\nStatistics Summary:")
print(df[['distance', 'identity', 'matches', 'gaps_seq1', 'gaps_seq2']].describe())

Loaded 209 sequences

Total alignments: 21,736
Using 4 processes

  1,000/21,736 (293s remaining)...
  2,000/21,736 (147s remaining)...
  3,000/21,736 (99s remaining)...
  4,000/21,736 (74s remaining)...
  5,000/21,736 (60s remaining)...
  6,000/21,736 (51s remaining)...
  7,000/21,736 (44s remaining)...
  8,000/21,736 (36s remaining)...
  9,000/21,736 (31s remaining)...
  10,000/21,736 (27s remaining)...
  11,000/21,736 (23s remaining)...
  12,000/21,736 (20s remaining)...
  13,000/21,736 (17s remaining)...
  14,000/21,736 (14s remaining)...
  15,000/21,736 (12s remaining)...
  16,000/21,736 (9s remaining)...
  17,000/21,736 (8s remaining)...
  18,000/21,736 (6s remaining)...
  19,000/21,736 (4s remaining)...
  20,000/21,736 (2s remaining)...
  21,000/21,736 (1s remaining)...

Completed in 29.4 seconds

Results saved to: alignment_statistics_detailed.csv
Total rows: 21,736
Columns: ['protein_id_1', 'protein_id_2', 'species_1', 'species_2', 'order_1', 'order_2', 'superorder_1', 'supero

In [22]:
df = pd.read_csv('alignment_statistics_detailed.csv')

df.head()

Unnamed: 0,protein_id_1,protein_id_2,species_1,species_2,order_1,order_2,superorder_1,superorder_2,matrix,gap_open,...,aln1,aln2,identity,distance,alignment_length,aligned_non_gap_positions,matches,gaps_seq1,gaps_seq2,mismatches
0,2328020901,122394,Antechinus flavipes,Dasyurus viverrinus,Dasyuromorphia,Dasyuromorphia,Marsupials,Marsupials,BLOSUM62,11,...,----------------------------------------------...,MVLSDADKTHVKAIWGKVGGHAGAYAAEALARTFLSFPTTKTYFPH...,0.132743,0.867257,144,65,15,77,2,50
1,2328020901,395515590,Antechinus flavipes,Sarcophilus harrisii,Dasyuromorphia,Dasyuromorphia,Marsupials,Marsupials,BLOSUM62,11,...,MSSLLFLLSDAQTKLEKESREMALLQSNQDVISSAMHEFSKLLDQQ...,---------------------MALSAADK----CNVREFWEKLG--...,0.20354,0.79646,114,83,23,1,30,60
2,2328020901,2990525618,Antechinus flavipes,Sminthopsis crassicaudata,Dasyuromorphia,Dasyuromorphia,Marsupials,Marsupials,BLOSUM62,11,...,MSSLLFLLSDAQTKLEKESREMALLQSNQDVISSAMHEFSKLLDQQ...,MPFLLFLLSDTQTKLEKESREMALLQSNQDVISSAMQEFSKLLDQQ...,0.929204,0.070796,113,113,105,0,0,8
3,2328020901,122395,Antechinus flavipes,Didelphis virginiana,Dasyuromorphia,Didelphimorphia,Marsupials,Marsupials,BLOSUM62,11,...,----------------------------------------------...,VLSANDKTNVKGAWSKVGGNSGAYMGEALYRTFLSFPTTKTYFPNY...,0.044248,0.955752,141,20,5,121,0,15
4,2328020901,2119467502,Antechinus flavipes,Gracilinanus agilis,Dasyuromorphia,Didelphimorphia,Marsupials,Marsupials,BLOSUM62,11,...,----------------------------------------------...,MVLSGADKNNVKAAWSKVGGNSGAYMSEALHRTFLSFPTTKTYFPH...,0.044248,0.955752,142,20,5,122,0,15


In [23]:
import pandas as pd

# 1. Load the original semi-global results
df = pd.read_csv("alignment_statistics_detailed.csv")

# 2. Helper: parse FASTA header into (protein_id, species, order, superorder)
#    Adjust this if your header format is different.
#
#    Assumed header format:
#       protein_id|species|order|superorder
#    e.g. >HBA_HUMAN|Homo sapiens|Primates|Euarchontoglires
def parse_header(header: str):
    header = str(header).lstrip(">")
    parts = header.split("|")
    protein_id  = parts[0].strip() if len(parts) > 0 else ""
    species     = parts[1].strip() if len(parts) > 1 else ""
    order       = parts[2].strip() if len(parts) > 2 else ""
    superorder  = parts[3].strip() if len(parts) > 3 else ""
    return protein_id, species, order, superorder

# 3. Parse both headers
prot1, spec1, ord1, sup1 = zip(*df["seq1_header"].map(parse_header))
prot2, spec2, ord2, sup2 = zip(*df["seq2_header"].map(parse_header))

# 4. Build a new DataFrame with the SAME format as NW_Global
new_df = pd.DataFrame({
    "protein_id_1": prot1,
    "protein_id_2": prot2,
    "species_1": spec1,
    "species_2": spec2,
    "order_1": ord1,
    "order_2": ord2,
    "superorder_1": sup1,
    "superorder_2": sup2,
    # Assuming you used a single matrix + gap scheme in Semi-global:
    "matrix": "BLOSUM62",        # change if you used a different matrix
    "gap_open": -11,             # from SemiGlobalAligner(gap_open=-11, gap_extend=-1)
    "gap_extend": -1,
    "score": df["alignment_score"],
    # If you stored aligned sequences in the result dict, map them here.
    # If not available, leave as empty strings or NaN.
    "aln1": "",                  # or df["aln1"] if you have it
    "aln2": "",                  # or df["aln2"] if you have it
    "identity": df["identity"],
    "distance": df["distance"],
    "alignment_length": df["aligned_length"],
    "aligned_non_gap_positions": df["aligned_positions"],
    "matches": df["matches"],
})

# 5. Overwrite the CSV with the NW_Global-like format
new_df.to_csv("alignment_statistics_detailed.csv", index=False)
print("Rewritten alignment_statistics_detailed.csv with NW_Global column format.")
print(new_df.head())


KeyError: 'seq1_header'