In [66]:
!pip install biopython numba pandas numpy
!pip install blosum



In [67]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple, List
from numba import jit
import multiprocessing as mp
from functools import partial
import time
from Bio.Align import substitution_matrices


In [68]:
def make_blosum62() -> Tuple[np.ndarray, Dict[str, int], str]:
    amino_acids = "ARNDCQEGHILKMFPSTWYV"
    mat = substitution_matrices.load("BLOSUM62")
    arr = np.zeros((20, 20), dtype=np.float32)
    for i, a1 in enumerate(amino_acids):
        for j, a2 in enumerate(amino_acids):
            arr[i, j] = float(mat[(a1, a2)])
    aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}
    return arr, aa_to_idx, amino_acids

def encode_sequence(seq: str, aa_to_idx: Dict[str,int]) -> np.ndarray:
    seq = seq.upper()
    out = np.zeros(len(seq), dtype=np.int8)
    for i, ch in enumerate(seq):
        out[i] = aa_to_idx.get(ch, 0)
    return out


In [69]:
def read_fasta(filename: str):
    records = []
    with open(filename, "r") as f:
        header = None
        seq_chunks = []
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                if header is not None:
                    records.append((header, "".join(seq_chunks).upper()))
                header = line[1:].strip()
                seq_chunks = []
            else:
                seq_chunks.append(line)
        if header is not None:
            records.append((header, "".join(seq_chunks).upper()))
    return records


In [70]:
@jit(nopython=True)
def smith_waterman_dp(seq1_enc: np.ndarray,
                      seq2_enc: np.ndarray,
                      blosum: np.ndarray,
                      gap_open: float,
                      gap_extend: float) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float, int, int]:
    n, m = len(seq1_enc), len(seq2_enc)
    M = np.full((n+1, m+1), -np.inf, dtype=np.float32)
    X = np.full((n+1, m+1), -np.inf, dtype=np.float32)
    Y = np.full((n+1, m+1), -np.inf, dtype=np.float32)

    for j in range(m+1):
        M[0, j] = 0.0
        X[0, j] = -np.inf
        Y[0, j] = 0.0
    for i in range(n+1):
        M[i, 0] = 0.0
        X[i, 0] = 0.0
        Y[i, 0] = -np.inf

    best = 0.0
    bi = 0
    bj = 0

    for i in range(1, n+1):
        ai = seq1_enc[i-1]
        for j in range(1, m+1):
            bj_idx = seq2_enc[j-1]
            s = blosum[ai, bj_idx]

            X[i, j] = max(X[i-1, j] - gap_extend, M[i-1, j] - gap_open)
            Y[i, j] = max(Y[i, j-1] - gap_extend, M[i, j-1] - gap_open)

            M[i, j] = max(0.0, M[i-1, j-1] + s, X[i, j], Y[i, j])

            if M[i, j] > best:
                best = M[i, j]
                bi = i
                bj = j

    return M, X, Y, float(best), bi, bj


In [71]:
def sw_traceback(seq1: str, seq2: str,
                 M: np.ndarray, X: np.ndarray, Y: np.ndarray,
                 gap_open: float, gap_extend: float,
                 blosum: np.ndarray,
                 aa_to_idx: Dict[str,int],
                 start_i: int, start_j: int) -> Tuple[str, str]:
    i, j = start_i, start_j
    aln1, aln2 = [], []

    def s(a, b):
        return float(blosum[aa_to_idx.get(a, 0), aa_to_idx.get(b, 0)])

    while i > 0 and j > 0:
        if M[i, j] <= 0:
            break

        if abs(M[i, j] - (M[i-1, j-1] + s(seq1[i-1], seq2[j-1]))) < 1e-5:
            aln1.append(seq1[i-1])
            aln2.append(seq2[j-1])
            i -= 1
            j -= 1
            continue

        if abs(M[i, j] - X[i, j]) < 1e-5:
            aln1.append(seq1[i-1]); aln2.append('-'); i -= 1; continue
        if abs(M[i, j] - Y[i, j]) < 1e-5:
            aln1.append('-'); aln2.append(seq2[j-1]); j -= 1; continue

        aln1.append(seq1[i-1]); aln2.append(seq2[j-1]); i -= 1; j -= 1

    return ''.join(reversed(aln1)), ''.join(reversed(aln2))


In [72]:
class SmithWatermanAligner:
    def __init__(self, gap_open: float = 11.0, gap_extend: float = 1.0):
        self.gap_open = float(gap_open)
        self.gap_extend = float(gap_extend)
        self.blosum, self.aa_to_idx, self.aa_order = make_blosum62()

    def align(self, seq1: str, seq2: str) -> Dict[str, object]:
        s1, s2 = seq1.upper(), seq2.upper()
        e1, e2 = encode_sequence(s1, self.aa_to_idx), encode_sequence(s2, self.aa_to_idx)

        M, X, Y, best, bi, bj = smith_waterman_dp(e1, e2, self.blosum, self.gap_open, self.gap_extend)
        aln1, aln2 = sw_traceback(s1, s2, M, X, Y, self.gap_open, self.gap_extend, self.blosum, self.aa_to_idx, bi, bj)

        matches = sum(1 for a, b in zip(aln1, aln2) if a == b and a != '-' and b != '-')
        mismatches = sum(1 for a, b in zip(aln1, aln2) if a != b and a != '-' and b != '-')
        g1, g2 = aln1.count('-'), aln2.count('-')
        aligned_positions = matches + mismatches
        identity = (matches / aligned_positions) if aligned_positions > 0 else 0.0
        distance = 1.0 - identity

        return {
            "alignment_score": float(best),
            "seq1_length": len(seq1),
            "seq2_length": len(seq2),
            "aligned_length": len(aln1),
            "aligned_positions": aligned_positions,
            "matches": matches,
            "mismatches": mismatches,
            "gaps_seq1": g1,
            "gaps_seq2": g2,
            "identity": identity,
            "distance": distance,
            "alignment1": aln1,
            "alignment2": aln2
        }


In [73]:
def pairs_from_fasta_records(records: List[Tuple[str,str]]):
    N = len(records)
    pairs = []
    for i in range(N):
        h1, s1 = records[i]
        for j in range(i+1, N):
            h2, s2 = records[j]
            pairs.append((i, j, s1, s2, h1, h2))
    return pairs

def worker_align_fasta(pair, gap_open, gap_extend, blosum, aa_to_idx):
    i, j, s1, s2, h1, h2 = pair
    e1 = encode_sequence(s1, aa_to_idx)
    e2 = encode_sequence(s2, aa_to_idx)
    M, X, Y, best, bi, bj = smith_waterman_dp(e1, e2, blosum, gap_open, gap_extend)
    aln1, aln2 = sw_traceback(s1, s2, M, X, Y, gap_open, gap_extend, blosum, aa_to_idx, bi, bj)

    matches = sum(1 for a, b in zip(aln1, aln2) if a == b and a != '-' and b != '-')
    mm = sum(1 for a, b in zip(aln1, aln2) if a != b and a != '-' and b != '-')
    g1, g2 = aln1.count('-'), aln2.count('-')
    aligned_positions = matches + mm
    identity = (matches / aligned_positions) if aligned_positions > 0 else 0.0
    distance = 1.0 - identity

    return {
        "seq1_index": i, "seq2_index": j,
        "seq1_header": h1, "seq2_header": h2,
        "alignment_score": float(best),
        "seq1_length": len(s1), "seq2_length": len(s2),
        "aligned_length": len(aln1), "aligned_positions": aligned_positions,
        "matches": matches, "mismatches": mm,
        "gaps_seq1": g1, "gaps_seq2": g2,
        "identity": identity, "distance": distance,
        "alignment1": aln1, "alignment2": aln2
    }

def run_all_sw_fasta(fasta_path, processes=None, chunk=200, gap_open=11, gap_extend=1):
    blosum, aa_to_idx, _ = make_blosum62()
    records = read_fasta(fasta_path)
    print("Loaded", len(records), "sequences from FASTA")

    pairs = pairs_from_fasta_records(records)
    print("Total pairs:", len(pairs))

    if processes is None:
        processes = max(1, mp.cpu_count()-1)

    start = time.time()
    results = []
    with mp.Pool(processes=processes) as pool:
        func = partial(worker_align_fasta,
                       gap_open=gap_open, gap_extend=gap_extend,
                       blosum=blosum, aa_to_idx=aa_to_idx)
        for k, out in enumerate(pool.imap_unordered(func, pairs, chunksize=chunk), 1):
            results.append(out)
            if k % 1000 == 0:
                rate = k / max(time.time()-start, 1e-6)
                rem = (len(pairs)-k) / max(rate, 1e-6)
                print(f"{k:,}/{len(pairs):,}  ~{rem:.0f}s remaining")

    print(f"Done {len(results)} pairs in {time.time()-start:.1f}s")
    return pd.DataFrame(results)


In [74]:
fasta_path = "/content/sample_data/hemoglobin_209_species_final_fasta.fasta"
out_df = run_all_sw_fasta(fasta_path, processes=4, chunk=200, gap_open=11, gap_extend=1)
out_csv = "/content/sample_data/smith_waterman_alignments.csv"
out_df.to_csv(out_csv, index=False)
print("Saved:", out_csv, "Rows:", len(out_df))


Loaded 209 sequences from FASTA
Total pairs: 21736
1,000/21,736  ~81s remaining
2,000/21,736  ~46s remaining
3,000/21,736  ~32s remaining
4,000/21,736  ~24s remaining
5,000/21,736  ~20s remaining
6,000/21,736  ~16s remaining
7,000/21,736  ~14s remaining
8,000/21,736  ~12s remaining
9,000/21,736  ~11s remaining
10,000/21,736  ~9s remaining
11,000/21,736  ~8s remaining
12,000/21,736  ~7s remaining
13,000/21,736  ~6s remaining
14,000/21,736  ~5s remaining
15,000/21,736  ~4s remaining
16,000/21,736  ~4s remaining
17,000/21,736  ~3s remaining
18,000/21,736  ~2s remaining
19,000/21,736  ~2s remaining
20,000/21,736  ~1s remaining
21,000/21,736  ~0s remaining
Done 21736 pairs in 12.2s
Saved: /content/sample_data/smith_waterman_alignments.csv Rows: 21736


In [75]:

df = pd.read_csv('/content/sample_data/smith_waterman_alignments.csv')

df.head()

Unnamed: 0,seq1_index,seq2_index,seq1_header,seq2_header,alignment_score,seq1_length,seq2_length,aligned_length,aligned_positions,matches,mismatches,gaps_seq1,gaps_seq2,identity,distance,alignment1,alignment2
0,0,201,2328020901 Antechinus flavipes | Dasyuromorphi...,527461777 Ochotona cansus | Lagomorpha | Euarc...,25.0,113,142,13,13,4,9,0,0,0.307692,0.692308,DVISSAMHEFSKL,DALTQAVHHLDDL
1,0,202,2328020901 Antechinus flavipes | Dasyuromorphi...,218157240 Oryctolagus cuniculus | Lagomorpha |...,33.0,113,142,31,31,12,19,0,0,0.387097,0.612903,EQERAMQKLQEELRTSASPFLDKYRAFLKTL,EQIKAHGKKVSEALTKAVGHLDDLPGALSTL
2,0,203,2328020901 Antechinus flavipes | Dasyuromorphi...,365177636 Sylvilagus floridanus | Lagomorpha |...,28.0,113,142,31,31,11,20,0,0,0.354839,0.645161,EQERAMQKLQEELRTSASPFLDKYRAFLKTL,EQVKAHGKKVAEALTKAVNHLDDLPTALSNL
3,0,204,2328020901 Antechinus flavipes | Dasyuromorphi...,530539693 Ochotona collaris | Lagomorpha | Eua...,26.0,113,142,20,20,6,14,0,0,0.3,0.7,ELRTSASPFLDKYRAFLKTL,EFNPAVHASLDKFLASVSTV
4,0,205,2328020901 Antechinus flavipes | Dasyuromorphi...,530539659 Ochotona hyperborea | Lagomorpha | E...,24.0,113,142,15,13,7,6,0,2,0.538462,0.461538,ASPFLDKYRAFLKTL,ASP--DKFLASVSTV


In [76]:
# Validation cell: independent Smithâ€“Waterman spot-check with Biopython
import pandas as pd, numpy as np
from Bio import pairwise2
from Bio.Align import substitution_matrices
from Bio import SeqIO

sw_csv = "/content/sample_data/smith_waterman_alignments.csv"  # adjust if needed
fasta_path = "/content/sample_data/hemoglobin_209_species_final_fasta.fasta"

# Load data
sw = pd.read_csv(sw_csv)
records = list(SeqIO.parse(fasta_path, "fasta"))
idx_to_seq = {i: str(rec.seq).upper() for i, rec in enumerate(records)}

# Sample rows
np.random.seed(42)
k = min(50, len(sw))
sample = sw.sample(n=k, replace=False).reset_index(drop=True)

# Config
gap_open, gap_extend = 11, 1
mat = substitution_matrices.load("BLOSUM62")

def sw_bio_metrics(s1, s2):
    aln = pairwise2.align.localds(s1, s2, mat, -gap_open, -gap_extend, one_alignment_only=True)
    if not aln:
        return np.nan, np.nan
    a = aln[0]
    a1, a2 = a.seqA, a.seqB
    matches = sum(1 for x, y in zip(a1, a2) if x == y and x != '-' and y != '-')
    subs = sum(1 for x, y in zip(a1, a2) if x != y and x != '-' and y != '-')
    ident = matches / max(1, matches + subs)
    return float(a.score), float(ident)

bio_scores, bio_idents = [], []
for _, r in sample.iterrows():
    s1 = idx_to_seq[int(r["seq1_index"])]
    s2 = idx_to_seq[int(r["seq2_index"])]
    sc, idt = sw_bio_metrics(s1, s2)
    bio_scores.append(sc); bio_idents.append(idt)

sample["score_bio"] = bio_scores
sample["identity_bio"] = bio_idents

# Report
print(sample[["alignment_score","score_bio","identity","identity_bio"]].head(10))
print("Score Pearson r:", sample["alignment_score"].corr(sample["score_bio"]))
print("Identity MAE:", (sample["identity"] - sample["identity_bio"]).abs().mean())
print("Identity max abs diff:", (sample["identity"] - sample["identity_bio"]).abs().max())


   alignment_score  score_bio  identity  identity_bio
0            545.0      545.0  0.739437      0.739437
1             23.0       23.0  0.285714      0.095238
2            549.0      549.0  0.751773      0.751773
3            619.0      619.0  0.836879      0.836879
4            572.0      572.0  0.798507      0.753521
5            588.0      588.0  0.836879      0.836879
6            624.0      624.0  0.836879      0.836879
7             24.0       24.0  0.344828      0.120690
8            630.0      630.0  0.852113      0.852113
9            353.0      353.0  0.666667      0.654545
Score Pearson r: 1.0
Identity MAE: 0.062004355276953686
Identity max abs diff: 0.3939393939393939
