# In [5]:
from typing import Iterator, Tuple

# Path to the whole-genome FASTA file that is passed to count_genome_contexts below.
FASTA = "/work4/Genome/Homo_sapiens/hg19/Sequence/WholeGenomeFasta/genome.fa"

def fasta_iter(path: str) -> Iterator[str]:
    """Stream the records of a FASTA file as plain sequence strings.

    Every line is stripped of surrounding whitespace; blank lines are
    ignored. Lines starting with ``>`` act purely as record separators
    (the header text itself is discarded). All sequence lines between two
    separators are concatenated and yielded as one string. Records that
    contain no sequence lines yield nothing.
    """
    pieces = []
    with open(path, "r") as handle:
        for raw in handle:
            text = raw.strip()
            if text.startswith(">"):
                # A header closes the record collected so far, if any.
                if pieces:
                    yield "".join(pieces)
                    pieces = []
            elif text:
                pieces.append(text)
        # Flush the last record, which has no header after it.
        if pieces:
            yield "".join(pieces)

# Complement table for DNA including the IUPAC ambiguity codes, built once
# at import time instead of on every revcomp() call.
_COMPLEMENT_TABLE = str.maketrans(
    "ACGTRYSWKMBDHVNacgtryswkmbdhvn",
    "TGCAYRSWMKVHDBNtgcayrswmkvhdbn",
)

def revcomp(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    A/C/G/T and the IUPAC ambiguity codes (R/Y, K/M, B/V, D/H, plus the
    self-complementary S, W and N) are complemented, with case preserved.
    Any other character is kept as-is and only reversed in position.
    """
    return seq.translate(_COMPLEMENT_TABLE)[::-1]

def count_contexts_one_strand(seq: str, skip_n: bool = True) -> Tuple[int, int, int, int]:
    """
    Count cytosine contexts CG/CHG/CHH on one strand (C-based).

    Each C in ``seq`` (case-insensitive) is classified by the bases that
    follow it:

    * CG  - the next base is G (the third base is irrelevant, so a CG at
      the very end of the sequence or a CG followed by N still counts);
    * CHG - the next base is not G and the one after it is G;
    * CHH - neither of the next two bases is G.

    A non-CG cytosine with fewer than two following bases cannot be
    classified and is not counted anywhere.

    Args:
        seq: Sequence of the strand to scan, read left to right.
        skip_n: When true, a non-CG cytosine whose next two bases include
            an N is not classified and is tallied as skipped instead.
            When false, N is treated like any other non-G base.

    Returns:
        (CG, CHG, CHH, skipped_due_to_N)
    """
    seq = seq.upper()
    cg = chg = chh = skipped_n = 0
    last = len(seq) - 1

    # Jump from cytosine to cytosine with str.find instead of testing
    # every base in Python. A C at the final index has no context at all.
    pos = seq.find("C")
    while pos != -1 and pos < last:
        b1 = seq[pos + 1]
        if b1 == "G":
            # CG only needs two bases, so decide it before looking at b2.
            cg += 1
        elif pos + 2 <= last:
            b2 = seq[pos + 2]
            if skip_n and (b1 == "N" or b2 == "N"):
                skipped_n += 1
            elif b2 == "G":
                chg += 1
            else:
                chh += 1
        pos = seq.find("C", pos + 1)
    return cg, chg, chh, skipped_n

def count_genome_contexts(fasta_path: str, both_strands: bool = False, skip_n: bool = True):
    """Sum CG/CHG/CHH context counts over every record of a FASTA file.

    Args:
        fasta_path: Path handed to ``fasta_iter``.
        both_strands: When true, each record is also counted on its
            reverse complement and both tallies are added together.
        skip_n: Forwarded to ``count_contexts_one_strand``.

    Returns:
        A dict with keys ``mode``, ``CG``, ``CHG``, ``CHH``, ``Total``
        (sum of the three context counts) and ``Skipped_N_contexts``
        (reported as 0 when ``skip_n`` is false).
    """
    # Running totals in the order returned by count_contexts_one_strand:
    # CG, CHG, CHH, skipped-because-of-N.
    tally = [0, 0, 0, 0]
    passes = 2 if both_strands else 1

    for record in fasta_iter(fasta_path):
        for strand_idx in range(passes):
            # Pass 0 is the sequence as stored; pass 1 its reverse complement.
            strand = revcomp(record) if strand_idx else record
            counts = count_contexts_one_strand(strand, skip_n=skip_n)
            tally = [acc + inc for acc, inc in zip(tally, counts)]

    n_cg, n_chg, n_chh, n_skipped = tally
    if both_strands:
        mode = "both_strands"
    else:
        mode = "forward_only"

    summary = {
        "mode": mode,
        "CG": n_cg,
        "CHG": n_chg,
        "CHH": n_chh,
        "Total": n_cg + n_chg + n_chh,
        "Skipped_N_contexts": n_skipped if skip_n else 0,
    }
    return summary

# Run: count contexts on the forward strand only.
res = count_genome_contexts(FASTA, both_strands=False, skip_n=True)
# Bare expression: shown as the cell result when run in a notebook/REPL.
res

# Output recorded from the forward-only run above:
# {'mode': 'forward_only',
#  'CG': 28217444,
#  'CHG': 123851308,
#  'CHH': 432948952,
#  'Total': 585017704,
#  'Skipped_N_contexts': 240}