# 01_data_preparation.ipynb — Neem cpDNA (KF986530.1)

# Cell 0 — perf env (paste first, then restart kernel once)

In [1]:
# Configure BLAS thread counts early to stabilize performance across runs
import os
os.environ.setdefault("OMP_NUM_THREADS", "8")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "8")
os.environ.setdefault("MKL_NUM_THREADS", "8")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "8")
print("BLAS threads:",
      os.environ.get("OMP_NUM_THREADS"),
      os.environ.get("OPENBLAS_NUM_THREADS"),
      os.environ.get("MKL_NUM_THREADS"),
      os.environ.get("NUMEXPR_NUM_THREADS"))

# Lightweight phase timing utility for coarse profiling of notebook stages
import time, json
from contextlib import contextmanager
from collections import defaultdict
class PhaseTimer:
    def __init__(self): self.t = defaultdict(float)
    @contextmanager
    def timed(self, key):
        t0 = time.perf_counter()
        yield
        self.t[key] += time.perf_counter() - t0
    def add(self, key, seconds): self.t[key] += seconds
    def to_dict(self): return dict(self.t)

def pretty_seconds(sec):
    # Present durations in human‑friendly units
    return f"{sec/60:.1f} min" if sec >= 60 else f"{sec:.1f} s"

BLAS threads: 8 8 8 8


# Cell 1 — setup & folders

In [2]:
# Core imports and library setup
from pathlib import Path
import json, os, warnings, sys, time
import numpy as np
import pandas as pd
from Bio import Entrez, SeqIO

warnings.filterwarnings("ignore")
np.random.seed(123)

# Define key project directories and ensure they exist
ROOT = Path(".")
DATA = ROOT / "data"
RAW = DATA / "raw"
PROCESSED = DATA / "processed"
RESULTS = ROOT / "results"
for p in [RAW, PROCESSED, RESULTS]:
    p.mkdir(parents=True, exist_ok=True)

# Runtime configuration (override with environment variables if desired)
ACCESSION = os.environ.get("NCBI_ACCESSION", "KF986530.1")
EMAIL = os.environ.get("NCBI_EMAIL", "you@example.com")  # Supply a real email to comply with NCBI policy
Entrez.email = EMAIL

# Sliding window parameters for sequence segmentation
W = int(os.environ.get("WINDOW_LEN", "256"))
STRIDE = int(os.environ.get("WINDOW_STRIDE", "128"))

GB_PATH = RAW / f"{ACCESSION}.gb"
print("Folders ready:", RAW, PROCESSED, RESULTS, sep=" | ")
print("Accession:", ACCESSION, "| Email:", EMAIL)
print("Window config:", W, STRIDE)
print("GenBank path:", GB_PATH)

Folders ready: | data\raw | data\processed | results
Accession: KF986530.1 | Email: you@example.com
Window config: 256 128
GenBank path: data\raw\KF986530.1.gb


# Cell 2 — fetch GenBank (cached + retries)

In [3]:
# Download (with caching + simple retry) the GenBank record for the target accession
# Returns the local path to the .gb file (reuses existing non‑empty file).
def fetch_genbank(accession: str, out_file: Path, retries=3, delay=2.0) -> Path:
    if out_file.exists() and out_file.stat().st_size > 0:
        print("✅ Using cached:", out_file)
        return out_file
    last_err = None
    for k in range(retries):
        try:
            with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
                text = handle.read()
            if not text.strip():
                raise RuntimeError("Empty GenBank response")
            out_file.write_text(text)
            print("⬇️  Downloaded:", out_file)
            return out_file
        except Exception as e:
            last_err = e
            print(f"! fetch error (attempt {k+1}/{retries}): {e}")
            time.sleep(delay)
    raise last_err

_ = fetch_genbank(ACCESSION, GB_PATH)

✅ Using cached: data\raw\KF986530.1.gb


# Cell 3 — parse features (CDS vs RNA)

In [4]:
# Parse GenBank feature table and retain coding (CDS) and structural RNA (tRNA/rRNA) regions
# Assign label 1 to CDS (positive) and 0 to RNA (negative) for downstream classification tasks.
record = SeqIO.read(GB_PATH, "genbank")
seq = record.seq
rows = []
for f in record.features:
    if f.type not in ("CDS", "tRNA", "rRNA"):
        continue
    if "pseudo" in f.qualifiers:  # Skip annotated pseudogenes
        continue
    start, end = int(f.location.start), int(f.location.end)
    lab = 1 if f.type == "CDS" else 0
    rows.append({
        "type": f.type,
        "start": start,
        "end": end,
        "length": end - start,
        "strand": int(f.location.strand or 1),
        "label": lab,
    })
df_feats = pd.DataFrame(rows).sort_values(["start","end"]).reset_index(drop=True)
df_feats.to_csv(PROCESSED / "features.csv", index=False)
print(df_feats["type"].value_counts())
df_feats.head()

type
CDS     86
tRNA    37
rRNA     8
Name: count, dtype: int64


Unnamed: 0,type,start,end,length,strand,label
0,tRNA,0,73,73,-1,0
1,CDS,539,1601,1062,-1,1
2,tRNA,1869,4447,2578,-1,0
3,CDS,2168,3692,1524,-1,1
4,CDS,5447,6590,1143,-1,1


# Cell 4 — windows & encodings

In [5]:
# Generate fixed-length sliding windows from annotated regions and compute multiple encodings
from collections import Counter
import itertools

def one_hot_encode(s: str):
    # Map A/C/G/T to one‑hot rows; unknowns remain zeros
    m = {"A":0, "C":1, "G":2, "T":3}
    X = np.zeros((len(s), 4), dtype=np.float32)
    for i,ch in enumerate(s.upper()):
        j = m.get(ch)
        if j is not None:
            X[i, j] = 1.0
    return X

def kmer_counts(s: str, k=3):
    # Normalized frequency vector over all possible k‑mers (lexicographic A,C,G,T ordering)
    bases = ["A","C","G","T"]
    kmers = ["".join(p) for p in itertools.product(bases, repeat=k)]
    c = Counter(s[i:i+k] for i in range(len(s)-k+1))
    v = np.array([c.get(km, 0) for km in kmers], dtype=np.float32)
    sm = v.sum()
    return v / sm if sm > 0 else v

def angle_encode(s: str):
    # Encode nucleotides as phase angles on the unit circle (0, pi/2, pi, 3pi/2)
    am = {"A":0.0, "C":np.pi/2, "G":np.pi, "T":3*np.pi/2}
    return np.array([am.get(ch.upper(), 0.0) for ch in s], dtype=np.float32)

windows, labels = [], []
for r in df_feats.itertuples():
    sub = str(seq[r.start:r.end])
    if len(sub) < W:
        continue
    for off in range(0, len(sub) - W + 1, STRIDE):
        chunk = sub[off:off+W]
        windows.append(chunk)
        labels.append(r.label)

windows = np.array(windows, dtype=object)
y = np.array(labels, dtype=np.int64)
print("Total windows:", len(windows), "| positive(CDS):", int(y.sum()))

Total windows: 1490 | positive(CDS): 1375


# Cell 5 — save arrays

In [6]:
# Materialize numerical encodings and persist to compressed disk artifacts for reuse
X_onehot = np.stack([one_hot_encode(w) for w in windows])     # Shape: [N, W, 4]
X_kmer   = np.stack([kmer_counts(w, k=3) for w in windows])   # Shape: [N, 64]
X_angle  = np.stack([angle_encode(w) for w in windows])       # Shape: [N, W]

np.savez_compressed(PROCESSED/"encodings.npz",
                    y=y, onehot=X_onehot, kmer=X_kmer, angle=X_angle,
                    window=W, stride=STRIDE)
pd.DataFrame({"y": y}).to_csv(PROCESSED/"labels.csv", index=False)
with open(PROCESSED/"meta.json","w") as f:
    json.dump({"accession": ACCESSION, "window": W, "stride": STRIDE,
               "n_samples": int(len(y))}, f, indent=2)
print("Saved:", PROCESSED/"encodings.npz")

Saved: data\processed\encodings.npz


# Cell 6 — splits

In [7]:
# Create stratified train/validation/test splits preserving class balance
from sklearn.model_selection import StratifiedShuffleSplit
idx = np.arange(len(y))

# First: carve out 20% test. Second: take 25% of remaining (i.e. 20% overall) as validation.
tr_idx, te_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42).split(idx, y))
tr_sub, va_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=43).split(tr_idx, y[tr_idx]))
tr_idx = tr_idx[tr_sub]

splits = {"train": tr_idx.tolist(), "val": va_idx.tolist(), "test": te_idx.tolist()}
with open(PROCESSED/"splits.json","w") as f:
    json.dump(splits, f, indent=2)
{k: len(v) for k,v in splits.items()}

{'train': 894, 'val': 298, 'test': 298}

# Cell 7 — dataset audit

In [8]:
# Compile a concise dataset audit summary for quick reproducibility checks
audit = {
    "n_samples": int(len(y)),
    "positives": int(y.sum()),
    "negatives": int((y==0).sum()),
    "window": int(W),
    "stride": int(STRIDE),
    "splits": {k: len(v) for k,v in splits.items()}
}
pd.DataFrame([audit]).to_csv(PROCESSED/"dataset_audit.csv", index=False)
audit

{'n_samples': 1490,
 'positives': 1375,
 'negatives': 115,
 'window': 256,
 'stride': 128,
 'splits': {'train': 894, 'val': 298, 'test': 298}}