In [1]:
# Display the source of the JSONL writer as currently loaded in this kernel.
import inspect
import bnss_pipeline.etl_bnss as e

writer_source = inspect.getsource(e._write_jsonl)
print(writer_source)


def _write_jsonl(path: Path, rows: Iterable[BaseModel]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="\n") as f:
        for r in rows:
            f.write(r.model_dump_json())
            f.write("\n")



In [2]:
import inspect
import bnss_pipeline.etl_bnss as e

# Which file the module was imported from (guards against a stale copy).
module_file = e.__file__
print("etl_bnss.py:", module_file)

# Only the first 400 characters of the parser are needed for a quick look.
parser_head = inspect.getsource(e.parse_index_bnss)[:400]
print("parse_index_bnss head:\n", parser_head)

# Raw pattern text of the module-level chapter and section regexes.
chapter_pattern = e.CHAPTER_RE.pattern
print("CHAPTER_RE:", chapter_pattern)
section_pattern = e.SECTION_RE.pattern
print("SECTION_RE:", section_pattern)


etl_bnss.py: c:\bnss_pipeline\bnss_pipeline\etl_bnss.py
parse_index_bnss head:
 def parse_index_bnss(
    html: str, *, source_url: str, content_hash: str, version: str
) -> List[BnssSectionIndexRow]:
    soup = BeautifulSoup(html, "lxml")
    source_url = _plain_url(source_url)

    text = soup.get_text(" ", strip=True)
    text = " ".join(text.split())

    chapters: List[Tuple[int, int, int, str]] = []
    for m in CHAPTER_RE.finditer(text):
        ch_no = _roman_to_int(m
CHAPTER_RE: \bCHAPTER\s+([IVXLCDM]+)\s+(.+?)(?=\s+\d{1,3}\s*\.|\s+CHAPTER\s+|$)
SECTION_RE: \b(\d{1,3})\s*\.+\s*(.+?)(?=\s+\d{1,3}\s*\.+\s*|\s+CHAPTER\s+|$)


In [3]:
from bnss_pipeline.etl_bnss import run_etl_bnss

# run_etl_bnss returns a pair that is unpacked into the two output paths
# reused by later cells.
etl_outputs = run_etl_bnss(as_of="2026-01-10")
sections_path, crosswalk_path = etl_outputs
print(sections_path, crosswalk_path)


datasets\bnss_sections_index.jsonl datasets\bnss_crosswalk.jsonl


In [4]:
    from bnss_pipeline.config import get_settings
    from bnss_pipeline.etl_bnss import _read_json, _latest_hash_for, _load_html_by_hash

    s = get_settings()

    # Read the URL cache manifest stored under the project's manifests dir.
    cache_file = s.project_root / s.manifests_dir / "url_cache.json"
    cache = _read_json(cache_file)

    # Hash recorded for the BNSS index URL (presumably the most recent fetch,
    # going by the helper's name -- confirm in etl_bnss).
    h = _latest_hash_for(cache, s.cytrain_index_bnss)

    # Load the HTML stored for that hash and report its length.
    raw_html_root = s.project_root / s.raw_html_dir
    html = _load_html_by_hash(raw_html_root, h)
    print(len(html))


93675


In [5]:
from pathlib import Path

# Split each output file into lines once and reuse the result for both the
# row count and the 300-character preview of the first row.
sections_lines = Path(sections_path).read_text(encoding="utf-8").splitlines()
print("sections lines:", len(sections_lines))
crosswalk_lines = Path(crosswalk_path).read_text(encoding="utf-8").splitlines()
print("crosswalk lines:", len(crosswalk_lines))

print("sections first:", sections_lines[0][:300])
print("crosswalk first:", crosswalk_lines[0][:300])


sections lines: 532
crosswalk lines: 568
sections first: {"canonical_id":"BNSS:CH01:S001","law":"BNSS","chapter_no":1,"chapter_title":"PRELIMINARY SECTION","section_no":1,"section_title":"Short title, extent and commencement","source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html","content_hash":"175496c89bef7a90dd9972bc62ecb38fbff5
crosswalk first: {"bnss_section_no":"1","bnss_section_title":"Short title, extent and commencement","crpc_section_no":"1","crpc_section_title":"Short title, extent and commencement","remarks":null,"source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/SectionTableBNSS.html","content_hash":"14e524887274007884


In [6]:
from pathlib import Path

# Same datasets as above, but addressed by absolute path instead of the
# relative paths returned by the ETL run.
p1 = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")
p2 = Path(r"C:\bnss_pipeline\datasets\bnss_crosswalk.jsonl")

sections_present = p1.exists()
print("sections exists:", sections_present, p1)
crosswalk_present = p2.exists()
print("crosswalk exists:", crosswalk_present, p2)

# Full first row of each file (no truncation this time).
sections_first_row = p1.read_text(encoding="utf-8").splitlines()[0]
print("sections first:", sections_first_row)
crosswalk_first_row = p2.read_text(encoding="utf-8").splitlines()[0]
print("crosswalk first:", crosswalk_first_row)


sections exists: True C:\bnss_pipeline\datasets\bnss_sections_index.jsonl
crosswalk exists: True C:\bnss_pipeline\datasets\bnss_crosswalk.jsonl
sections first: {"canonical_id":"BNSS:CH01:S001","law":"BNSS","chapter_no":1,"chapter_title":"PRELIMINARY SECTION","section_no":1,"section_title":"Short title, extent and commencement","source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html","content_hash":"175496c89bef7a90dd9972bc62ecb38fbff5387a8d8485a55e09ec8baace22c1","version":"bnss@2026-01-10"}
crosswalk first: {"bnss_section_no":"1","bnss_section_title":"Short title, extent and commencement","crpc_section_no":"1","crpc_section_title":"Short title, extent and commencement","remarks":null,"source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/SectionTableBNSS.html","content_hash":"14e5248872740078841aa14ab129b6180fb74339f84183a41195bacbf936be87","version":"bnss@2026-01-10"}


In [7]:
import hashlib
from pathlib import Path

def sha256(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``p``.

    The whole file is read into memory in one call before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(p.read_bytes())
    return hasher.hexdigest()

p1 = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")
p2 = Path(r"C:\bnss_pipeline\datasets\bnss_crosswalk.jsonl")

# Content digests of both datasets, so later cells can tell whether the
# bytes on disk have changed.
sections_digest = sha256(p1)
print("sha sections:", sections_digest)
crosswalk_digest = sha256(p2)
print("sha crosswalk:", crosswalk_digest)

first_sections_row = p1.read_text(encoding="utf-8").splitlines()[0]
print("first sections:", first_sections_row)


sha sections: 89c36f4a7d78428c57f4c8552ac9df5a270ff3b3fd78cd52d86f50b45dfd79ba
sha crosswalk: 61998e99a83951a32961957e45151d7e6cd6004e3b10c6613244a9a44579db95
first sections: {"canonical_id":"BNSS:CH01:S001","law":"BNSS","chapter_no":1,"chapter_title":"PRELIMINARY SECTION","section_no":1,"section_title":"Short title, extent and commencement","source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html","content_hash":"175496c89bef7a90dd9972bc62ecb38fbff5387a8d8485a55e09ec8baace22c1","version":"bnss@2026-01-10"}


In [8]:
import hashlib
from pathlib import Path

p1 = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Work on the raw bytes so the check does not depend on text decoding.
b = p1.read_bytes()
digest_hex = hashlib.sha256(b).hexdigest()
print("sha256 bytes:", digest_hex)

line_bytes = b.splitlines()[0]
print("first line bytes head:", line_bytes[:120])
decoded_line = line_bytes.decode("utf-8")
print("decoded head:", decoded_line[:220])

# A markdown link value would open with "[", a plain URL with the scheme.
markdown_marker = b'source_url":"['
plain_marker = b'source_url":"https://'
print("contains markdown?", markdown_marker in line_bytes)
print("contains plain?", plain_marker in line_bytes)


sha256 bytes: 89c36f4a7d78428c57f4c8552ac9df5a270ff3b3fd78cd52d86f50b45dfd79ba
first line bytes head: b'{"canonical_id":"BNSS:CH01:S001","law":"BNSS","chapter_no":1,"chapter_title":"PRELIMINARY SECTION","section_no":1,"secti'
decoded head: {"canonical_id":"BNSS:CH01:S001","law":"BNSS","chapter_no":1,"chapter_title":"PRELIMINARY SECTION","section_no":1,"section_title":"Short title, extent and commencement","source_url":"https://cytrain.ncrb.gov.in/staticpag
contains markdown? False
contains plain? True


In [9]:
import json
from pathlib import Path

p1 = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Parse only the first row and inspect its source_url three ways.
first_row_text = p1.read_text(encoding="utf-8").splitlines()[0]
obj0 = json.loads(first_row_text)

source_url = obj0["source_url"]
print(source_url)
print(source_url.startswith("http"))
print(source_url[:60])


https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html
True
https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.h


In [10]:
from pathlib import Path

p1 = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Find where the source_url key starts in the raw first row, then show the
# 180 bytes beginning at that position.
line = p1.read_bytes().splitlines()[0]
key_marker = b'"source_url":"'
i = line.find(key_marker)
print("offset:", i)

window = line[i:i + 180]
print(window.decode("utf-8", errors="replace"))


offset: 169
"source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html","content_hash":"175496c89bef7a90dd9972bc62ecb38fbff5387a8d8485a55e09ec8baace22c1","version":"bnss@202


In [11]:
import json
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# utf-8-sig drops a leading BOM if the file has one.
rows_text = p.read_text(encoding="utf-8-sig")
first_row_text = rows_text.splitlines()[0]
obj0 = json.loads(first_row_text)
print(obj0["source_url"])


https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html


In [12]:
import json
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Same check as the previous cell but decoding as plain utf-8.
rows_text = p.read_text(encoding="utf-8")
first_row_text = rows_text.splitlines()[0]
obj0 = json.loads(first_row_text)
print(obj0["source_url"])


https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html


In [13]:
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Show 120 raw bytes of the first row starting at the source_url key.
line = p.read_bytes().splitlines()[0]
i = line.find(b'"source_url":"')
snippet = line[i:i + 120]
print(snippet.decode("utf-8", "replace"))


"source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html","content_hash":"175496c89bef7a90dd9972bc6


In [14]:
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Repeat of the previous cell's check: raw bytes around the source_url key.
raw_rows = p.read_bytes().splitlines()
line = raw_rows[0]
i = line.find(b'"source_url":"')
print(line[i:i + 120].decode("utf-8", "replace"))


"source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html","content_hash":"175496c89bef7a90dd9972bc6


In [15]:
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# The first 16 bytes would expose a BOM or other unexpected prefix.
b = p.read_bytes()
leading_bytes = b[:16]
print("first16:", leading_bytes)

# Size and modification time as reported by the filesystem.
st = p.stat()
print("size:", st.st_size, "mtime:", st.st_mtime)


first16: b'{"canonical_id":'
size: 211716 mtime: 1770155422.8359323


In [16]:
import os
import platform
import socket
import sys

# Record which machine, interpreter and working directory ran this notebook.
host_name = socket.gethostname()
print("host:", host_name)
platform_name = platform.platform()
print("platform:", platform_name)
print("python:", sys.executable)
working_dir = os.getcwd()
print("cwd:", working_dir)


host: Amit_Dabas
platform: Windows-11-10.0.26200-SP0
python: c:\bnss_pipeline\.venv\Scripts\python.exe
cwd: c:\bnss_pipeline


In [17]:
import hashlib
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Digest of the file's current bytes, for comparison with earlier cells.
file_bytes = p.read_bytes()
digest_hex = hashlib.sha256(file_bytes).hexdigest()
print(digest_hex)


89c36f4a7d78428c57f4c8552ac9df5a270ff3b3fd78cd52d86f50b45dfd79ba


In [18]:
import re
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Matches a source_url whose value is a markdown link "[text](http...)" and
# captures the href so it can replace the whole value.
link_value_re = re.compile(r'"source_url":"\[[^]]+\]\((https?://[^)]+)\)"')

txt = p.read_text(encoding="utf-8-sig")  # handles BOM if present
txt2 = link_value_re.sub(r'"source_url":"\1"', txt)

# Rewrites the file in place; the cell's displayed value is write_text's
# return value (number of characters written).
p.write_text(txt2, encoding="utf-8", newline="\n")


211378

In [19]:
import hashlib
import json
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Digest of the bytes now on disk.
b = p.read_bytes()
digest_hex = hashlib.sha256(b).hexdigest()
print("sha256:", digest_hex)

# Raw bytes around the source_url key in the first row.
line = b.splitlines()[0]
i = line.find(b'"source_url":"')
snippet = line[i:i + 140]
print("snippet:", snippet.decode("utf-8", "replace"))

# Decode as plain utf-8 (no utf-8-sig here) and parse the same first row.
first_row_text = line.decode("utf-8")
obj0 = json.loads(first_row_text)
print("source_url:", obj0["source_url"])


sha256: 89c36f4a7d78428c57f4c8552ac9df5a270ff3b3fd78cd52d86f50b45dfd79ba
snippet: "source_url":"https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html","content_hash":"175496c89bef7a90dd9972bc62ecb38fbff5387a8d848
source_url: https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html


In [20]:
import re
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")
txt = p.read_text(encoding="utf-8", errors="replace")

# Count source_url values that are still written as "[text](http...)".
pat = r'"source_url":"\[[^]]+\]\((https?://[^)]+)\)"'
match_count = sum(1 for m in re.finditer(pat, txt))
print("matches:", match_count)


matches: 0


In [21]:
import json
from pathlib import Path

def normalize_source_url(v: str) -> str:
    """Return the bare href when ``v`` is a markdown link, else ``v`` as-is.

    A value is treated as a markdown link when it starts with ``[``,
    contains ``](`` and ends with ``)``; the href is the text between the
    first ``](`` and the final ``)``.

    Non-string values (for example a JSON ``null`` decoded to ``None``) are
    returned untouched instead of raising ``AttributeError``.
    """
    if not isinstance(v, str):
        return v
    if v.startswith("[") and "](" in v and v.endswith(")"):
        return v.split("](", 1)[1][:-1]
    return v


def fix_jsonl(path: str) -> None:
    """Replace markdown-link ``source_url`` values in a JSONL file with plain URLs.

    Args:
        path: Location of the JSONL file to fix in place.

    Behavior:
        * A leading UTF-8 BOM is tolerated on input and never written back.
        * Rows are split on ``"\\n"`` only. ``str.splitlines()`` also breaks on
          characters such as U+2028, which may appear unescaped inside a JSON
          string and would cut a row in half.
        * Blank lines are skipped rather than passed to ``json.loads``.
        * Rows that need no fix are kept as their original text; fixed rows
          are re-serialized compactly (no spaces after ``,`` / ``:``).
        * The file is written only when the resulting bytes differ from what
          is already on disk, so a clean file keeps its exact bytes.
        * An empty input stays empty (no lone newline is produced).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    p = Path(path)
    original_bytes = p.read_bytes()
    text = original_bytes.decode("utf-8-sig")  # tolerates BOM

    out_lines = []
    changed = 0

    for raw in text.split("\n"):
        ln = raw.rstrip("\r")  # accept CRLF input; output always uses "\n"
        if not ln.strip():
            continue
        obj = json.loads(ln)
        if isinstance(obj, dict) and "source_url" in obj:
            new = normalize_source_url(obj["source_url"])
            if new != obj["source_url"]:
                obj["source_url"] = new
                changed += 1
                out_lines.append(
                    json.dumps(obj, ensure_ascii=False, separators=(",", ":"))
                )
                continue
        # Untouched row: keep the original serialization verbatim.
        out_lines.append(ln)

    new_bytes = "".join(row + "\n" for row in out_lines).encode("utf-8")  # UTF-8, no BOM
    if new_bytes != original_bytes:
        # write_bytes avoids any platform newline translation.
        p.write_bytes(new_bytes)
    print(p, "rows:", len(out_lines), "source_url fixed:", changed)

# Apply the fix to both datasets, sections index first.
for dataset_file in (
    r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl",
    r"C:\bnss_pipeline\datasets\bnss_crosswalk.jsonl",
):
    fix_jsonl(dataset_file)


C:\bnss_pipeline\datasets\bnss_sections_index.jsonl rows: 532 source_url fixed: 0
C:\bnss_pipeline\datasets\bnss_crosswalk.jsonl rows: 568 source_url fixed: 0


In [22]:
import json
from pathlib import Path

p = Path(r"C:\bnss_pipeline\datasets\bnss_sections_index.jsonl")

# Re-read the first row after the fix pass and show its source_url.
all_rows = p.read_text(encoding="utf-8").splitlines()
obj0 = json.loads(all_rows[0])
print(obj0["source_url"])


https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html


In [23]:
from bnss_pipeline.etl_bnss import run_etl_bnss

# Second ETL run with the same as-of date; rebinds the two output paths.
rerun_outputs = run_etl_bnss(as_of="2026-01-10")
sections_path, crosswalk_path = rerun_outputs
print(sections_path, crosswalk_path)


datasets\bnss_sections_index.jsonl datasets\bnss_crosswalk.jsonl


In [24]:
import json
from pathlib import Path

p = Path(sections_path)
obj0 = json.loads(p.read_text(encoding="utf-8").splitlines()[0])
print(obj0["source_url"])


https://cytrain.ncrb.gov.in/staticpage/web_pages/IndexBNSS.html
