In [1]:
!pip -q install --upgrade requests beautifulsoup4 lxml tqdm python-dateutil


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.75 requires requests_mock, which is not installed.
datasets 4.0.0 requires fsspec[http]<=2025.3.0,>=2023.1.0, but you have fsspec 2025.7.0 which is incompatible.
conda-repo-cli 1.0.75 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.75 requires python-dateutil==2.8.2, but you have python-dateutil 2.9.0.post0 which is incompatible.
conda-repo-cli 1.0.75 requires requests==2.31.0, but you have requests 2.32.5 which is incompatible.
streamlit 1.40.2 requires packaging<25,>=20, but you have packaging 25.0 which is incompatible.
anaconda-cloud-auth 0.1.4 requires pydantic<2.0, but you have pydantic 2.11.7 which is incompatible.[0m[31m
[0m

In [2]:
import os, re, io, json, time, zipfile, hashlib, argparse, sys
from datetime import datetime
from urllib.parse import urljoin, urlencode

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
from tqdm import tqdm
from lxml import etree as ET

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "legal-ingestor/1.0 (+for research/educational use)",
    "Accept": "*/*",
    "Accept-Language": "en",
    "Connection": "keep-alive",
})


In [3]:
# ---------- utilities ----------

def ensure_dirs(*paths):
    for p in paths:
        os.makedirs(p, exist_ok=True)

def sha256(text_or_bytes):
    if isinstance(text_or_bytes, str):
        b = text_or_bytes.encode("utf-8", errors="ignore")
    else:
        b = text_or_bytes
    return hashlib.sha256(b).hexdigest()

def write_jsonl(path, records):
    ensure_dirs(os.path.dirname(path))
    with open(path, "a", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def save_file(path, content, mode="wb"):
    ensure_dirs(os.path.dirname(path))
    with open(path, mode) as f:
        f.write(content)

def normalize_whitespace(s):
    return re.sub(r"\s+\n", "\n", re.sub(r"[ \t]+", " ", s)).strip()

def section_hash(text):
    # Stable hash of normalized text for diffing
    return sha256(normalize_whitespace(text))

In [4]:
# ---------- Federal (Justice Laws) ----------

def find_zip_links_from_open_gov(dataset_page_url):
    """
    Scrape an Open Government dataset page to discover .zip resource URLs.
    (You can also pass direct ZIP URLs via --federal-zip)
    """
    r = SESSION.get(dataset_page_url, timeout=60)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.lower().endswith(".zip"):
            links.append(urljoin(dataset_page_url, href))
    # De-dup and keep order
    seen, out = set(), []
    for u in links:
        if u not in seen:
            seen.add(u)
            out.append(u)
    return out

def parse_justice_xml(xml_bytes):
    """
    Parse a single Justice Laws XML/WebXML file.
    This is tolerant: tries a few common namespaces.
    Returns normalized dict with sections.
    """
    parser = ET.XMLParser(recover=True, remove_blank_text=True)
    tree = ET.parse(io.BytesIO(xml_bytes), parser)
    root = tree.getroot()

    # Common namespaces observed on Justice WebXML/XML; fallback to '*'
    ns_candidates = [
        {"j": "http://laws-lois.justice.gc.ca"},
        {"w": "http://laws.justice.gc.ca"},
        {}  # no namespace
    ]

    def xp(path):
        for ns in ns_candidates:
            try:
                res = root.xpath(path, namespaces=ns)
                if res:
                    return res
            except ET.XPathEvalError:
                continue
        return []


In [5]:
def parse_justice_xml(xml_bytes):
    """
    Parse a single Justice Laws XML/WebXML file.
    Tolerant to a few common namespaces.
    Returns normalized dict with sections.
    """
    parser = ET.XMLParser(recover=True, remove_blank_text=True)
    tree = ET.parse(io.BytesIO(xml_bytes), parser)
    root = tree.getroot()

    # Common namespaces observed on Justice WebXML/XML; fallback to no-ns
    ns_candidates = [
        {"j": "http://laws-lois.justice.gc.ca"},
        {"w": "http://laws.justice.gc.ca"},
        {}  # no namespace
    ]

    def xp(path):
        for ns in ns_candidates:
            try:
                res = root.xpath(path, namespaces=ns)
                if res:
                    return res
            except ET.XPathEvalError:
                continue
        return []

    # Title
    title_nodes = xp("//j:Title/text()") or xp("//w:Title/text()") or xp("//Title/text()")
    title = title_nodes[0] if title_nodes else None

    # Citation (sometimes in ShortTitle or meta)
    citation_nodes = xp("//j:ShortTitle/text()") or xp("//w:ShortTitle/text()") or xp("//ShortTitle/text()")
    citation = citation_nodes[0] if citation_nodes else None

    # Consolidation date (varies)
    date_nodes = xp("//j:ConsolidationDate/text()") or xp("//w:ConsolidationDate/text()") or xp("//ConsolidationDate/text()")
    consolidation_date = None
    if date_nodes:
        try:
            consolidation_date = dateparser.parse(date_nodes[0]).date().isoformat()
        except Exception:
            consolidation_date = date_nodes[0]

    # Sections: gather by heading + block content heuristics
    sections = []
    sect_nodes = xp("//j:Section") or xp("//w:Section") or xp("//Section")
    if sect_nodes:
        for s in sect_nodes:
            # id/number
            sid = (s.get("id") or s.get("n") or s.findtext(".//{*}Num") or "").strip()
            heading = None
            hnode = s.find(".//{*}Heading")
            if hnode is not None:
                heading = "".join(hnode.itertext()).strip()

            # full text
            text = "".join(s.itertext())
            text = normalize_whitespace(text)
            sections.append({
                "id": sid or None,
                "heading": heading,
                "hash": section_hash(text),
            })
    else:
        # Fallback: split by top-level headings/anchors
        full_text = normalize_whitespace("".join(root.itertext()))
        chunks = re.split(r"\n(?=Section\s+\d+)", full_text)
        for i, chunk in enumerate(chunks, start=1):
            if not chunk.strip():
                continue
            sections.append({
                "id": f"s.{i}",
                "heading": None,
                "hash": section_hash(chunk),
            })

    return {
        "title": title,
        "citation": citation,
        "consolidation_date": consolidation_date,
        "sections": sections,
    }


In [6]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import zipfile, io
import os

# Adjust headers to mimic a browser (sometimes needed)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

def find_zip_links(dataset_url):
    resp = requests.get(dataset_url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.lower().endswith(".zip"):
            full = urljoin(dataset_url, href)
            links.append(full)
    # de-dup preserving order
    seen = set()
    out = []
    for u in links:
        if u not in seen:
            seen.add(u)
            out.append(u)
    return out

def download_and_extract(zip_url, dest_folder="downloads"):
    print("Downloading:", zip_url)
    resp = requests.get(zip_url, headers=HEADERS, timeout=120)
    resp.raise_for_status()
    z = zipfile.ZipFile(io.BytesIO(resp.content))
    os.makedirs(dest_folder, exist_ok=True)
    for name in z.namelist():
        # we only want XML files
        if name.lower().endswith(".xml"):
            data = z.read(name)
            # write file
            out_path = os.path.join(dest_folder, os.path.basename(name))
            with open(out_path, "wb") as f:
                f.write(data)
            print("Saved:", out_path)

if __name__ == "__main__":
    ds_url = "https://open.canada.ca/data/en/dataset/eb0dee21-9123-4d0d-b11d-0763fa1fb403"
    # (remove utm query parameter)
    zip_links = find_zip_links(ds_url)
    print("Discovered ZIPs:", zip_links)
    for z in zip_links:
        download_and_extract(z, dest_folder="snapshots/federal_xml")


Discovered ZIPs: []


In [7]:
import requests, zipfile, io, os

# CKAN API endpoint for this dataset
API_URL = "https://open.canada.ca/data/api/action/package_show"
DATASET_ID = "eb0dee21-9123-4d0d-b11d-0763fa1fb403"

resp = requests.get(API_URL, params={"id": DATASET_ID}, timeout=60)
resp.raise_for_status()
data = resp.json()

# Find all resource URLs that end with .zip
zip_urls = [res["url"] for res in data["result"]["resources"] if res["url"].lower().endswith(".zip")]

print("Discovered ZIPs:", zip_urls)

# Download and extract XML files
os.makedirs("federal_xml", exist_ok=True)

for url in zip_urls:
    print("Downloading:", url)
    r = requests.get(url, timeout=120)
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        for name in z.namelist():
            if name.lower().endswith(".xml"):
                out_path = os.path.join("federal_xml", os.path.basename(name))
                with open(out_path, "wb") as f:
                    f.write(z.read(name))
                print("Saved:", out_path)


Discovered ZIPs: []


In [8]:
import requests, zipfile, io, os

CKAN_API = "https://open.canada.ca/data/api/action/package_show"
DATASET_IDS = [
    "eb0dee21-9123-4d0d-b11d-0763fa1fb403",  # Consolidated Acts & Regs XML
    "2f4bb174-b2e7-4fb2-a7a1-a313507acbc6",  # WebXML dataset
]

def get_resources(dataset_id):
    resp = requests.get(CKAN_API, params={"id": dataset_id}, timeout=60)
    resp.raise_for_status()
    j = resp.json()
    return j["result"]["resources"]

def download_resource(resource, dest_folder="bulk_downloads"):
    url = resource["url"]
    name = resource.get("name") or resource.get("id")
    print("Trying resource:", name, url)
    # check for ZIP
    if url.lower().endswith(".zip"):
        r = requests.get(url, timeout=120)
        r.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            for member in z.namelist():
                if member.lower().endswith(".xml"):
                    out_path = os.path.join(dest_folder, os.path.basename(member))
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)
                    with open(out_path, "wb") as f:
                        f.write(z.read(member))
                    print("Extracted:", out_path)
    # direct xml
    elif url.lower().endswith(".xml"):
        r = requests.get(url, timeout=120)
        r.raise_for_status()
        out_path = os.path.join(dest_folder, name + ".xml")
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, "wb") as f:
            f.write(r.content)
        print("Saved XML:", out_path)
    else:
        # maybe other formats (tar, gzip, etc.)
        print("Skipping unknown file type:", url)

def run():
    for ds in DATASET_IDS:
        resources = get_resources(ds)
        print("Dataset", ds, "has", len(resources), "resources")
        for res in resources:
            download_resource(res, dest_folder=f"downloads/{ds}")

if __name__ == "__main__":
    run()


Dataset eb0dee21-9123-4d0d-b11d-0763fa1fb403 has 3 resources
Trying resource: Consolidated Statutes and regulations in XML https://laws-lois.justice.gc.ca/eng/XML/Legis.xml
Saved XML: downloads/eb0dee21-9123-4d0d-b11d-0763fa1fb403/Consolidated Statutes and regulations in XML.xml
Trying resource: Data dictionary https://laws-lois.justice.gc.ca/eng/XML/index.html
Skipping unknown file type: https://laws-lois.justice.gc.ca/eng/XML/index.html
Trying resource: Data dictionary https://laws-lois.justice.gc.ca/fra/XML/index.html
Skipping unknown file type: https://laws-lois.justice.gc.ca/fra/XML/index.html
Dataset 2f4bb174-b2e7-4fb2-a7a1-a313507acbc6 has 3 resources
Trying resource: Consolidated federal Acts and regulations –Bulk XML (WEB) ftp://205.193.86.89/WebXML/
Skipping unknown file type: ftp://205.193.86.89/WebXML/
Trying resource: Data Dictionary https://laws-lois.justice.gc.ca/eng/XML/index.html
Skipping unknown file type: https://laws-lois.justice.gc.ca/eng/XML/index.html
Trying reso

In [9]:
from lxml import etree
import requests, os

LEGIS_PATH = "downloads/eb0dee21-9123-4d0d-b11d-0763fa1fb403/Consolidated Statutes and regulations in XML.xml"
tree = etree.parse(LEGIS_PATH)
root = tree.getroot()

entries = []

# Parse Acts
for act in root.findall(".//Act"):
    title = act.findtext("Title")
    number = act.findtext("OfficialNumber")
    xml_url = act.findtext("LinkToXML")
    current_to = act.findtext("CurrentToDate")
    if title and xml_url:
        entries.append({
            "type": "Act",
            "title": title.strip(),
            "official_number": number.strip() if number else None,
            "xml_url": xml_url.strip(),
            "current_to": current_to
        })

# Parse Regulations
for reg in root.findall(".//Regulation"):
    title = reg.findtext("Title")
    number = reg.findtext("OfficialNumber")
    xml_url = reg.findtext("LinkToXML")
    current_to = reg.findtext("CurrentToDate")
    if title and xml_url:
        entries.append({
            "type": "Regulation",
            "title": title.strip(),
            "official_number": number.strip() if number else None,
            "xml_url": xml_url.strip(),
            "current_to": current_to
        })

print("Total entries found:", len(entries))
print("Sample:", entries[:5])


Total entries found: 11510
Sample: [{'type': 'Act', 'title': 'Access to Information Act', 'official_number': 'A-1', 'xml_url': 'http://laws-lois.justice.gc.ca/eng/XML/A-1.xml', 'current_to': '2025-09-01'}, {'type': 'Act', 'title': 'Loi sur l’accès à l’information', 'official_number': 'A-1', 'xml_url': 'http://laws-lois.justice.gc.ca/fra/XML/A-1.xml', 'current_to': '2025-09-01'}, {'type': 'Act', 'title': 'Accessible Canada Act', 'official_number': '2019, c. 10', 'xml_url': 'http://laws-lois.justice.gc.ca/eng/XML/A-0.6.xml', 'current_to': '2025-09-01'}, {'type': 'Act', 'title': 'Loi canadienne sur l’accessibilité', 'official_number': '2019, ch. 10', 'xml_url': 'http://laws-lois.justice.gc.ca/fra/XML/A-0.6.xml', 'current_to': '2025-09-01'}, {'type': 'Act', 'title': 'Addition of Lands to Reserves and Reserve Creation Act', 'official_number': '2018, c. 27, s. 675', 'xml_url': 'http://laws-lois.justice.gc.ca/eng/XML/A-1.3.xml', 'current_to': '2025-09-01'}]


In [11]:
import time
import os
import requests

# Path to Desktop/Canadian_Laws on Mac
save_dir = os.path.expanduser("~/Desktop/Canadian_Laws")
os.makedirs(save_dir, exist_ok=True)

print("Saving files to:", os.path.abspath(save_dir))

for e in entries:
    url = e["xml_url"]
    fname = os.path.join(save_dir, os.path.basename(url))

    if os.path.exists(fname):
        continue  # skip already downloaded

    try:
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        with open(fname, "wb") as f:
            f.write(r.content)
        print("Saved:", fname)
    except Exception as ex:
        print("Failed:", url, ex)

    time.sleep(0.5)  # polite pause


Saving files to: /Users/nithish/Desktop/Canadian_Laws
Saved: /Users/nithish/Desktop/Canadian_Laws/A-1.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-0.6.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-1.3.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-1.5.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/C-49.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-2.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-2.4.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/Z-02.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-3.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-3.7.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-5.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-6.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-8.8.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-10.1.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-10.5.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-10.4.xml
Saved: /Users/nithish/Desktop/Canadian_Laws/A-10.6.xml
Saved: /Users/nithish/Desktop/Canadi

In [13]:
from lxml import etree

def parse_law_xml(path):
    tree = etree.parse(path)
    root = tree.getroot()

    # Title: look for Act/Reg Title
    title = root.findtext(".//Title")
    if not title:
        # fallback if English/French pairs exist
        title = root.findtext(".//TitleEn") or root.findtext(".//LongTitle")

    # Citation (short title or official number)
    citation = root.findtext(".//ShortTitle")
    if not citation:
        citation = root.findtext(".//OfficialNumber")

    # Consolidation date
    consolidation_date = root.findtext(".//CurrentToDate")

    sections = []
    for sec in root.findall(".//Section"):
        sid = sec.get("id") or sec.findtext(".//Label")  # try attribute or <Label>
        heading = sec.findtext(".//MarginalNote") or sec.findtext(".//Heading")
        text = " ".join(sec.itertext()).strip()
        sections.append({
            "id": sid,
            "heading": heading,
            "text": text
        })

    return {
        "title": title,
        "citation": citation,
        "consolidation_date": consolidation_date,
        "sections": sections
    }


In [15]:
from lxml import etree
import os, json

def parse_law_xml(path):
    tree = etree.parse(path)
    root = tree.getroot()

    title = root.findtext(".//Title")
    citation = root.findtext(".//ShortTitle") or os.path.basename(path)
    current_to = root.findtext(".//CurrentToDate")

    sections = []
    for sec in root.findall(".//Section"):
        sid = sec.get("id") or None
        heading = sec.findtext(".//MarginalNote") or sec.findtext(".//Heading")
        text = " ".join(sec.itertext()).strip()
        sections.append({
            "id": sid,
            "heading": heading,
            "text": text
        })

    return {
        "title": title,
        "citation": citation,
        "consolidation_date": current_to,
        "sections": sections
    }

# Example: parse one file
parsed = parse_law_xml("federal_laws/A-0.6.xml")
print(json.dumps(parsed, indent=2)[:800])


{
  "title": null,
  "citation": "Accessible Canada Act",
  "consolidation_date": null,
  "sections": [
    {
      "id": null,
      "heading": "Short title",
      "text": "Short title 1 This Act may be cited as the  Accessible Canada Act ."
    },
    {
      "id": null,
      "heading": "Definitions",
      "text": "Definitions 2 The following definitions apply in this Act. Accessibility Commissioner \u2002means the member of the Canadian Human Rights Commission that is appointed under subsection 26(1) of the  Canadian Human Rights Act  and that is referred to in that Act as the \u201cAccessibility Commissioner\u201d.\u2002( commissaire \u00e0 l\u2019accessibilit\u00e9 ) barrier \u2002means anything \u2014 including anything physical, architectural, technological or attitudinal, anythi


In [17]:
from lxml import etree
import os, json

def parse_law_xml(path):
    tree = etree.parse(path)
    root = tree.getroot()

    # Title (inside Act/Regulation)
    title = root.findtext(".//Act/Title") or root.findtext(".//Regulation/Title")

    # Citation (official number, or short title)
    citation = root.findtext(".//Act/OfficialNumber") or \
               root.findtext(".//Regulation/OfficialNumber") or \
               root.findtext(".//ShortTitle")

    # Consolidation date
    consolidation_date = root.findtext(".//CurrentToDate")

    sections = []
    for sec in root.findall(".//Section"):
        sid = sec.findtext(".//Label")  # section number
        heading = sec.findtext(".//MarginalNote") or sec.findtext(".//Heading")
        text = " ".join(sec.itertext()).strip()
        sections.append({
            "id": sid,
            "heading": heading,
            "text": text
        })

    return {
        "title": title,
        "citation": citation,
        "consolidation_date": consolidation_date,
        "sections": sections
    }

# Test on one file
test_file = os.path.expanduser("~/Desktop/Canadian_Laws/A-0.6.xml")
parsed = parse_law_xml(test_file)
print(json.dumps(parsed, indent=2, ensure_ascii=False)[:1200])


{
  "title": null,
  "citation": "Accessible Canada Act",
  "consolidation_date": null,
  "sections": [
    {
      "id": "1",
      "heading": "Short title",
      "text": "Short title 1 This Act may be cited as the  Accessible Canada Act ."
    },
    {
      "id": "2",
      "heading": "Definitions",
      "text": "Definitions 2 The following definitions apply in this Act. Accessibility Commissioner  means the member of the Canadian Human Rights Commission that is appointed under subsection 26(1) of the  Canadian Human Rights Act  and that is referred to in that Act as the “Accessibility Commissioner”. ( commissaire à l’accessibilité ) barrier  means anything — including anything physical, architectural, technological or attitudinal, anything that is based on information or communications or anything that is the result of a policy or a practice — that hinders the full and equal participation in society of persons with an impairment, including a physical, mental, intellectual, cognit

In [21]:
from lxml import etree
import os, json

def parse_law_xml(path):
    tree = etree.parse(path)
    root = tree.getroot()

    # Title: try multiple places (Acts and Regs may differ)
    title = (
        root.findtext(".//LongTitle") or
        root.findtext(".//ShortTitle") or
        root.findtext(".//Title")
    )

    # Citation: official number, or short title as fallback
    citation = (
        root.findtext(".//OfficialNumber") or
        root.findtext(".//ShortTitle") or
        os.path.basename(path).replace(".xml", "")
    )

    # Consolidation date (explicitly check Act or Regulation block)
    consolidation_date = (
        root.findtext(".//Act/CurrentToDate") or
        root.findtext(".//Regulation/CurrentToDate")
    )

    sections = []
    for sec in root.findall(".//Section"):
        sid = sec.findtext(".//Label")  # section number
        heading = sec.findtext(".//MarginalNote") or sec.findtext(".//Heading")
        text = " ".join(sec.itertext()).strip()
        sections.append({
            "id": sid,
            "heading": heading,
            "text": text
        })

    return {
        "title": title,
        "citation": citation,
        "consolidation_date": consolidation_date,
        "sections": sections
    }

# --- Test it on your sample XML ---
test_file = os.path.expanduser("~/Desktop/Canadian_Laws/A-0.6.xml")
parsed = parse_law_xml(test_file)

# Preview the result
print(json.dumps(parsed, indent=2, ensure_ascii=False)[:1200])


{
  "title": "An Act to ensure a barrier-free Canada",
  "citation": "Accessible Canada Act",
  "consolidation_date": null,
  "sections": [
    {
      "id": "1",
      "heading": "Short title",
      "text": "Short title 1 This Act may be cited as the  Accessible Canada Act ."
    },
    {
      "id": "2",
      "heading": "Definitions",
      "text": "Definitions 2 The following definitions apply in this Act. Accessibility Commissioner  means the member of the Canadian Human Rights Commission that is appointed under subsection 26(1) of the  Canadian Human Rights Act  and that is referred to in that Act as the “Accessibility Commissioner”. ( commissaire à l’accessibilité ) barrier  means anything — including anything physical, architectural, technological or attitudinal, anything that is based on information or communications or anything that is the result of a policy or a practice — that hinders the full and equal participation in society of persons with an impairment, including a ph

In [25]:
from lxml import etree
import os, json

def parse_law_xml(path):
    parser = etree.XMLParser(recover=True)
    tree = etree.parse(path, parser)
    root = tree.getroot()

    # Title
    title = (
        root.findtext(".//{*}LongTitle") or
        root.findtext(".//{*}ShortTitle") or
        root.findtext(".//{*}Title")
    )

    # Citation
    citation = (
        root.findtext(".//{*}OfficialNumber") or
        root.findtext(".//{*}ShortTitle") or
        os.path.basename(path).replace(".xml", "")
    )

    # Consolidation / date info
    consolidation_date = None
    node = root.find(".//{*}CurrentToDate")
    if node is not None:
        consolidation_date = node.text or node.attrib.get("Date")
    else:
        # fallback: check attributes on root <Statute> or <Regulation>
        consolidation_date = root.attrib.get("{http://justice.gc.ca/lims}current-date")

    # Sections
    sections = []
    for sec in root.findall(".//{*}Section"):
        sid = sec.findtext(".//{*}Label")
        heading = sec.findtext(".//{*}MarginalNote") or sec.findtext(".//{*}Heading")
        text = " ".join(sec.itertext()).strip()
        sections.append({
            "id": sid,
            "heading": heading,
            "text": text
        })

    return {
        "title": title,
        "citation": citation,
        "consolidation_date": consolidation_date,
        "inforce_start_date": root.attrib.get("{http://justice.gc.ca/lims}inforce-start-date"),
        "enacted_date": root.attrib.get("{http://justice.gc.ca/lims}enacted-date"),
        "last_amended_date": root.attrib.get("{http://justice.gc.ca/lims}lastAmendedDate"),
        "sections": sections
    }

# Test
test_file = os.path.expanduser("~/Desktop/Canadian_Laws/A-0.6.xml")
parsed = parse_law_xml(test_file)
print(json.dumps(parsed, indent=2, ensure_ascii=False)[:1200])


{
  "title": "An Act to ensure a barrier-free Canada",
  "citation": "Accessible Canada Act",
  "consolidation_date": "2023-05-03",
  "inforce_start_date": "2019-06-21",
  "enacted_date": "2019-06-21",
  "last_amended_date": "2023-04-27",
  "sections": [
    {
      "id": "1",
      "heading": "Short title",
      "text": "Short title 1 This Act may be cited as the  Accessible Canada Act ."
    },
    {
      "id": "2",
      "heading": "Definitions",
      "text": "Definitions 2 The following definitions apply in this Act. Accessibility Commissioner  means the member of the Canadian Human Rights Commission that is appointed under subsection 26(1) of the  Canadian Human Rights Act  and that is referred to in that Act as the “Accessibility Commissioner”. ( commissaire à l’accessibilité ) barrier  means anything — including anything physical, architectural, technological or attitudinal, anything that is based on information or communications or anything that is the result of a policy or 

In [26]:
import os, json
from lxml import etree
from tqdm import tqdm

folder = os.path.expanduser("~/Desktop/Canadian_Laws")
save_path = os.path.expanduser("~/Desktop/canadian_laws.jsonl")

def parse_law_xml(path):
    parser = etree.XMLParser(recover=True)
    tree = etree.parse(path, parser)
    root = tree.getroot()

    title = (
        root.findtext(".//{*}LongTitle") or
        root.findtext(".//{*}ShortTitle") or
        root.findtext(".//{*}Title")
    )

    citation = (
        root.findtext(".//{*}OfficialNumber") or
        root.findtext(".//{*}ShortTitle") or
        os.path.basename(path).replace(".xml", "")
    )

    consolidation_date = None
    node = root.find(".//{*}CurrentToDate")
    if node is not None:
        consolidation_date = node.text or node.attrib.get("Date")
    else:
        consolidation_date = root.attrib.get("{http://justice.gc.ca/lims}current-date")

    sections = []
    for sec in root.findall(".//{*}Section"):
        sid = sec.findtext(".//{*}Label")
        heading = sec.findtext(".//{*}MarginalNote") or sec.findtext(".//{*}Heading")
        text = " ".join(sec.itertext()).strip()
        sections.append({
            "id": sid,
            "heading": heading,
            "text": text
        })

    return {
        "title": title,
        "citation": citation,
        "consolidation_date": consolidation_date,
        "inforce_start_date": root.attrib.get("{http://justice.gc.ca/lims}inforce-start-date"),
        "enacted_date": root.attrib.get("{http://justice.gc.ca/lims}enacted-date"),
        "last_amended_date": root.attrib.get("{http://justice.gc.ca/lims}lastAmendedDate"),
        "sections": sections
    }

# --- Build JSONL dataset ---
with open(save_path, "w", encoding="utf-8") as f:
    for file in tqdm(os.listdir(folder)):
        if file.endswith(".xml"):
            path = os.path.join(folder, file)
            try:
                record = parse_law_xml(path)
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
            except Exception as e:
                print("❌ Failed:", file, "->", e)

print("✅ Dataset saved to:", save_path)


100%|██████████| 10554/10554 [00:11<00:00, 890.34it/s]

✅ Dataset saved to: /Users/nithish/Desktop/canadian_laws.jsonl





In [27]:
import os
import json
from datasets import load_dataset
import pyarrow.parquet as pq

def ingest_from_hf(save_folder="cases_hf"):
    ds = load_dataset("a2aj/canadian-case-law", split="train")
    os.makedirs(save_folder, exist_ok=True)
    for i, rec in enumerate(ds):
        # Each rec likely has fields: citation, document_date, name, unofficial_text, source_url, etc.
        fname = f"{rec['citation']}_{i}.json"
        path = os.path.join(save_folder, fname)
        with open(path, "w", encoding="utf-8") as f:
            f.write(json.dumps(rec, ensure_ascii=False))
    print("Saved from Hugging Face dataset to", save_folder)

def ingest_from_parquet(parquet_path, save_folder="cases_parquet"):
    table = pq.read_table(parquet_path)
    os.makedirs(save_folder, exist_ok=True)
    # Convert to Python dict
    df = table.to_pandas()
    for idx, row in df.iterrows():
        rec = row.to_dict()
        fname = f"{rec.get('citation','case')}_{idx}.json"
        path = os.path.join(save_folder, fname)
        with open(path, "w", encoding="utf-8") as f:
            f.write(json.dumps(rec, ensure_ascii=False))
    print("Saved from Parquet to", save_folder)


In [28]:
from datasets import load_dataset

# Load the dataset
ds = load_dataset("a2aj/canadian-case-law", split="train")

# Peek at the first few cases
for i in range(3):
    print(ds[i])


{'dataset': 'CHRT', 'citation_en': '2018 CHRT 6', 'citation2_en': '', 'name_en': 'Attaran v. Citizenship and Immigration Canada', 'document_date_en': Timestamp('2018-02-21 00:00:00+0000', tz='UTC'), 'url_en': 'https://decisions.chrt-tcdp.gc.ca/chrt-tcdp/decisions/en/item/346073/index.do', 'scraped_timestamp_en': Timestamp('2023-12-01 16:04:03.290000+0000', tz='UTC'), 'unofficial_text_en': 'Attaran v. Citizenship and Immigration Canada\nCollection\nCanadian Human Rights Tribunal\nDate\n2018-02-21\nNeutral citation\n2018 CHRT 6\nFile number(s)\nT2163/3716\nDecision-maker(s)\nThomas, David L.\nDecision type\nRuling\nGrounds\nAge\nFamily Status\nNational or Ethnic Origin\nRace\nDecision Content\nCanadian Human Rights Tribunal\nTribunal canadien des droits de la personne\nCitation: 2018CHRT\n6\nDate:\nFebruary 21, 2018\nFile No.:\nT2163/3716\nBetween:\nAmir Attaran\nComplainant\n- and -\nCanadian Human Rights Commission\nCommission\n- and -\nImmigration, Refugees and Citizenship Canada (for

In [30]:
import json
from datasets import load_dataset

# Load dataset
ds = load_dataset("a2aj/canadian-case-law", split="train")

def clean_record(rec):
    """Convert timestamps and other non-JSON types to strings."""
    cleaned = {}
    for k, v in rec.items():
        if isinstance(v, (list, dict)):
            # nested structures are already JSON-compatible
            cleaned[k] = v
        elif hasattr(v, "isoformat"):  
            # pandas/Arrow Timestamps
            cleaned[k] = v.isoformat()
        else:
            cleaned[k] = str(v) if v is not None else None
    return cleaned

# Save as JSONL
with open("canadian_caselaw.jsonl", "w", encoding="utf-8") as f:
    for rec in ds:
        cleaned = clean_record(rec)
        f.write(json.dumps(cleaned, ensure_ascii=False) + "\n")

print("✅ Saved canadian_caselaw.jsonl with", len(ds), "cases")


✅ Saved canadian_caselaw.jsonl with 117352 cases


In [31]:
import json

path = "canadian_caselaw.jsonl"

# Read first 3 lines
with open(path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        rec = json.loads(line)
        print(json.dumps(rec, indent=2, ensure_ascii=False)[:1000])  # limit preview
        print("="*80)
        if i >= 2:
            break


{
  "dataset": "CHRT",
  "citation_en": "2018 CHRT 6",
  "citation2_en": "",
  "name_en": "Attaran v. Citizenship and Immigration Canada",
  "document_date_en": "2018-02-21T00:00:00+00:00",
  "url_en": "https://decisions.chrt-tcdp.gc.ca/chrt-tcdp/decisions/en/item/346073/index.do",
  "scraped_timestamp_en": "2023-12-01T16:04:03.290000+00:00",
  "unofficial_text_en": "Attaran v. Citizenship and Immigration Canada\nCollection\nCanadian Human Rights Tribunal\nDate\n2018-02-21\nNeutral citation\n2018 CHRT 6\nFile number(s)\nT2163/3716\nDecision-maker(s)\nThomas, David L.\nDecision type\nRuling\nGrounds\nAge\nFamily Status\nNational or Ethnic Origin\nRace\nDecision Content\nCanadian Human Rights Tribunal\nTribunal canadien des droits de la personne\nCitation: 2018CHRT\n6\nDate:\nFebruary 21, 2018\nFile No.:\nT2163/3716\nBetween:\nAmir Attaran\nComplainant\n- and -\nCanadian Human Rights Commission\nCommission\n- and -\nImmigration, Refugees and Citizenship Canada (formerly Citizenship and I