In [6]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CFR Data Inspector
Scans a directory, summarizes files, and lightly inspects XML/JSON samples.
"""

import os
import sys
import re
import json
import hashlib
import mimetypes
import datetime as dt
from collections import Counter, defaultdict

def _pip_install(package):
    """Best-effort ``pip install -q <package>`` for the running interpreter.

    The command is passed as an argument list (no shell), so an interpreter
    path containing spaces or shell metacharacters is handled correctly.
    A failed install is not raised here: the caller re-imports the package
    right afterwards, and that import raises ImportError if pip did not work.
    """
    import subprocess  # local import: only needed on the fallback path

    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", package],
            check=False,
        )
    except OSError as err:
        # e.g. sys.executable is empty or not runnable
        print(f"Could not start pip for {package}: {err}")


try:
    import pandas as pd
except ImportError:
    print("Installing pandas ... (Ctrl+C to cancel)")
    _pip_install("pandas")
    import pandas as pd

try:
    import lxml.etree as ET  # better/faster than xml.etree for big files
except ImportError:
    print("Installing lxml ... (Ctrl+C to cancel)")
    _pip_install("lxml")
    import lxml.etree as ET

# ---------- configure here ----------
# Root directory that gets scanned (machine-specific; edit before running elsewhere).
BASE_DIR = "/Users/nithish/Desktop/USA/CFR-regulations"
# Folder for the generated CSV reports; note that it lives inside BASE_DIR.
OUTPUT_DIR = os.path.join(BASE_DIR, "_inspections")
SAMPLE_XML_TO_PARSE = 5      # how many XML files to sample
SAMPLE_JSON_TO_PARSE = 5     # how many JSON files to sample
HASH_LARGEST_N = 5           # hash top N largest files (helps detect duplicates)
# ------------------------------------

# Create the report folder only when the scan root really exists.
# os.makedirs() also creates missing parents, so calling it unconditionally
# would create BASE_DIR itself and make a later "directory not found" check
# on BASE_DIR pass for a path that was never there.
if os.path.isdir(BASE_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

def human(nbytes: int) -> str:
    """Format a byte count with two decimals and a binary unit (B .. TB).

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, so very large inputs are reported in TB.
    """
    labels = ("B", "KB", "MB", "GB", "TB")
    value = float(nbytes)
    for label in labels[:-1]:
        # Written as "not >=" so the comparison mirrors the scaling condition
        # exactly (matters only for non-comparable floats such as NaN).
        if not value >= 1024:
            return f"{value:.2f} {label}"
        value /= 1024.0
    return f"{value:.2f} {labels[-1]}"

def file_sha256(path, bufsize=1024*1024):
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and fed to the hash in chunks of
    *bufsize* bytes, so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(bufsize), b""):
            digest.update(block)
    return digest.hexdigest()

def walk_dir(base_dir: str):
    """Build a file inventory of everything below *base_dir*.

    Returns a DataFrame with one row per file and the columns
    relpath, dir, top_level, filename, ext (lower-cased, "" if none),
    mimetype (guessed from the name, "" if unknown), size_bytes,
    size_human, mtime (local datetime) and abs_path.

    The columns are always present, even when no file was found, so callers
    can filter on e.g. ``df["ext"]`` without special-casing an empty scan.
    Files that cannot be inspected are reported on stdout and skipped.
    """
    columns = [
        "relpath", "dir", "top_level", "filename", "ext",
        "mimetype", "size_bytes", "size_human", "mtime", "abs_path",
    ]
    rows = []
    for root, _, files in os.walk(base_dir):
        for name in files:
            fpath = os.path.join(root, name)
            try:
                st = os.stat(fpath)
                size = st.st_size
                mtime = dt.datetime.fromtimestamp(st.st_mtime)
                ext = os.path.splitext(name)[1].lower()
                mimetype, _ = mimetypes.guess_type(fpath)
                rel = os.path.relpath(fpath, base_dir)
                # Files directly in base_dir have no parent folder: use ".".
                top_level = rel.split(os.sep)[0] if os.sep in rel else "."
                rows.append({
                    "relpath": rel,
                    "dir": os.path.dirname(rel),
                    "top_level": top_level,
                    "filename": name,
                    "ext": ext or "",
                    "mimetype": mimetype or "",
                    "size_bytes": size,
                    "size_human": human(size),
                    "mtime": mtime,
                    "abs_path": fpath
                })
            except Exception as e:
                # Best effort: one unreadable entry must not abort the scan.
                print(f"‚ö†Ô∏è  Could not stat {fpath}: {e}")
    # Passing the column list keeps the schema stable for an empty result;
    # without it an empty scan yields a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)

def summarize_df(df: pd.DataFrame):
    """Print an overview of the file inventory and save it as a CSV.

    Prints the file count and total size; for a non-empty inventory it also
    prints the 20 most common extensions, the 20 busiest top-level folders
    and the 20 largest files, then writes the whole inventory (sorted by
    extension, largest first) to ``file_inventory.csv`` in OUTPUT_DIR.
    """
    def top_counts(column):
        # Files per distinct value of `column`, most frequent first, capped at 20.
        per_value = df.groupby(column)["relpath"].count()
        return per_value.sort_values(ascending=False).head(20)

    print("\n===== BASIC SUMMARY =====")
    n_files = len(df)
    n_bytes = int(df["size_bytes"].sum()) if n_files else 0
    print(f"Files: {n_files:,}")
    print(f"Total size: {human(n_bytes)}")

    # Nothing else to report (or save) for an empty inventory.
    if not n_files:
        return

    print("\nTop extensions (by count):")
    print(top_counts("ext").to_string())

    print("\nTop subfolders (top_level) by file count:")
    print(top_counts("top_level").to_string())

    print("\nLargest 20 files:")
    by_size = df.sort_values("size_bytes", ascending=False)
    shown = by_size.head(20)[["relpath", "size_human", "mtime", "ext"]]
    print(shown.to_string(index=False))

    # Save CSV for deeper analysis
    csv_path = os.path.join(OUTPUT_DIR, "file_inventory.csv")
    ordered = df.sort_values(["ext", "size_bytes"], ascending=[True, False])
    ordered.to_csv(csv_path, index=False)
    print(f"\nüìÑ Inventory saved: {csv_path}")

def sample_xml(df: pd.DataFrame, max_n=5):
    """Stream-parse the largest XML files of the inventory and print a digest.

    For up to *max_n* files with an ``.xml`` extension (largest first) this
    prints the relative path and size, the first title-like heading found,
    and counts of structural tags (TITLE, CHAPTER, PART, SECTION, ...).

    An empty inventory, including one without any columns, is reported as
    "no XML files" instead of raising KeyError. Parse errors are printed per
    file and do not stop the remaining files from being sampled.
    """
    # Guard first: an empty frame may have no "ext" column to filter on.
    xml_df = df if df.empty else df[df["ext"].isin([".xml", ".XML"])]
    if xml_df.empty:
        print("\n(No XML files detected to sample.)")
        return

    # Count common CFR-ish structural tags heuristically. The pattern is
    # anchored at the end only, so any tag name ending in one of these words
    # is counted. Compiled once here rather than once per file.
    interesting = re.compile(r"(TITLE|CHAPTER|SUBCHAPTER|PART|SUBPART|SECTION|SECT|APPENDIX)$", re.I)
    heading_tags = {"TITLE", "TITLEHD", "DOCTITLE", "FRDOCTITLE"}

    print(f"\n===== XML SAMPLE PARSE (up to {max_n}) =====")
    for _, row in xml_df.sort_values("size_bytes", ascending=False).head(max_n).iterrows():
        f = row["abs_path"]
        rel = row["relpath"]
        try:
            # iterparse streams the document instead of loading it whole
            tag_counts = Counter()
            title_text = None

            for _, elem in ET.iterparse(f, events=("end",), recover=True):
                tag = elem.tag.split("}")[-1]  # strip namespace if present
                if interesting.search(tag):
                    tag_counts[tag.upper()] += 1
                # try capture a likely title/heading if present
                if title_text is None and tag.upper() in heading_tags:
                    title_text = (elem.text or "").strip()[:200]
                # Only tag names and the heading text are needed, so each
                # element can be emptied as soon as it has been seen.
                elem.clear()

            print(f"‚Ä¢ {rel}  ({row['size_human']})")
            if title_text:
                print(f"  - title/heading: {title_text}")
            if tag_counts:
                top_tags = ", ".join(f"{k}:{v}" for k, v in tag_counts.most_common(8))
                print(f"  - tag counts: {top_tags}")
        except Exception as e:
            print(f"  ‚ö†Ô∏è XML parse error for {rel}: {e}")

def sample_json(df: pd.DataFrame, max_n=5):
    """Load the largest JSON files of the inventory and print their shape.

    For up to *max_n* files with a ``.json`` extension (largest first) this
    prints the first 20 keys of a top-level object, the length of a top-level
    list, or the Python type of any other top-level value.

    An empty inventory, including one without any columns, is reported as
    "no JSON files" instead of raising KeyError. Parse errors are printed per
    file and do not stop the remaining files from being sampled.
    """
    # Guard first: an empty frame may have no "ext" column to filter on.
    json_df = df if df.empty else df[df["ext"].isin([".json", ".JSON"])]
    if json_df.empty:
        print("\n(No JSON files detected to sample.)")
        return

    print(f"\n===== JSON SAMPLE PARSE (up to {max_n}) =====")
    for _, row in json_df.sort_values("size_bytes", ascending=False).head(max_n).iterrows():
        f = row["abs_path"]
        rel = row["relpath"]
        try:
            # utf-8-sig reads plain UTF-8 and also strips a leading BOM,
            # which json.load() would otherwise reject.
            with open(f, "r", encoding="utf-8-sig") as fh:
                data = json.load(fh)
            if isinstance(data, dict):
                keys = list(data.keys())[:20]
                print(f"‚Ä¢ {rel}  ({row['size_human']})  keys: {keys}")
            elif isinstance(data, list):
                print(f"‚Ä¢ {rel}  ({row['size_human']})  list len: {len(data)}")
            else:
                print(f"‚Ä¢ {rel}  ({row['size_human']})  type: {type(data)}")
        except Exception as e:
            print(f"  ‚ö†Ô∏è JSON parse error for {rel}: {e}")

def hash_largest(df: pd.DataFrame, n=5):
    """Print a shortened SHA-256 for each of the *n* largest inventoried files.

    Does nothing for an empty inventory. Only the first 16 hex characters of
    each digest are shown; a file that cannot be hashed is reported and
    skipped.
    """
    if df.empty:
        return

    print(f"\n===== SHA256 of {n} Largest Files =====")
    biggest = df.sort_values("size_bytes", ascending=False).head(n)
    for _, entry in biggest.iterrows():
        target = entry["abs_path"]
        try:
            digest = file_sha256(target)
            print(f"‚Ä¢ {entry['relpath']}  {entry['size_human']}  sha256={digest[:16]}‚Ä¶")
        except Exception as err:
            print(f"  ‚ö†Ô∏è Could not hash {entry['relpath']}: {err}")

def main():
    """Run the full inspection of BASE_DIR.

    Exits with status 1 when BASE_DIR is not a directory. Otherwise scans it,
    prints the summary, samples XML and JSON files, hashes the largest files
    and, for a non-empty inventory, writes ``by_extension.csv`` and
    ``by_top_level.csv`` to OUTPUT_DIR.
    """
    if not os.path.isdir(BASE_DIR):
        print(f"‚ùå Directory not found: {BASE_DIR}")
        sys.exit(1)

    print(f"Scanning: {BASE_DIR}")
    inventory = walk_dir(BASE_DIR)
    summarize_df(inventory)
    sample_xml(inventory, max_n=SAMPLE_XML_TO_PARSE)
    sample_json(inventory, max_n=SAMPLE_JSON_TO_PARSE)
    hash_largest(inventory, n=HASH_LARGEST_N)

    if inventory.empty:
        return

    # Simple pivot tables saved for quick glance in Excel/Numbers
    ext_csv = os.path.join(OUTPUT_DIR, "by_extension.csv")
    top_csv = os.path.join(OUTPUT_DIR, "by_top_level.csv")
    for column, target in (("ext", ext_csv), ("top_level", top_csv)):
        counts = inventory.groupby(column)["relpath"].count()
        counts = counts.sort_values(ascending=False).rename("count").reset_index()
        counts.to_csv(target, index=False)

    print(f"\nüìÑ Extra summaries saved: {ext_csv} and by_top_level.csv")


if __name__ == "__main__":
    main()


Scanning: /Users/nithish/Desktop/USA/CFR-regulations

===== BASIC SUMMARY =====
Files: 245
Total size: 1.09 GB

Top extensions (by count):
ext
.xml    244
          1

Top subfolders (top_level) by file count:
top_level
title-40    37
title-26    22
title-7     15
title-50    13
title-12    10
title-21     9
title-29     9
title-49     9
title-46     9
title-48     7
title-32     6
title-42     5
title-45     5
title-24     5
title-47     5
title-17     5
title-14     5
title-20     4
title-41     4
title-10     4

Largest 20 files:
                            relpath size_human               mtime  ext
title-40/CFR-2023-title40-vol20.xml   12.00 MB 2025-07-09 19:15:52 .xml
 title-15/CFR-2023-title15-vol2.xml   10.68 MB 2025-07-09 19:15:40 .xml
 title-40/CFR-2023-title40-vol3.xml   10.02 MB 2025-07-09 19:15:52 .xml
title-40/CFR-2023-title40-vol32.xml    9.93 MB 2025-07-09 19:15:52 .xml
title-40/CFR-2023-title40-vol33.xml    9.66 MB 2025-07-09 19:15:54 .xml
 title-49/CFR-2023-title49-vo

In [7]:
import os
import lxml.etree as ET
import pandas as pd
import sqlite3

# Root directory holding the CFR XML files (machine-specific path).
BASE_DIR = "/Users/nithish/Desktop/USA/CFR-regulations"
# Extracted CSV/SQLite outputs are written here, inside BASE_DIR.
OUTPUT_DIR = os.path.join(BASE_DIR, "_processed")
# Create the output folder only when the input root really exists.
# os.makedirs() also creates missing parents, so an unconditional call would
# create BASE_DIR itself and let the run "succeed" with empty outputs for a
# mistyped path; this way the later save fails loudly instead.
if os.path.isdir(BASE_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# Filled by extract_sections(): one dict per <SECTION> element.
records = []

def extract_sections(xml_file):
    """Append one record per <SECTION> element of *xml_file* to ``records``.

    Each record holds the title token taken from the file name, the file
    name, the section number (<SECTNO>), the heading (<SUBJECT>) and the
    text of all <P> descendants joined by newlines.

    Any error while parsing is printed and ends processing of this file;
    records appended before the error are kept.
    """
    try:
        context = ET.iterparse(xml_file, events=("end",), recover=True)
        # e.g. CFR-2023-title40-vol1.xml -> title40. A file name that does not
        # follow that scheme gets an empty title instead of aborting the file.
        name_parts = os.path.basename(xml_file).split("-")
        current_title = name_parts[2] if len(name_parts) > 2 else ""
        for _, elem in context:
            tag = elem.tag.split("}")[-1].upper()
            if tag != "SECTION":
                # Do not clear here. "end" events fire for children before
                # their parent, so clearing every element would wipe
                # SECTNO/SUBJECT/P before the enclosing SECTION is read and
                # leave every record with empty fields.
                continue

            sectno = elem.findtext(".//SECTNO")
            subject = elem.findtext(".//SUBJECT")
            # itertext() also collects text inside and after inline child
            # elements; p.text alone stops at the first child element.
            paras = []
            for p in elem.findall(".//P"):
                para = "".join(p.itertext())
                if para:
                    paras.append(para)
            text = "\n".join(paras).strip()
            records.append({
                "title": current_title,
                "file": os.path.basename(xml_file),
                "section_number": (sectno or "").strip(),
                "heading": (subject or "").strip(),
                "text": text
            })

            # The section is fully captured: release it, and drop the siblings
            # that precede it in its parent so the tree stays small.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
    except Exception as e:
        print(f"‚ö†Ô∏è Error parsing {xml_file}: {e}")

# Walk all XMLs below BASE_DIR; extract_sections() appends to `records`.
for root, _, files in os.walk(BASE_DIR):
    for f in files:
        if f.lower().endswith(".xml"):
            fpath = os.path.join(root, f)
            extract_sections(fpath)

# Save to CSV
df = pd.DataFrame(records)
csv_path = os.path.join(OUTPUT_DIR, "cfr_sections.csv")
df.to_csv(csv_path, index=False)
print(f"‚úÖ Extracted {len(df)} sections ‚Üí {csv_path}")

# Save to SQLite (an existing "sections" table is replaced)
db_path = os.path.join(OUTPUT_DIR, "cfr_sections.db")
conn = sqlite3.connect(db_path)
try:
    df.to_sql("sections", conn, if_exists="replace", index=False)
finally:
    # Close the connection even when the write fails, so the database file
    # is not left open.
    conn.close()
print(f"‚úÖ Also saved to SQLite ‚Üí {db_path}")


‚úÖ Extracted 229637 sections ‚Üí /Users/nithish/Desktop/USA/CFR-regulations/_processed/cfr_sections.csv
‚úÖ Also saved to SQLite ‚Üí /Users/nithish/Desktop/USA/CFR-regulations/_processed/cfr_sections.db


In [None]:
import os
import time
import posixpath
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Root URL of the remote tree; relative paths are joined onto it with urljoin().
BASE = "https://static.case.law/"

# Request headers sent with every HTTP call made by this script.
# NOTE(review): the contact URL in the User-Agent looks like a placeholder -
# confirm and replace before running against the live site.
HEADERS = {
    "User-Agent": "MyCaselawScraper/0.4 (+https://yourdomain.example.com)"
}

# Toggle this: True = only download JSON files, False = download everything
ONLY_JSON = True

def list_dir(url, timeout=30):
    """Return the hrefs found on the directory page at *url*.

    The parent link ("../") and empty hrefs are skipped. Absolute hrefs that
    start with BASE are made relative to BASE, and leading slashes are
    removed, so every returned entry is a path without a leading "/".

    *timeout* is the number of seconds passed to ``requests.get``; without
    it a stalled connection would block the crawl indefinitely. The default
    matches the timeout used by ``download_file``.

    Raises ``requests.HTTPError`` for a 4xx/5xx response (via
    ``raise_for_status``).
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    links = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href or href == "../":
            continue
        # Normalize absolute hrefs
        if href.startswith(BASE):
            href = href[len(BASE):]
        href = href.lstrip("/")   # remove leading slash
        links.append(href)
    return links

def ensure_dir(path):
    """Create directory *path* (and any missing parents) if it does not exist.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window in which the directory can appear between check and creation.
    Raises ``FileExistsError`` if *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def download_file(url, dest_path, retries=3):
    """Download *url* to *dest_path*, retrying on failure.

    The body is streamed into ``<dest_path>.part`` and renamed to *dest_path*
    only after the transfer completed, so an interrupted download never
    leaves a truncated file under the final name (which a later run would
    mistake for a finished download).

    Returns True on success and False when all *retries* attempts failed.
    Errors are printed, not raised. The wait between attempts grows linearly
    (2 s, 4 s, ...); there is no wait after the last attempt.
    """
    tmp_path = f"{dest_path}.part"
    for attempt in range(retries):
        try:
            # The context manager releases the connection even when the
            # stream is abandoned half-way.
            with requests.get(url, headers=HEADERS, stream=True, timeout=30) as resp:
                resp.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(tmp_path, dest_path)
            return True
        except Exception as e:
            print(f"‚ö†Ô∏è Error downloading {url} (attempt {attempt+1}/{retries}): {e}")
            # Drop whatever was written so far; it is incomplete.
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing was written, or it is already gone
            if attempt + 1 < retries:
                time.sleep(2 * (attempt + 1))
    return False

def crawl_directory(rel_path="", local_dir="data"):
    """
    Recursively crawl and download from the given relative path.

    rel_path: path relative to BASE ("" means root)
    local_dir: local directory to mirror

    Links ending in "/" are treated as sub-directories and crawled
    recursively; everything else is treated as a file and downloaded unless
    it already exists locally (or is filtered out by ONLY_JSON).

    Links that would leave the mirrored tree are ignored: URLs with a scheme
    (other hosts), query-string and fragment links, and paths containing a
    ".." segment. Errors raised by list_dir() propagate to the caller.
    """
    full_url = urljoin(BASE, rel_path)
    links = list_dir(full_url)
    ensure_dir(local_dir)

    for href in links:
        href = href.lstrip("/")  # normalize

        # Stay inside BASE and inside local_dir. Hrefs are untrusted page
        # content; list_dir() has already made same-site absolute links
        # relative, so a remaining scheme means another host.
        if (
            not href
            or "://" in href
            or href.startswith(("?", "#", "mailto:"))
            or ".." in href.split("/")
        ):
            continue

        # Skip links that "jump back up" (avoid a2d/31/a2d/ loops)
        if rel_path and not href.startswith(rel_path) and "/" in href:
            continue

        # If href repeats rel_path (like 'a2d/31/'), reduce to just last part.
        # posixpath, not os.path: this is a URL path, which always uses "/".
        if rel_path and href.startswith(rel_path):
            href = posixpath.basename(href.rstrip("/")) + ("/" if href.endswith("/") else "")

        if href.endswith("/"):  # it's a directory
            sub_rel = posixpath.join(rel_path, href)
            sub_local = os.path.join(local_dir, href.rstrip("/"))
            crawl_directory(sub_rel, sub_local)
        else:  # it's a file
            if ONLY_JSON and not href.lower().endswith(".json"):
                continue  # skip non-JSON files if filter is enabled

            remote_file = posixpath.join(rel_path, href)
            local_file = os.path.join(local_dir, href)
            if os.path.exists(local_file):
                continue  # skip already downloaded
            print("‚¨áÔ∏è Downloading", remote_file)
            download_file(urljoin(BASE, remote_file), local_file)
            time.sleep(0.2)  # politeness delay

if __name__ == "__main__":
    # The metadata files are fetched explicitly first, then the whole tree
    # is crawled into the same local folder.
    ensure_dir("data")
    meta_files = ["ReportersMetadata.json", "VolumesMetadata.json", "JurisdictionsMetadata.json"]
    for mf in meta_files:
        print("‚¨áÔ∏è Downloading metadata:", mf)
        meta_target = os.path.join("data", mf)
        download_file(f"{BASE}{mf}", meta_target)

    # Then crawl everything else
    crawl_directory(rel_path="", local_dir="data")