In [1]:
import os
import time
import json
import math
from typing import Dict, Any, List, Optional

import requests

# ---------- CONSTANTS ----------

ADV_SEARCH_URL = "https://archive.org/advancedsearch.php"
METADATA_URL = "https://archive.org/metadata/"
DOWNLOAD_BASE_URL = "https://archive.org/download/"

HEADERS = {
    "User-Agent": "ArchiveScraper/1.0 (contact: youremail@example.com)"
}

# ---------- SEARCH HELPERS ----------

def search_archive(
    query: str,
    rows: int = 50,
    max_items: Optional[int] = None,
    sleep: float = 1.0,
) -> List[Dict[str, Any]]:
    """
    Use Internet Archive Advanced Search API to get a list of items for a query.

    :param query: Lucene-style query, e.g. '"service agreement"'
    :param rows: results per page (max ~50 is safe)
    :param max_items: stop after this many items (None => all)
    :param sleep: seconds to sleep between requests (be polite)
    :return: list of docs (each is a metadata dict with 'identifier', 'title', etc.)
    """
    all_docs: List[Dict[str, Any]] = []

    params = {
        "q": query,
        "output": "json",
        "rows": rows,
        "page": 1,
        # choose the fields you want back
        "fl[]": [
            "identifier",
            "title",
            "creator",
            "year",
            "mediatype",
            "collection"
        ],
    }

    while True:
        print(f"[search] Fetching page {params['page']} ...")
        resp = requests.get(ADV_SEARCH_URL, params=params, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        response = data.get("response", {})
        docs = response.get("docs", [])
        num_found = response.get("numFound", 0)

        if not docs:
            break

        for d in docs:
            all_docs.append(d)
            if max_items is not None and len(all_docs) >= max_items:
                print(f"[search] Reached max_items={max_items}")
                return all_docs

        # paging
        total_pages = math.ceil(num_found / rows)
        if params["page"] >= total_pages:
            break

        params["page"] += 1
        time.sleep(sleep)  # be nice to archive.org

    print(f"[search] Collected {len(all_docs)} items (of {num_found} total).")
    return all_docs


# ---------- METADATA + DOWNLOAD HELPERS ----------

def fetch_metadata(identifier: str) -> Dict[str, Any]:
    """Get full metadata for a given item identifier."""
    url = METADATA_URL + identifier
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    return resp.json()


def choose_text_file(files: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """
    Try to find the best text-like file (OCR) from the metadata "files" list.
    Preference order:
        1. format "DjvuTXT" (common OCR text)
        2. format "Text"
        3. any file ending with .txt
    """
    preferred_formats = ["DjvuTXT", "Text", "TXT"]

    # 1) Try preferred formats
    for fmt in preferred_formats:
        for f in files:
            if f.get("format") == fmt:
                return f

    # 2) Fallback: any .txt file
    for f in files:
        name = f.get("name", "").lower()
        if name.endswith(".txt"):
            return f

    return None


def choose_pdf_file(files: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Fallback: choose a PDF file if no text file exists."""
    pdf_formats = ["Text PDF", "PDF"]
    for fmt in pdf_formats:
        for f in files:
            if f.get("format") == fmt:
                return f

    # Any .pdf as last resort
    for f in files:
        name = f.get("name", "").lower()
        if name.endswith(".pdf"):
            return f
    return None


def download_file(identifier: str, file_info: Dict[str, Any], out_dir: str) -> str:
    """
    Download a single file from archive.org/download/{identifier}/{name}
    Returns local file path.
    """
    name = file_info["name"]
    url = f"{DOWNLOAD_BASE_URL}{identifier}/{name}"
    local_path = os.path.join(out_dir, name)

    if os.path.exists(local_path):
        print(f"[download] Already exists: {local_path}")
        return local_path

    print(f"[download] {url}")
    resp = requests.get(url, headers=HEADERS, timeout=60)
    resp.raise_for_status()

    os.makedirs(out_dir, exist_ok=True)
    with open(local_path, "wb") as f:
        f.write(resp.content)

    return local_path


# ---------- MAIN SCRAPE PIPELINE ----------

def scrape_query(
    query: str,
    base_output_dir: str = "archive_service_agreements",
    max_items: Optional[int] = None,
):
    """
    High-level function:
    - search for items
    - for each item, fetch metadata
    - download best text file (or PDF) if available
    - write a JSONL manifest with metadata + local paths
    """
    os.makedirs(base_output_dir, exist_ok=True)
    manifest_path = os.path.join(base_output_dir, "items.jsonl")

    docs = search_archive(query=query, rows=50, max_items=max_items)

    with open(manifest_path, "w", encoding="utf-8") as manifest_f:
        for idx, doc in enumerate(docs, start=1):
            identifier = doc["identifier"]
            print(f"\n=== [{idx}/{len(docs)}] {identifier} ===")

            try:
                meta = fetch_metadata(identifier)
            except Exception as e:
                print(f"[meta] Failed to fetch metadata for {identifier}: {e}")
                continue

            files = meta.get("files", []) or []

            text_file = choose_text_file(files)
            pdf_file = choose_pdf_file(files) if text_file is None else None

            item_dir = os.path.join(base_output_dir, identifier)
            os.makedirs(item_dir, exist_ok=True)

            text_path = None
            pdf_path = None

            try:
                if text_file:
                    text_path = download_file(identifier, text_file, item_dir)
                elif pdf_file:
                    pdf_path = download_file(identifier, pdf_file, item_dir)
                else:
                    print("[warn] No text/PDF file found for this item.")
            except Exception as e:
                print(f"[download] Error downloading files for {identifier}: {e}")

            # Build record for manifest
            record = {
                "identifier": identifier,
                "title": doc.get("title"),
                "creator": doc.get("creator"),
                "year": doc.get("year"),
                "mediatype": doc.get("mediatype"),
                "collection": doc.get("collection"),
                "details_url": f"https://archive.org/details/{identifier}",
                "downloaded_text": text_path,
                "downloaded_pdf": pdf_path,
            }

            manifest_f.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"\n[done] Wrote manifest to: {manifest_path}")


if __name__ == "__main__":
    # Same as: https://archive.org/search?query=%22service+agreement%22
    QUERY = '"service agreement"'

    # set max_items=None to fetch all; or a number like 200 while testing
    scrape_query(query=QUERY, base_output_dir="archive_service_agreements", max_items=200)


[search] Fetching page 1 ...
[search] Fetching page 2 ...
[search] Fetching page 3 ...
[search] Fetching page 4 ...
[search] Reached max_items=200

=== [1/200] rcgtn-Budget_Finance_Committee_-_August_10_2023 ===
[warn] No text/PDF file found for this item.

=== [2/200] Committee_on_Finance_12-6-2017 ===
[warn] No text/PDF file found for this item.

=== [3/200] lawin-Board_of_Public_Works_-_June_24_2021 ===
[warn] No text/PDF file found for this item.

=== [4/200] 6980330-BOE-Approved-ShotSpotter-Flex-Service-Agreement ===
[download] https://archive.org/download/6980330-BOE-Approved-ShotSpotter-Flex-Service-Agreement/6980330-BOE-Approved-ShotSpotter-Flex-Service-Agreement.pdf

=== [5/200] cgmn-Cottage_Grove_City_Council_Meeting_10-19-22 ===
[warn] No text/PDF file found for this item.

=== [6/200] BTWRLM121 ===
[warn] No text/PDF file found for this item.

=== [7/200] manualzilla-id-5970347 ===
[download] https://archive.org/download/manualzilla-id-5970347/5970347_djvu.txt

=== [8/200] 

KeyboardInterrupt: 

In [2]:
import requests
import json
from typing import Dict, List, Tuple

# =======================
# PASTE YOUR BRAINS HERE
# =======================
BRAINS = {
    "nda_brain": [
        "https://www.sec.gov/edgar/search-and-access",
        "https://www.sec.gov/edgar/search/#/q=nda&filter_forms=10-K",
        "https://github.com/ContractStandards/Contract-Clauses",
        "https://onenda.org/",
        "https://huggingface.co/datasets/atticus-project/cuad",
        "https://github.com/jamesacampbell/contracts",
        "https://github.com/alangrafu/legal-docs",
        "https://www.dol.gov/agencies/oasam/site-closures/nda",
        "https://archive.org/search?query=non-disclosure+agreement",
    ],
    "msa_brain": [
        "https://www.sec.gov/edgar/search-and-access",
        "https://www.sec.gov/edgar/search/#/q=%22master%20service%20agreement%22",
        "https://nvca.org/model-legal-documents/",
        "https://www.techcontracts.com/resources/",
        "https://huggingface.co/datasets/lex_glue",
        "https://content.next.westlaw.com/practical-law",
        "https://www.miamidade.gov/Apps/ContractSearch/",
        "https://www.data.gov/search?q=contract",
        "https://sam.gov/content/opportunities",
        "https://archive.org/search?query=%22service+agreement%22",
    ],
    "investigation_brain": [
        "https://www.cs.cmu.edu/~enron/",
        "https://www.sec.gov/enforcement",
        "https://www.justice.gov/news",
        "https://www.courtlistener.com/",
        "https://www.oig.dol.gov/reports.htm",
        "https://www.ftc.gov/legal-library/browse/cases-proceedings",
        "https://www.govinfo.gov/app/collection/chrg",
        "https://openjustice.doj.ca.gov/",
        "https://vault.fbi.gov/",
        "https://www.nist.gov/publications",
    ],
    "case_intelligence_brain": [
        "https://www.courtlistener.com/api/bulk-data/",
        "https://www.recapthelaw.org/",
        "https://www.supremecourt.gov/opinions/opinions.aspx",
        "https://www.uscourts.gov/court-records",
        "https://www.govinfo.gov/app/collection/uscourts",
        "https://vault.fbi.gov/",
        "https://case.law/",
        "https://opendata.cityofnewyork.us/",
    ],
    "new_york_module": [
        "https://public.leginfo.state.ny.us/laws/",
        "https://www.nycourts.gov/courts/appeals/decisions/",
        "https://nycourts.gov/courts/ad1/Decisions.shtml",
        "https://ag.ny.gov/press-releases",
        "https://opendata.cityofnewyork.us/",
        "https://www.nysenate.gov/transparency/contracts",
        "https://www.nyc.gov/site/law/publications/publications.page",
        "https://www.nycourts.gov/courthelp/UGC/UCC.shtml",
        "https://wwe1.osc.state.ny.us/transparency/contracts/contractsearch.cfm",
    ],
}

# =======================================
# CONFIG
# =======================================
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; LegalRAGLinkChecker/1.0)"
}
TIMEOUT = 10


def check_url(url: str):
    """Checks URL, returns (ok, status, error_msg)."""
    try:
        resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)
        status = resp.status_code
        if status < 400:
            return True, status, None
        return False, status, f"HTTP {status}"
    except requests.exceptions.RequestException as e:
        return False, None, str(e)


def process_brains(brains: Dict[str, List[str]]):
    working = {}
    failed = {}

    for brain, urls in brains.items():
        print(f"\nüîç Checking {brain}")
        working[brain] = []
        failed[brain] = []

        for url in urls:
            print(f"   ‚Üí {url} ... ", end="")
            ok, status, err = check_url(url)

            if ok:
                print(f"OK ({status})")
                working[brain].append(url)
            else:
                print(f"FAILED ({status}, {err})")
                failed[brain].append({
                    "url": url,
                    "status": status,
                    "error": err
                })

        if not failed[brain]:
            del failed[brain]

    return working, failed


if __name__ == "__main__":
    working_links, failed_links = process_brains(BRAINS)

    print("\n" + "=" * 80)
    print("‚úÖ WORKING LINKS (FORMATTED LIKE INPUT)")
    print("=" * 80)
    print(json.dumps(working_links, indent=4))

    print("\n" + "=" * 80)
    print("‚ùå FAILED LINKS (BRAIN-WISE)")
    print("=" * 80)
    print(json.dumps(failed_links, indent=4))



üîç Checking nda_brain
   ‚Üí https://www.sec.gov/edgar/search-and-access ... OK (200)
   ‚Üí https://www.sec.gov/edgar/search/#/q=nda&filter_forms=10-K ... OK (200)
   ‚Üí https://github.com/ContractStandards/Contract-Clauses ... FAILED (404, HTTP 404)
   ‚Üí https://onenda.org/ ... OK (200)
   ‚Üí https://huggingface.co/datasets/atticus-project/cuad ... FAILED (401, HTTP 401)
   ‚Üí https://github.com/jamesacampbell/contracts ... FAILED (404, HTTP 404)
   ‚Üí https://github.com/alangrafu/legal-docs ... FAILED (404, HTTP 404)
   ‚Üí https://www.dol.gov/agencies/oasam/site-closures/nda ... FAILED (404, HTTP 404)
   ‚Üí https://archive.org/search?query=non-disclosure+agreement ... OK (200)

üîç Checking msa_brain
   ‚Üí https://www.sec.gov/edgar/search-and-access ... OK (200)
   ‚Üí https://www.sec.gov/edgar/search/#/q=%22master%20service%20agreement%22 ... OK (200)
   ‚Üí https://nvca.org/model-legal-documents/ ... OK (200)
   ‚Üí https://www.techcontracts.com/resources/ ... OK (200

In [3]:
import os
import json
import math
import time
import requests
from typing import List, Dict, Any, Optional
from pypdf import PdfReader
from docx import Document

ADV_SEARCH_URL = "https://archive.org/advancedsearch.php"
METADATA_URL = "https://archive.org/metadata/"
DOWNLOAD_BASE_URL = "https://archive.org/download/"

HEADERS = {"User-Agent": "LegalRAGBot/1.0"}

def safe(s: str) -> str:
    return "".join(c for c in s if c.isalnum() or c in " ._-").strip()


# ----------------------------------------------
# SEARCH FUNCTION
# ----------------------------------------------
def search_archive(query: str, rows=100, sleep=1.0):
    all_docs = []
    page = 1

    while True:
        params = {
            "q": query,
            "output": "json",
            "rows": rows,
            "page": page,
            "fl[]": ["identifier", "title", "creator", "year"]
        }

        r = requests.get(ADV_SEARCH_URL, params=params, headers=HEADERS, timeout=30)
        r.raise_for_status()
        data = r.json()["response"]

        docs = data.get("docs", [])
        total = data.get("numFound", 0)

        if not docs:
            break

        all_docs.extend(docs)
        print(f"Fetched page {page}, total: {len(all_docs)}/{total}")

        total_pages = math.ceil(total / rows)
        if page >= total_pages:
            break

        page += 1
        time.sleep(sleep)

    return all_docs


# ----------------------------------------------
# DOWNLOAD FUNCTIONS
# ----------------------------------------------
def fetch_metadata(identifier: str):
    r = requests.get(METADATA_URL + identifier, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return r.json()


def find_text_or_pdf(files):
    txt = None
    pdf = None

    for f in files:
        fmt = f.get("format", "").lower()
        name = f.get("name", "").lower()

        if "djvutxt" in fmt or "text" in fmt or name.endswith(".txt"):
            txt = f

        if "pdf" in fmt or name.endswith(".pdf"):
            pdf = f

    return txt, pdf


def download(identifier, fileinfo, outdir):
    name = fileinfo["name"]
    url = f"{DOWNLOAD_BASE_URL}{identifier}/{name}"
    path = os.path.join(outdir, name)

    if not os.path.exists(path):
        print(f"Downloading: {url}")
        r = requests.get(url, headers=HEADERS, timeout=60)
        r.raise_for_status()

        with open(path, "wb") as f:
            f.write(r.content)

    return path


# ----------------------------------------------
# TEXT EXTRACTION
# ----------------------------------------------
def extract_pdf_text(pdf_path):
    text = []
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            content = page.extract_text() or ""
            text.append(content)
    except:
        return ""
    return "\n\n".join(text)


# ----------------------------------------------
# CREATE DOCX FOR EACH DOCUMENT
# ----------------------------------------------
def save_docx(title, text, outdir):
    doc_path = os.path.join(outdir, safe(title) + ".docx")
    doc = Document()

    # Bold title
    h = doc.add_heading(level=1)
    run = h.add_run(title)
    run.bold = True

    doc.add_paragraph(text or "No text found.")

    doc.save(doc_path)
    return doc_path


# ----------------------------------------------
# MAIN PIPELINE
# ----------------------------------------------
def scrape(query):
    output_root = "archive_scraped"
    os.makedirs(output_root, exist_ok=True)

    docs = search_archive(query)

    for d in docs:
        identifier = d["identifier"]
        title = d.get("title") or identifier
        folder_name = safe(title)
        folder = os.path.join(output_root, folder_name)
        os.makedirs(folder, exist_ok=True)

        print(f"\n=== {identifier} ===")

        meta = fetch_metadata(identifier)
        files = meta.get("files", [])

        txt_file, pdf_file = find_text_or_pdf(files)

        text_content = ""

        if txt_file:
            txt_path = download(identifier, txt_file, folder)
            text_content = open(txt_path, "r", errors="ignore").read()

        elif pdf_file:
            pdf_path = download(identifier, pdf_file, folder)
            text_content = extract_pdf_text(pdf_path)

        else:
            text_content = "No OCR text or PDF found."

        save_docx(title, text_content, folder)

    print("\nDONE.")


if __name__ == "__main__":
    scrape('"service agreement"')


Fetched page 1, total: 100/684
Fetched page 2, total: 200/684
Fetched page 3, total: 300/684
Fetched page 4, total: 400/684
Fetched page 5, total: 500/684
Fetched page 6, total: 600/684
Fetched page 7, total: 684/684

=== City.Council.04.17.2018 ===

=== City_of_Lathrop_City_Council_Meeting_November_3_2014 ===

=== cobral-Brewton_City_Council_9_9_2019 ===
Downloading: https://archive.org/download/cobral-Brewton_City_Council_9_9_2019/Brewton_City_Council_9_9_2019.es.vtt

=== coltcav-City_Council_Regular_Meeting_-_July_13th_2020 ===

=== cowilv-City_Council_Meeting_6-3-19 ===

=== toflaz-Town_Council_Meeting_June_18_2018 ===


KeyboardInterrupt: 