In [None]:
import json
import os
import time
import re

from filelock import FileLock
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager


def click_and_select_all(url: str,
                         processed_file: str,
                         scraper_id: int,
                         num_scrapers: int):
    # Initialize file lock for safe concurrent access
    lock = FileLock(processed_file + ".lock")
    if not os.path.exists(processed_file):
        with lock:
            with open(processed_file, "w", encoding="utf-8") as f:
                json.dump([], f)

    # Setup headless Chrome
    options = Options()
    options.headless = True
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)
        time.sleep(2)
        total = len(driver.find_elements(By.CLASS_NAME, "rc-item-title-div"))

        for idx in range(total):
            # Distribute work among scrapers
            if idx % num_scrapers != scraper_id:
                continue

            items = driver.find_elements(By.CLASS_NAME, "rc-item-title-div")
            if idx >= len(items):
                total = len(items)
                if idx >= total:
                    break

            item = items[idx]
            name = item.text.strip()

            # Check if already processed
            with lock:
                with open(processed_file, "r", encoding="utf-8") as f:
                    processed = json.load(f)
            if name in processed:
                print(f"[{idx+1}/{total}] Skipping processed: {name}")
                continue

            # Scroll into view and click
            driver.execute_script("arguments[0].scrollIntoView(true);", item)
            time.sleep(0.5)
            print(f"[{idx+1}/{total}] Processing: {name}")
            item.click()
            time.sleep(2)

            # Click fit-height icon if present
            try:
                fit_btn = driver.find_element(By.CLASS_NAME, "rc-fit-height-icon")
                fit_btn.click()
                time.sleep(0.5)
            except Exception:
                print("  ⚠️ Fit-height icon not found, continuing")

            # Iterate through select options
            select_elem = driver.find_element(By.ID, "rc-page-select")
            select = Select(select_elem)
            for opt_idx, opt in enumerate(select.options):
                select.select_by_index(opt_idx)
                option_text = opt.text.strip()
                print(f"    Selecting option: {option_text}")
                time.sleep(1.6)

                # Capture the tile surface as PNG
                try:
                    surface = driver.find_element(By.CLASS_NAME, "rc-ti-tile-surface")
                    safe_name = re.sub(r"[^\w\- ]", "", name).strip()
                    safe_opt = re.sub(r"[^\w\- ]", "", option_text).strip().replace(" ", "_")
                    out_dir = os.path.join(os.getcwd(), "proj_data2", safe_name)
                    os.makedirs(out_dir, exist_ok=True)
                    file_path = os.path.join(out_dir, f"{safe_opt}.png")
                    surface.screenshot(file_path)
                    print(f"      Saved screenshot: {file_path}")
                except Exception as e:
                    print(f"      ⚠️ Failed to capture surface: {e}")

            # Navigate back to the main listing
            driver.back()
            time.sleep(2)

            # Mark as processed
            with lock:
                with open(processed_file, "r", encoding="utf-8") as f:
                    
                    processed = json.load(f)
                processed.append(name)
                with open(processed_file, "w", encoding="utf-8") as f:
                    json.dump(processed, f, indent=2)

        print(f"All done for scraper {scraper_id}")

    finally:
        driver.quit()


In [None]:
def main():
    URL = "https://box2.nmtvault.com/Hennepin2/jsp/RcWebSearchResults.jsp?result_start=0&result_items=48&result_layout=GRID&result_sort=Publication%20Date&collection1=7083e412-1de2-42fe-b070-7f82e5c869a4&query1_modifier=AND&query1_field=DATE_PUBLISHED_MILLIS&query1_min=-915148800000&query1_max=-631152000000"
    PROCESSED_FILE = "processed_items.json"
    NUM_SCRAPERS = 6

    threads = []
    for i in range(NUM_SCRAPERS):
        t = threading.Thread(
            target=click_and_select_all,
            args=(URL, PROCESSED_FILE, i, NUM_SCRAPERS),
            daemon=True
        )
        t.start()
        threads.append(t)
    # Wait for all threads to finish
    for t in threads:
        t.join()

    print("All scrapers have completed.")

In [None]:
if __name__ == "__main__":
    main()

In [None]:
pip install google-cloud-vision google-auth tqdm

In [None]:
import os
import re
import json
from pathlib import Path

from google.cloud import vision
from google.oauth2 import service_account
from tqdm import tqdm

# ─── Text Parsing ───────────────────────────────────────────────────────────

def parse_entry(entry_text: str, year: int) -> dict:
    txt = re.sub(r'\s+', ' ', entry_text).strip()
    m = re.match(r'^(?P<name>[^,]+)', txt)
    name = m.group('name').strip() if m else None
    rest = txt[len(name):].strip(' ,') if name else txt
    parts = [p.strip() for p in rest.split(',') if p.strip()]

    spouse = address = residence = occupation = employer = None
    for part in parts:
        low = part.lower()
        if low.startswith('w. '):
            spouse = part[3:].strip()
        elif low.startswith(('h. ', 'res. ')):
            ind, _, addr = part.partition(' ')
            residence = ind.rstrip('.')
            address = addr or None
        elif ' at ' in low:
            occ, emp = re.split(r' at ', part, flags=re.I, maxsplit=1)
            occupation, employer = occ.strip(), emp.strip()
        elif not address:
            address = part
        else:
            occupation = (occupation + '; ' + part) if occupation else part

    return {
        "name": name,
        "spouse": spouse,
        "address": address,
        "residence_indicator": residence,
        "occupation": occupation,
        "employer": employer,
        "year": year
    }

# ─── OCR via Google Cloud Vision ────────────────────────────────────────────

def ocr_with_gcv(image_path: Path, client: vision.ImageAnnotatorClient) -> str:
    with open(image_path, 'rb') as img_file:
        content = img_file.read()
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    if response.error.message:
        raise RuntimeError(f"GCV error: {response.error.message}")
    return response.full_text_annotation.text

# ─── Folder Processing ──────────────────────────────────────────────────────

def process_year_folder(folder: Path, year: int, client: vision.ImageAnnotatorClient):
    entries = []
    for img_file in tqdm(sorted(folder.iterdir()), desc=f"Year {year}"):
        if img_file.suffix.lower() not in ('.png', '.jpg', '.jpeg', '.tif'):
            continue
        raw = ocr_with_gcv(img_file, client)
        chunks = [chunk for chunk in raw.split('\n\n') if chunk.strip()]
        for chunk in chunks:
            entries.append(parse_entry(chunk, year))
    return entries

# ─── Main Pipeline ───────────────────────────────────────────────────────────

def main():
    DATA_ROOT = Path("proj_data")      
    OUT_FILE  = "residents_gcv.json"

    # Initialize Google Cloud Vision client
    creds = service_account.Credentials.from_service_account_file(
        os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    )
    client = vision.ImageAnnotatorClient(credentials=creds)

    all_records = []
    year_pattern = re.compile(r'(\d{4})$')

    for subdir in sorted(DATA_ROOT.iterdir()):
        if not subdir.is_dir():
            continue
        m = year_pattern.search(subdir.name)
        if not m:
            print(f"Skipping folder (no year): {subdir.name}")
            continue
        year = int(m.group(1))
        print(f"\n→ Processing '{subdir.name}' (year {year})")
        recs = process_year_folder(subdir, year, client)
        print(f"   → Parsed {len(recs)} entries")
        all_records.extend(recs)

    with open(OUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(all_records, f, indent=2)

    print(f"Done! Total entries: {len(all_records)} → '{OUT_FILE}'")

if __name__ == '__main__':
    main()

In [None]:
from pathlib import Path
import re
import sys

# Adjust this if your top-level folder is named differently
ROOT = Path("proj_data")
YEAR_RE = re.compile(r".*?(\d{4})$")

if not ROOT.exists() or not ROOT.is_dir():
    print(f"Error: '{ROOT}' not found or is not a directory.")
    sys.exit(1)

for sub in sorted(ROOT.iterdir()):
    if not sub.is_dir():
        continue

    match = YEAR_RE.match(sub.name)
    if not match:
        print(f"Skipping (no year in name): '{sub.name}'")
        continue

    year = match.group(1)
    target = ROOT / year

    if target.exists():
        print(f"⚠️  Target already exists, skipping: '{target.name}'")
        continue

    try:
        print(f"Renaming '{sub.name}' → '{year}'")
        sub.rename(target)
    except Exception as e:
        print(f"Error renaming '{sub.name}': {e}")

print("Done! Directory names normalized.")

In [None]:
#!/usr/bin/env python3
"""
OCR & Parsing Pipeline using Google Cloud Vision (1900–1950)

Dependencies:
    pip install google-cloud-vision google-auth tqdm

Set your Google credentials:
    export GOOGLE_APPLICATION_CREDENTIALS="path/to/your-service-account.json"
"""

import os
import re
import json
from pathlib import Path

from google.cloud import vision
from google.oauth2 import service_account
from tqdm import tqdm

# ─── Text Parsing ───────────────────────────────────────────────────────────

def parse_entry(entry_text: str, year: int) -> dict:
    txt = re.sub(r'\s+', ' ', entry_text).strip()
    m = re.match(r'^(?P<name>[^,]+)', txt)
    name = m.group('name').strip() if m else None
    rest = txt[len(name):].strip(' ,') if name else txt
    parts = [p.strip() for p in rest.split(',') if p.strip()]

    spouse = address = residence = occupation = employer = None
    for part in parts:
        low = part.lower()
        if low.startswith('w. '):
            spouse = part[3:].strip()
        elif low.startswith(('h. ', 'res. ')):
            ind, _, addr = part.partition(' ')
            residence = ind.rstrip('.')
            address = addr or None
        elif ' at ' in low:
            occ, emp = re.split(r' at ', part, flags=re.I, maxsplit=1)
            occupation, employer = occ.strip(), emp.strip()
        elif not address:
            address = part
        else:
            occupation = (occupation + '; ' + part) if occupation else part

    return {
        "name": name,
        "spouse": spouse,
        "address": address,
        "residence_indicator": residence,
        "occupation": occupation,
        "employer": employer,
        "year": year
    }

# ─── OCR via Google Cloud Vision ────────────────────────────────────────────

def ocr_with_gcv(image_path: Path, client: vision.ImageAnnotatorClient) -> str:
    with open(image_path, 'rb') as img_file:
        content = img_file.read()
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    if response.error.message:
        raise RuntimeError(f"GCV error: {response.error.message}")
    return response.full_text_annotation.text

# ─── Folder Processing ──────────────────────────────────────────────────────

def process_year_folder(folder: Path, year: int, client: vision.ImageAnnotatorClient):
    entries = []
    for img_file in tqdm(sorted(folder.iterdir()), desc=f"Year {year}"):
        if img_file.suffix.lower() not in ('.png', '.jpg', '.jpeg', '.tif'):
            continue
        raw = ocr_with_gcv(img_file, client)
        chunks = [chunk for chunk in raw.split('\n\n') if chunk.strip()]
        for chunk in chunks:
            entries.append(parse_entry(chunk, year))
    return entries

# ─── Main Pipeline ───────────────────────────────────────────────────────────

def main():
    DATA_ROOT = Path("proj_data")      # top folder with subfolders “... 1900”, “... 1901”, …
    OUT_FILE  = "residents_gcv.json"

    # Initialize Google Cloud Vision client
    creds = service_account.Credentials.from_service_account_file(
        os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    )
    client = vision.ImageAnnotatorClient(credentials=creds)

    all_records = []
    year_pattern = re.compile(r'(\d{4})$')

    for subdir in sorted(DATA_ROOT.iterdir()):
        if not subdir.is_dir():
            continue
        m = year_pattern.search(subdir.name)
        if not m:
            print(f"Skipping folder (no year): {subdir.name}")
            continue
        year = int(m.group(1))
        print(f"\n→ Processing '{subdir.name}' (year {year})")
        recs = process_year_folder(subdir, year, client)
        print(f"   → Parsed {len(recs)} entries")
        all_records.extend(recs)

    with open(OUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(all_records, f, indent=2)

    print(f"\n✅ Done! Total entries: {len(all_records)} → '{OUT_FILE}'")

if __name__ == '__main__':
    main()


In [None]:
conda install -c conda-forge pillow pytesseract tqdm


In [None]:
#!/usr/bin/env python3
import os
import re
import json
from pathlib import Path

from PIL import Image, ImageOps
import pytesseract
from tqdm import tqdm

# ─── Configuration ─────────────────────────────────────────────────────────

DATA_ROOT   = Path("proj_data")     # top‑level folder containing your year‑dirs
OUTPUT_FILE = "residents.json"      # where we'll save the array of entries

# simple decade thresholds if you want to tune per‑era
DECADE_THRESH = {
    (1850, 1879): 150,
    (1880, 1899): 170,
    (1900, 1950): 180,
}

# ─── Helpers ────────────────────────────────────────────────────────────────

def get_threshold(year: int) -> int:
    for (start, end), t in DECADE_THRESH.items():
        if start <= year <= end:
            return t
    return 180

def preprocess_image(path: Path, threshold: int) -> Image.Image:
    """
    Grayscale → point threshold → invert → return PIL.Image
    threshold in [0..255]
    """
    img = Image.open(path).convert("L")
    # any pixel < threshold → white; else → black
    bw  = img.point(lambda p: 255 if p < threshold else 0, mode="L")
    inv = ImageOps.invert(bw)
    return inv

def ocr_text(img: Image.Image) -> str:
    """
    Run Tesseract on a PIL image. psm 6 = assume a block of text.
    """
    return pytesseract.image_to_string(img, config="--psm 6")

def parse_entry(txt: str, year: int) -> dict:
    """
    From one block of text (one resident line), extract fields.
    """
    s = re.sub(r"\s+", " ", txt).strip()
    m = re.match(r"^(?P<name>[^,]+)", s)
    name = m.group("name").strip() if m else None
    rest = s[len(name):].strip(" ,") if name else s
    parts = [p.strip() for p in rest.split(",") if p.strip()]

    spouse = address = residence = occupation = employer = None
    for p in parts:
        lp = p.lower()
        if lp.startswith("w. "):
            spouse = p[3:].strip()
        elif lp.startswith(("h. ", "res. ")):
            ind, _, addr = p.partition(" ")
            residence = ind.rstrip(".")
            address   = addr or None
        elif " at " in lp:
            occ, emp = re.split(r" at ", p, flags=re.I, maxsplit=1)
            occupation, employer = occ.strip(), emp.strip()
        elif not address:
            address = p
        else:
            occupation = (occupation + "; " + p) if occupation else p

    return {
        "name": name,
        "spouse": spouse,
        "address": address,
        "residence_indicator": residence,
        "occupation": occupation,
        "employer": employer,
        "year": year
    }

# ─── Main Pipeline ─────────────────────────────────────────────────────────

def main():
    all_entries = []
    year_re     = re.compile(r"(\d{4})$")

    # iterate over every subfolder
    for sub in sorted(DATA_ROOT.iterdir()):
        if not sub.is_dir(): 
            continue

        # find trailing 4‑digit year
        m = year_re.search(sub.name)
        if not m:
            print(f"Skipping (no year): {sub.name}")
            continue

        year  = int(m.group(1))
        thresh = get_threshold(year)
        print(f"\n→ Year {year}  (threshold={thresh})")

        # OCR each image in that folder
        for img_file in tqdm(sorted(sub.iterdir()), desc=f"Y{year}"):
            if img_file.suffix.lower() not in (".png", ".jpg", ".jpeg", ".tif"):
                continue

            # 1) preprocess & OCR
            pil_img = preprocess_image(img_file, thresh)
            raw_txt = ocr_text(pil_img)

            # 2) split into blocks by blank lines
            blocks = [blk for blk in raw_txt.split("\n\n") if blk.strip()]
            for blk in blocks:
                entry = parse_entry(blk, year)
                all_entries.append(entry)

    # write JSON locally
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(all_entries, f, indent=2)

    print(f"Complete! Total entries: {len(all_entries)}")
    print(f"JSON written to '{OUTPUT_FILE}'")

if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python3
import os
import re
import json
from pathlib import Path

import cv2
from PIL import Image
import pytesseract
from tqdm import tqdm

# ─── Enhanced Preprocessing ────────────────────────────────────────────────

def deskew_image(gray):
    # threshold and find text contours for angle
    _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    coords = cv2.findNonZero(bw)
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = 90 + angle
    (h, w) = gray.shape
    M = cv2.getRotationMatrix2D((w/2, h/2), angle, 1.0)
    return cv2.warpAffine(gray, M, (w, h),
                          flags=cv2.INTER_CUBIC,
                          borderMode=cv2.BORDER_REPLICATE)

def enhanced_preprocess(path, threshold=180):
    """
    1. Read color → convert to grayscale
    2. Bilateral filter for noise removal
    3. Deskew based on text angle
    4. Adaptive thresholding
    5. Morphological opening
    6. Invert to dark text on light background
    """
    img = cv2.imread(str(path), cv2.IMREAD_COLOR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    den = cv2.bilateralFilter(gray, 9, 75, 75)
    desk = deskew_image(den)
    thr = cv2.adaptiveThreshold(
        desk, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=15,
        C=2
    )
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,1))
    opened = cv2.morphologyEx(thr, cv2.MORPH_OPEN, kernel)
    inv = cv2.bitwise_not(opened)
    return Image.fromarray(inv)

# ─── OCR & Parsing Helpers ─────────────────────────────────────────────────

def ocr_text(img):
    return pytesseract.image_to_string(img, config="--psm 6")

def parse_entry(txt, year):
    s = re.sub(r"\s+", " ", txt).strip()
    m = re.match(r"^(?P<name>[^,]+)", s)
    name = m.group("name").strip() if m else None
    rest = s[len(name):].strip(" ,") if name else s
    parts = [p.strip() for p in rest.split(",") if p.strip()]

    spouse = address = residence = occupation = employer = None
    for p in parts:
        lp = p.lower()
        if lp.startswith("w. "):
            spouse = p[3:].strip()
        elif lp.startswith(("h. ", "res. ")):
            ind, _, addr = p.partition(" ")
            residence = ind.rstrip(".")
            address   = addr or None
        elif " at " in lp:
            occ, emp = re.split(r" at ", p, flags=re.I, maxsplit=1)
            occupation, employer = occ.strip(), emp.strip()
        elif not address:
            address = p
        else:
            occupation = (occupation + "; " + p) if occupation else p

    return {
        "name": name,
        "spouse": spouse,
        "address": address,
        "residence_indicator": residence,
        "occupation": occupation,
        "employer": employer,
        "year": year
    }

# ─── Main ───────────────────────────────────────────────────────────────────

def main():
    DATA_FOLDER = Path("proj_data/1921")
    OUTPUT_FILE = "residents_1921.json"

    if not DATA_FOLDER.is_dir():
        print(f"Error: folder not found: {DATA_FOLDER}")
        return

    # extract year from folder name
    m = re.search(r"(\d{4})$", DATA_FOLDER.name)
    year = int(m.group(1)) if m else None
    if not year:
        print("Error: could not determine year from folder name.")
        return

    records = []
    for img_path in tqdm(sorted(DATA_FOLDER.iterdir()), desc=f"Year {year}"):
        if img_path.suffix.lower() not in (".png", ".jpg", ".jpeg", ".tif"):
            continue

        # enhanced preprocess + OCR
        pil_img = enhanced_preprocess(img_path, threshold=180)
        raw_txt = ocr_text(pil_img)

        # split and parse each block
        for blk in [b for b in raw_txt.split("\n\n") if b.strip()]:
            rec = parse_entry(blk, year)
            records.append(rec)

    # write out JSON
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    print(f"\n✅ Done: {len(records)} entries written to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


In [None]:
import json
import os
import time
import re
import threading

from filelock import FileLock
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager


def click_and_select_all(url: str,
                         processed_file: str,
                         scraper_id: int,
                         num_scrapers: int):
    lock = FileLock(processed_file + ".lock")
    if not os.path.exists(processed_file):
        with lock:
            with open(processed_file, "w") as f:
                json.dump([], f)

    options = Options()
    options.headless = True
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)
        time.sleep(2)
        items = driver.find_elements(By.CLASS_NAME, "rc-item-title-div")
        total = len(items)

        for idx in range(total):
            if idx % num_scrapers != scraper_id:
                continue

            items = driver.find_elements(By.CLASS_NAME, "rc-item-title-div")
            if idx >= len(items):
                break

            item = items[idx]
            name = item.text.strip()

            with lock:
                processed = json.load(open(processed_file, "r"))
            if name in processed:
                print(f"[{idx+1}/{total}] Skipping: {name}")
                continue

            # click the item
            driver.execute_script("arguments[0].scrollIntoView(true);", item)
            time.sleep(0.5)
            print(f"[{idx+1}/{total}] Processing: {name}")
            item.click()
            time.sleep(2)

            # ——— scroll pane from top→bottom in steps ———
            try:
                pane = WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.CLASS_NAME, "rc-thumbnail-pane"))
                )
                # bring pane into view
                driver.execute_script("arguments[0].scrollIntoView(true);", pane)
                time.sleep(0.5)

                total_h = driver.execute_script("return arguments[0].scrollHeight;", pane)
                view_h  = driver.execute_script("return arguments[0].clientHeight;", pane)

                # step through in viewport-sized increments
                for y in range(0, total_h, view_h):
                    driver.execute_script(f"arguments[0].scrollTop = {y};", pane)
                    time.sleep(0.5)

                # ensure at very bottom
                driver.execute_script(f"arguments[0].scrollTop = {total_h};", pane)
                time.sleep(0.5)

                # now collect all thumbnails
                thumbs = pane.find_elements(By.CLASS_NAME, "rc-thumbnail")
                pg_idxs = []
                for thumb in thumbs:
                    try:
                        src = thumb.find_element(By.TAG_NAME, "img") \
                                   .get_attribute("src")
                        m = re.search(r"[?&]pg_idx=(\d+)", src)
                        if m:
                            pg_idxs.append(m.group(1))
                    except Exception:
                        continue

                # save out to name.json
                safe = re.sub(r"[^\w\- ]", "", name).strip().replace(" ", "_")
                out_dir = os.path.join(os.getcwd(), "proj_data2", safe)
                os.makedirs(out_dir, exist_ok=True)
                with open(os.path.join(out_dir, f"{safe}.json"), "w") as jf:
                    json.dump(pg_idxs, jf, indent=2)
                print(f"    ▶ Saved pg_idx list for {name}")

            except TimeoutException:
                print("    ⚠️ Pane never appeared – skipping pg_idx")
            except Exception as e:
                print(f"    ⚠️ Error extracting pg_idx: {e}")
            # ——————————————————————————————————————

            # rest of your existing fit‑height + screenshot logic…
            try:
                btn = driver.find_element(By.CLASS_NAME, "rc-fit-height-icon")
                btn.click(); time.sleep(0.5)
            except: pass

            try:
                sel = Select(driver.find_element(By.ID, "rc-page-select"))
                for i_opt, opt in enumerate(sel.options):
                    sel.select_by_index(i_opt)
                    time.sleep(1.6)
                    surf = driver.find_element(By.CLASS_NAME, "rc-ti-tile-surface")
                    opt_name = re.sub(r"[^\w\- ]", "", opt.text).strip().replace(" ", "_")
                    surf.screenshot(os.path.join(out_dir, f"{opt_name}.png"))
            except Exception as e:
                print(f"    ⚠️ Screenshot error: {e}")

            driver.back()
            time.sleep(2)

            with lock:
                processed.append(name)
                with open(processed_file, "w") as f:
                    json.dump(processed, f, indent=2)

        print(f"Scraper {scraper_id} done")

    finally:
        driver.quit()


def main():
    URL = (
        "https://box2.nmtvault.com/Hennepin2/jsp/RcWebSearchResults.jsp"
        "?result_start=0&result_items=48&result_layout=GRID"
        "&result_sort=Publication%20Date"
        "&collection1=7083e412-1de2-42fe-b070-7f82e5c869a4"
        "&query1_modifier=AND"
        "&query1_field=DATE_PUBLISHED_MILLIS"
        "&query1_min=-915148800000"
        "&query1_max=-631152000000"
    )
    PROCESSED = "processed_items.json"
    NUM = 6

    threads = []
    for i in range(NUM):
        t = threading.Thread(
            target=click_and_select_all,
            args=(URL, PROCESSED, i, NUM),
            daemon=True
        )
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    print("All scrapers finished.")


if __name__ == "__main__":
    main()


In [None]:
import json
import os
import time
import re
import threading

from filelock import FileLock
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager


def click_and_select_all(url: str,
                         processed_file: str,
                         scraper_id: int,
                         num_scrapers: int):
    lock = FileLock(processed_file + ".lock")
    if not os.path.exists(processed_file):
        with lock:
            with open(processed_file, "w", encoding="utf-8") as f:
                json.dump([], f)

    # Run in headed mode with DevTools open for debugging
    options = Options()
    options.headless = False
    options.add_argument("--auto-open-devtools-for-tabs")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)
        time.sleep(2)

        items = driver.find_elements(By.CLASS_NAME, "rc-item-title-div")
        total = len(items)

        for idx in range(total):
            # distribute across scrapers
            if idx % num_scrapers != scraper_id:
                continue

            items = driver.find_elements(By.CLASS_NAME, "rc-item-title-div")
            if idx >= len(items):
                break

            item = items[idx]
            name = item.text.strip()

            # skip already processed
            with lock:
                processed = json.load(open(processed_file, "r"))
            if name in processed:
                print(f"[{idx+1}/{total}] Skipping: {name}")
                continue

            # scroll this item into view & click
            driver.execute_script("arguments[0].scrollIntoView(true);", item)
            time.sleep(0.5)
            print(f"[{idx+1}/{total}] Processing: {name}")
            item.click()
            time.sleep(2)

            # ——— Debug: scroll thumbnail pane from top to bottom ———
            try:
                pane = WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.CLASS_NAME, "rc-thumbnail-pane"))
                )
                # bring the pane itself into view
                driver.execute_script("arguments[0].scrollIntoView(true);", pane)
                time.sleep(0.5)

                total_h = driver.execute_script("return arguments[0].scrollHeight;", pane)
                view_h  = driver.execute_script("return arguments[0].clientHeight;", pane)
                print(f"DEBUG: pane.scrollHeight={total_h}, pane.clientHeight={view_h}")

                # scroll in increments of viewport height
                for y in range(0, total_h + view_h, view_h):
                    driver.execute_script("arguments[0].scrollTop = arguments[1];", pane, y)
                    # dispatch a real scroll event for lazy‑load
                    driver.execute_script("""
                        let ev = new Event('scroll', { bubbles: true });
                        arguments[0].dispatchEvent(ev);
                    """, pane)
                    curr = driver.execute_script("return arguments[0].scrollTop;", pane)
                    print(f"DEBUG: scrolled to {curr}")
                    time.sleep(0.5)

                # ensure at absolute bottom
                driver.execute_script("arguments[0].scrollTop = arguments[1];", pane, total_h)
                driver.execute_script("""
                    let ev = new Event('scroll', { bubbles: true });
                    arguments[0].dispatchEvent(ev);
                """, pane)
                print(f"DEBUG: forced to bottom, scrollTop={driver.execute_script('return arguments[0].scrollTop;', pane)}")
                time.sleep(0.5)

                # collect all thumbnails now
                thumbs = pane.find_elements(By.CLASS_NAME, "rc-thumbnail")
                pg_idxs = []
                for thumb in thumbs:
                    try:
                        src = thumb.find_element(By.TAG_NAME, "img").get_attribute("src")
                        m = re.search(r"[?&]pg_idx=(\d+)", src)
                        if m:
                            pg_idxs.append(m.group(1))
                    except Exception:
                        continue

                # write pg_idx list to name.json
                safe = re.sub(r"[^\w\- ]", "", name).strip().replace(" ", "_")
                out_dir = os.path.join(os.getcwd(), "proj_data2", safe)
                os.makedirs(out_dir, exist_ok=True)
                json_path = os.path.join(out_dir, f"{safe}.json")
                with open(json_path, "w", encoding="utf-8") as jf:
                    json.dump(pg_idxs, jf, indent=2)
                print(f"    ▶ Saved pg_idx list: {json_path}")

            except TimeoutException:
                print("    ⚠️ Thumbnail pane never appeared – skipping pg_idx extraction")
            except Exception as e:
                print(f"    ⚠️ Error during pg_idx extraction: {e}")
            # ——————————————————————————————————————————————

            # now your existing fit‑height & screenshot logic
            try:
                fit_btn = driver.find_element(By.CLASS_NAME, "rc-fit-height-icon")
                fit_btn.click()
                time.sleep(0.5)
            except Exception:
                pass

            try:
                select = Select(driver.find_element(By.ID, "rc-page-select"))
                for i_opt, opt in enumerate(select.options):
                    select.select_by_index(i_opt)
                    option_text = opt.text.strip()
                    print(f"    Selecting option: {option_text}")
                    time.sleep(1.6)

                    surface = driver.find_element(By.CLASS_NAME, "rc-ti-tile-surface")
                    safe_opt = re.sub(r"[^\w\- ]", "", option_text).strip().replace(" ", "_")
                    surface.screenshot(os.path.join(out_dir, f"{safe_opt}.png"))
            except Exception as e:
                print(f"    ⚠️ Screenshot error: {e}")

            # return and mark processed
            driver.back()
            time.sleep(2)
            with lock:
                processed.append(name)
                with open(processed_file, "w", encoding="utf-8") as f:
                    json.dump(processed, f, indent=2)

        print(f"Scraper {scraper_id} done")

    finally:
        driver.quit()


def main():
    URL = (
        "https://box2.nmtvault.com/Hennepin2/jsp/RcWebSearchResults.jsp"
        "?result_start=0&result_items=48&result_layout=GRID"
        "&result_sort=Publication%20Date"
        "&collection1=7083e412-1de2-42fe-b070-7f82e5c869a4"
        "&query1_modifier=AND"
        "&query1_field=DATE_PUBLISHED_MILLIS"
        "&query1_min=-915148800000"
        "&query1_max=-631152000000"
    )
    PROCESSED = "processed_items.json"
    NUM = 6

    threads = []
    for i in range(NUM):
        t = threading.Thread(
            target=click_and_select_all,
            args=(URL, PROCESSED, i, NUM),
            daemon=True
        )
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    print("All scrapers finished.")


if __name__ == "__main__":
    main()


In [None]:
conda install -c conda-forge opencv -y

In [None]:
# High-Accuracy OCR for 1900 Minneapolis City Directory
# Optimized for Google Colab with GPU acceleration

import json
import re
import cv2
import numpy as np
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Install required packages
import subprocess
import sys

def install_packages():
    """Install all required packages for OCR processing"""
    packages = [
        "easyocr",
        "paddlepaddle",
        "paddleocr",
        "pytesseract",
        "opencv-python",
        "Pillow",
        "transformers",
        "torch",
        "torchvision"
    ]
    
    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except:
            print(f"Warning: Could not install {package}")

# Install packages
print("Installing required packages...")
install_packages()

# Import OCR libraries
import easyocr
import pytesseract
from paddleocr import PaddleOCR

class HistoricalDirectoryOCR:
    """
    Advanced OCR processor specifically designed for historical city directories
    with multiple OCR engines for maximum accuracy
    """
    
    def __init__(self):
        print("Initializing OCR engines...")
        
        try:
            # Initialize multiple OCR engines for ensemble processing
            self.easyocr_reader = easyocr.Reader(['en'], gpu=True)
            print("✅ EasyOCR initialized")
        except Exception as e:
            print(f"⚠️ EasyOCR initialization failed: {e}")
            self.easyocr_reader = None
        
        logging.getLogger("paddleocr").setLevel(logging.WARNING)

        try:
            # show_log has been removed – just omit it
            self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang='en')
            print("✅ PaddleOCR initialized")
        except Exception as e:
            print(f"⚠️ PaddleOCR initialization failed: {e}")
            self.paddle_ocr = None
       
        # Configure Tesseract for historical documents
        self.tesseract_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:()&-\' '
        
        print("OCR engines initialized successfully!")
    
    def preprocess_image(self, image_path):
        """
        Advanced image preprocessing for historical documents
        """
        print("Preprocessing image for optimal OCR...")
        
        # Load image
        img = cv2.imread(image_path)
        original = img.copy()
        
        # Convert to PIL for advanced processing
        pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        
        # Enhance image quality
        enhancer = ImageEnhance.Contrast(pil_img)
        pil_img = enhancer.enhance(1.5)
        
        enhancer = ImageEnhance.Sharpness(pil_img)
        pil_img = enhancer.enhance(1.2)
        
        enhancer = ImageEnhance.Brightness(pil_img)
        pil_img = enhancer.enhance(1.1)
        
        # Convert back to OpenCV format
        img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
        
        # Noise reduction
        img = cv2.medianBlur(img, 3)
        
        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        # Adaptive thresholding for better text extraction
        processed = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )
        
        # Morphological operations to clean up text
        kernel = np.ones((1,1), np.uint8)
        processed = cv2.morphologyEx(processed, cv2.MORPH_CLOSE, kernel)
        processed = cv2.morphologyEx(processed, cv2.MORPH_OPEN, kernel)
        
        return original, processed
    
    def extract_text_easyocr(self, image):
        """Extract text using EasyOCR with confidence scoring"""
        if not self.easyocr_reader:
            return []
            
        try:
            results = self.easyocr_reader.readtext(image, detail=1, paragraph=False)
            text_blocks = []
            
            for (bbox, text, conf) in results:
                if conf > 0.3:  # Filter low confidence detections
                    x1, y1 = int(bbox[0][0]), int(bbox[0][1])
                    x2, y2 = int(bbox[2][0]), int(bbox[2][1])
                    text_blocks.append({
                        'text': text.strip(),
                        'bbox': [x1, y1, x2, y2],
                        'confidence': conf,
                        'method': 'easyocr'
                    })
            
            return text_blocks
        except Exception as e:
            print(f"EasyOCR error: {e}")
            return []
    
    def extract_text_paddle(self, image):
        """Extract text using PaddleOCR"""
        if not self.paddle_ocr:
            return []
            
        try:
            results = self.paddle_ocr.ocr(image, cls=True)
            text_blocks = []
            
            if results and results[0]:
                for item in results[0]:
                    if len(item) >= 2:
                        bbox_points = item[0]
                        text_info = item[1]
                        
                        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
                            text = text_info[0]
                            conf = text_info[1]
                            
                            if conf > 0.3:
                                # Convert bbox to x1,y1,x2,y2 format
                                x_coords = [point[0] for point in bbox_points]
                                y_coords = [point[1] for point in bbox_points]
                                x1, y1 = int(min(x_coords)), int(min(y_coords))
                                x2, y2 = int(max(x_coords)), int(max(y_coords))
                                
                                text_blocks.append({
                                    'text': text.strip(),
                                    'bbox': [x1, y1, x2, y2],
                                    'confidence': conf,
                                    'method': 'paddle'
                                })
            
            return text_blocks
        except Exception as e:
            print(f"PaddleOCR error: {e}")
            return []
    
    def extract_text_tesseract(self, image):
        """Extract text using Tesseract OCR"""
        try:
            # Get detailed data from Tesseract
            data = pytesseract.image_to_data(image, config=self.tesseract_config, output_type=pytesseract.Output.DICT)
            text_blocks = []
            
            for i in range(len(data['text'])):
                text = data['text'][i].strip()
                conf = int(data['conf'][i])
                
                if text and conf > 30:  # Filter low confidence
                    x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
                    text_blocks.append({
                        'text': text,
                        'bbox': [x, y, x + w, y + h],
                        'confidence': conf / 100.0,
                        'method': 'tesseract'
                    })
            
            return text_blocks
        except Exception as e:
            print(f"Tesseract error: {e}")
            return []
    
    def ensemble_ocr(self, image):
        """
        Combine results from multiple OCR engines for maximum accuracy
        """
        print("Running ensemble OCR with multiple engines...")
        
        all_blocks = []
        
        # Extract text with all OCR engines
        easyocr_blocks = self.extract_text_easyocr(image)
        paddle_blocks = self.extract_text_paddle(image)
        tesseract_blocks = self.extract_text_tesseract(image)
        
        all_blocks.extend(easyocr_blocks)
        all_blocks.extend(paddle_blocks)
        all_blocks.extend(tesseract_blocks)
        
        print(f"Found {len(all_blocks)} text blocks across all OCR engines")
        
        # Sort by vertical position for proper reading order
        all_blocks.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]))
        
        return all_blocks
    
    def parse_directory_entry(self, text_line):
        """
        Parse a single directory entry into structured data
        Handles various formats found in 1900 city directories
        """
        entry = {
            "FirstName": None,
            "LastName": None, 
            "Spouse": None,
            "Occupation": None,
            "CompanyName": None,
            "HomeAddress": {
                "StreetNumber": None,
                "StreetName": None,
                "ApartmentOrUnit": None,
                "ResidenceIndicator": None
            },
            "WorkAddress": None,
            "Telephone": None,
            "DirectoryName": "Minneapolis 1900",
            "PageNumber": None
        }
        
        # Clean the text
        text = re.sub(r'\s+', ' ', text_line.strip())
        
        # Pattern for typical directory entry
        # Format: LastName FirstName (spouse) occupation, company, address
        
        # Extract name (usually at the beginning)
        name_match = re.match(r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]*)*)\s+([A-Z][a-z]*(?:\s+[A-Z])?)', text)
        if name_match:
            entry["LastName"] = name_match.group(1)
            entry["FirstName"] = name_match.group(2)
        
        # Extract spouse (usually in parentheses)
        spouse_match = re.search(r'\(([^)]+)\)', text)
        if spouse_match:
            entry["Spouse"] = spouse_match.group(1)
        
        # Extract address (numbers followed by street names)
        address_match = re.search(r'(\d+)\s+([A-Za-z\s]+(?:av|Ave|st|St|Ave|avenue|street|rd|Rd|road))', text)
        if address_match:
            entry["HomeAddress"]["StreetNumber"] = address_match.group(1)
            entry["HomeAddress"]["StreetName"] = address_match.group(2).strip()
            entry["HomeAddress"]["ResidenceIndicator"] = "h"
        
        # Extract apartment/unit info
        apt_match = re.search(r'apt\s*(\d+|[A-Z])', text, re.IGNORECASE)
        if apt_match:
            entry["HomeAddress"]["ApartmentOrUnit"] = f"apt {apt_match.group(1)}"
        
        # Extract occupation (common occupations in 1900)
        occupations = [
            'clerk', 'salesman', 'carpenter', 'laborer', 'machinist', 'engineer',
            'teacher', 'physician', 'lawyer', 'merchant', 'blacksmith', 'tailor',
            'shoemaker', 'baker', 'barber', 'painter', 'plumber', 'electrician'
        ]
        
        for occ in occupations:
            if occ in text.lower():
                entry["Occupation"] = occ.title()
                break
        
        return entry
    
    def group_text_into_lines(self, text_blocks, line_threshold=20):
        """
        Group text blocks into logical lines based on vertical positioning
        """
        if not text_blocks:
            return []
        
        # Sort by vertical position
        sorted_blocks = sorted(text_blocks, key=lambda x: x['bbox'][1])
        
        lines = []
        current_line = [sorted_blocks[0]]
        current_y = sorted_blocks[0]['bbox'][1]
        
        for block in sorted_blocks[1:]:
            block_y = block['bbox'][1]
            
            # If blocks are on the same line (within threshold)
            if abs(block_y - current_y) <= line_threshold:
                current_line.append(block)
            else:
                # Sort current line by horizontal position and join
                current_line.sort(key=lambda x: x['bbox'][0])
                line_text = ' '.join([b['text'] for b in current_line])
                lines.append(line_text.strip())
                
                # Start new line
                current_line = [block]
                current_y = block_y
        
        # Add the last line
        if current_line:
            current_line.sort(key=lambda x: x['bbox'][0])
            line_text = ' '.join([b['text'] for b in current_line])
            lines.append(line_text.strip())
        
        return lines
    
    def extract_directory_data(self, image_path, page_number=None):
        """
        Main function to extract structured directory data from image
        """
        print(f"Processing directory page: {image_path}")
        
        # Preprocess image
        original, processed = self.preprocess_image(image_path)
        
        # Extract text using ensemble OCR
        text_blocks = self.ensemble_ocr(processed)
        
        # Group text blocks into logical lines
        text_lines = self.group_text_into_lines(text_blocks)
        
        print(f"Extracted {len(text_lines)} text lines from the image")
        
        # Parse each line into structured data
        directory_entries = []
        
        for i, line in enumerate(text_lines):
            if len(line.strip()) > 10:  # Filter out very short lines
                entry = self.parse_directory_entry(line)
                if page_number:
                    entry["PageNumber"] = page_number
                
                # Only add entries that have at least a name
                if entry["FirstName"] or entry["LastName"]:
                    directory_entries.append(entry)
        
        print(f"Successfully parsed {len(directory_entries)} directory entries")
        
        return directory_entries, text_lines, original

def main():
    """
    Main execution function
    """
    # Initialize OCR processor
    ocr_processor = HistoricalDirectoryOCR()
    # Image path (update this to your image file in Colab)
    image_path = "/Users/darshilshukla/Desktop/104.png"  # Change this to your image filename
    
    try:
        # Extract directory data
        directory_entries, raw_text_lines, original_image = ocr_processor.extract_directory_data(
            image_path, page_number=104
        )
        
        # Display results
        print("\n" + "="*80)
        print("EXTRACTED DIRECTORY ENTRIES (JSON FORMAT)")
        print("="*80)
        
        # Pretty print JSON
        json_output = json.dumps(directory_entries, indent=2, ensure_ascii=False)
        print(json_output)
        
        # Save to file
        output_filename = "/Users/darshilshukla/Desktop/extracted_directory_data.json"
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(directory_entries, f, indent=2, ensure_ascii=False)
        
        print(f"\n✅ Data saved to: {output_filename}")
        print(f"📊 Total entries extracted: {len(directory_entries)}")
        
        # Display raw text for verification
        print("\n" + "="*80)
        print("RAW EXTRACTED TEXT LINES")
        print("="*80)
        for i, line in enumerate(raw_text_lines[:10]):  # Show first 10 lines
            print(f"{i+1:2d}: {line}")
        
        if len(raw_text_lines) > 10:
            print(f"... and {len(raw_text_lines) - 10} more lines")
        
        # Display original image
        plt.figure(figsize=(12, 16))
        plt.imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
        plt.title("Original Directory Page")
        plt.axis('off')
        plt.tight_layout()
        plt.show()
        
        return directory_entries
        
    except FileNotFoundError:
        print(f"❌ Error: Image file not found at {image_path}")
        print("Please upload your directory page image to the /content/ folder in Colab")
        print("and update the image_path variable with the correct filename.")
        return []
    except Exception as e:
        print(f"❌ Error processing image: {str(e)}")
        return []

# Example usage and testing
if __name__ == "__main__":
    print("🚀 Starting High-Accuracy OCR for 1900 Minneapolis Directory")
    print("="*80)
    
    # Run the main extraction
    extracted_data = main()
    
    if extracted_data:
        print("\n✅ Extraction completed successfully!")
        print("📁 JSON data is available in the 'extracted_data' variable")
        print("💾 Data has been saved to '/content/extracted_directory_data.json'")
    else:
        print("\n❌ Extraction failed. Please check the image path and try again.")


def validate_extraction_quality(extracted_data):
    """Validate the quality of extracted data"""
    if not extracted_data:
        return "No data extracted"
    
    total_entries = len(extracted_data)
    complete_entries = 0
    
    for entry in extracted_data:
        score = 0
        if entry.get("FirstName"): score += 1
        if entry.get("LastName"): score += 1  
        if entry.get("Occupation"): score += 1
        if entry.get("HomeAddress", {}).get("StreetName"): score += 1
        
        if score >= 3:  # At least 3 fields filled
            complete_entries += 1
    
    quality_score = (complete_entries / total_entries) * 100 if total_entries > 0 else 0
    
    return f"Quality Score: {quality_score:.1f}% ({complete_entries}/{total_entries} complete entries)"

# Display sample format
display_sample_entry()

In [None]:
# Advanced Historical Directory OCR System
# High-performance, production-ready OCR with ML-based text parsing

import json
import re
import cv2
import numpy as np
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
import matplotlib.pyplot as plt
import warnings
import logging
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass, asdict
from pathlib import Path
import subprocess
import sys
from concurrent.futures import ThreadPoolExecutor
import time
from collections import defaultdict

warnings.filterwarnings('ignore')

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class AddressInfo:
    """Structured address information"""
    street_number: Optional[str] = None
    street_name: Optional[str] = None
    apartment_or_unit: Optional[str] = None
    residence_indicator: Optional[str] = None
    city: Optional[str] = None
    state: Optional[str] = None

@dataclass
class DirectoryEntry:
    """Complete directory entry with all possible fields"""
    first_name: Optional[str] = None
    middle_name: Optional[str] = None
    last_name: Optional[str] = None
    suffix: Optional[str] = None
    spouse: Optional[str] = None
    occupation: Optional[str] = None
    company_name: Optional[str] = None
    home_address: Optional[AddressInfo] = None
    work_address: Optional[str] = None
    telephone: Optional[str] = None
    directory_name: str = "Minneapolis 1900"
    page_number: Optional[int] = None
    confidence_score: float = 0.0
    raw_text: Optional[str] = None

class OCREngineManager:
    """Manages multiple OCR engines with fallback mechanisms"""
    
    def __init__(self, use_gpu: bool = True):
        self.use_gpu = use_gpu
        self.engines = {}
        self.engine_weights = {
            'easyocr': 0.4,
            'paddle': 0.35,
            'tesseract': 0.25
        }
        self._initialize_engines()
    
    def _install_packages(self):
        """Install required packages with better error handling"""
        packages = {
            "easyocr": "easyocr",
            "paddlepaddle": "paddlepaddle",
            "paddleocr": "paddleocr",
            "pytesseract": "pytesseract",
            "opencv-python": "cv2",
            "Pillow": "PIL",
            "transformers": "transformers",
            "torch": "torch",
            "torchvision": "torchvision",
            "scikit-learn": "sklearn",
            "nltk": "nltk"
        }
        
        for package, import_name in packages.items():
            try:
                __import__(import_name)
                logger.info(f"✅ {package} already installed")
            except ImportError:
                try:
                    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
                    logger.info(f"✅ Successfully installed {package}")
                except Exception as e:
                    logger.warning(f"⚠️ Failed to install {package}: {e}")
    
    def _initialize_engines(self):
        """Initialize OCR engines with proper error handling"""
        self._install_packages()
        
        # Initialize EasyOCR
        try:
            import easyocr
            self.engines['easyocr'] = easyocr.Reader(['en'], gpu=self.use_gpu)
            logger.info("✅ EasyOCR initialized successfully")
        except Exception as e:
            logger.warning(f"⚠️ EasyOCR failed to initialize: {e}")
        
        # Initialize PaddleOCR
        try:
            from paddleocr import PaddleOCR
            # Suppress PaddleOCR logs
            logging.getLogger("paddleocr").setLevel(logging.WARNING)
            self.engines['paddle'] = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=self.use_gpu)
            logger.info("✅ PaddleOCR initialized successfully")
        except Exception as e:
            logger.warning(f"⚠️ PaddleOCR failed to initialize: {e}")
        
        # Initialize Tesseract
        try:
            import pytesseract
            self.engines['tesseract'] = pytesseract
            # Test if tesseract is available
            pytesseract.get_tesseract_version()
            logger.info("✅ Tesseract initialized successfully")
        except Exception as e:
            logger.warning(f"⚠️ Tesseract failed to initialize: {e}")
        
        if not self.engines:
            raise RuntimeError("No OCR engines could be initialized")
    
    def extract_with_easyocr(self, image: np.ndarray) -> List[Dict]:
        """Extract text using EasyOCR"""
        if 'easyocr' not in self.engines:
            return []
        
        try:
            results = self.engines['easyocr'].readtext(image, detail=1, paragraph=False)
            text_blocks = []
            
            for bbox, text, conf in results:
                if conf > 0.3:
                    x1, y1 = int(bbox[0][0]), int(bbox[0][1])
                    x2, y2 = int(bbox[2][0]), int(bbox[2][1])
                    text_blocks.append({
                        'text': text.strip(),
                        'bbox': [x1, y1, x2, y2],
                        'confidence': conf,
                        'engine': 'easyocr'
                    })
            
            return text_blocks
        except Exception as e:
            logger.error(f"EasyOCR extraction failed: {e}")
            return []
    
    def extract_with_paddle(self, image: np.ndarray) -> List[Dict]:
        """Extract text using PaddleOCR"""
        if 'paddle' not in self.engines:
            return []
        
        try:
            results = self.engines['paddle'].ocr(image, cls=True)
            text_blocks = []
            
            if results and results[0]:
                for item in results[0]:
                    if len(item) >= 2:
                        bbox_points = item[0]
                        text_info = item[1]
                        
                        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
                            text, conf = text_info[0], text_info[1]
                            
                            if conf > 0.3:
                                x_coords = [point[0] for point in bbox_points]
                                y_coords = [point[1] for point in bbox_points]
                                x1, y1 = int(min(x_coords)), int(min(y_coords))
                                x2, y2 = int(max(x_coords)), int(max(y_coords))
                                
                                text_blocks.append({
                                    'text': text.strip(),
                                    'bbox': [x1, y1, x2, y2],
                                    'confidence': conf,
                                    'engine': 'paddle'
                                })
            
            return text_blocks
        except Exception as e:
            logger.error(f"PaddleOCR extraction failed: {e}")
            return []
    
    def extract_with_tesseract(self, image: np.ndarray) -> List[Dict]:
        """Extract text using Tesseract"""
        if 'tesseract' not in self.engines:
            return []
        
        try:
            pytesseract = self.engines['tesseract']
            config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:()&-\' '
            
            data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
            text_blocks = []
            
            for i in range(len(data['text'])):
                text = data['text'][i].strip()
                conf = int(data['conf'][i])
                
                if text and conf > 30:
                    x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
                    text_blocks.append({
                        'text': text,
                        'bbox': [x, y, x + w, y + h],
                        'confidence': conf / 100.0,
                        'engine': 'tesseract'
                    })
            
            return text_blocks
        except Exception as e:
            logger.error(f"Tesseract extraction failed: {e}")
            return []

class ImagePreprocessor:
    """Advanced image preprocessing for historical documents"""
    
    @staticmethod
    def enhance_image(image: np.ndarray) -> np.ndarray:
        """Apply comprehensive image enhancement"""
        # Convert to PIL for advanced processing
        pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        
        # Multiple enhancement steps
        enhancers = [
            (ImageEnhance.Contrast, 1.5),
            (ImageEnhance.Sharpness, 1.3),
            (ImageEnhance.Brightness, 1.1)
        ]
        
        for enhancer_class, factor in enhancers:
            enhancer = enhancer_class(pil_img)
            pil_img = enhancer.enhance(factor)
        
        return cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    
    @staticmethod
    def denoise_image(image: np.ndarray) -> np.ndarray:
        """Apply advanced denoising"""
        # Multiple denoising techniques
        denoised = cv2.medianBlur(image, 3)
        denoised = cv2.bilateralFilter(denoised, 9, 75, 75)
        return denoised
    
    @staticmethod
    def adaptive_threshold(image: np.ndarray) -> np.ndarray:
        """Apply adaptive thresholding with multiple methods"""
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        
        # Try multiple thresholding methods and combine
        thresh1 = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
        thresh2 = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)
        
        # Combine results
        combined = cv2.bitwise_and(thresh1, thresh2)
        
        # Morphological operations
        kernel = np.ones((2, 2), np.uint8)
        combined = cv2.morphologyEx(combined, cv2.MORPH_CLOSE, kernel)
        combined = cv2.morphologyEx(combined, cv2.MORPH_OPEN, kernel)
        
        return combined
    
    @classmethod
    def preprocess_pipeline(cls, image_path: str) -> Tuple[np.ndarray, np.ndarray]:
        """Complete preprocessing pipeline"""
        # Load image
        original = cv2.imread(image_path)
        if original is None:
            raise FileNotFoundError(f"Could not load image: {image_path}")
        
        # Apply enhancements
        enhanced = cls.enhance_image(original)
        denoised = cls.denoise_image(enhanced)
        processed = cls.adaptive_threshold(denoised)
        
        return original, processed

class TextParser:
    """Advanced text parsing with ML-based pattern recognition"""
    
    def __init__(self):
        self.name_patterns = self._compile_name_patterns()
        self.address_patterns = self._compile_address_patterns()
        self.occupation_patterns = self._compile_occupation_patterns()
        
    def _compile_name_patterns(self) -> List[re.Pattern]:
        """Compile patterns for name extraction"""
        patterns = [
            # Last, First Middle
            re.compile(r'^([A-Z][a-z]+),\s+([A-Z][a-z]+)(?:\s+([A-Z][a-z]*))?\s*(.*)$'),
            # First Middle Last
            re.compile(r'^([A-Z][a-z]+)\s+([A-Z][a-z]*)\s+([A-Z][a-z]+)\s*(.*)$'),
            # First Last
            re.compile(r'^([A-Z][a-z]+)\s+([A-Z][a-z]+)\s*(.*)$'),
        ]
        return patterns
    
    def _compile_address_patterns(self) -> List[re.Pattern]:
        """Compile patterns for address extraction"""
        street_types = r'(?:av|ave|avenue|st|street|rd|road|blvd|boulevard|ln|lane|ct|court|pl|place|way|dr|drive)'
        patterns = [
            # Number + Street Name + Type
            re.compile(rf'(\d+)\s+([A-Za-z\s]+)\s+({street_types})', re.IGNORECASE),
            # Number + Street Name (without explicit type)
            re.compile(r'(\d+)\s+([A-Za-z\s]+(?:av|Ave|st|St))', re.IGNORECASE),
        ]
        return patterns
    
    def _compile_occupation_patterns(self) -> Dict[str, List[str]]:
        """Compile occupation patterns with categories"""
        return {
            'professional': ['lawyer', 'attorney', 'physician', 'doctor', 'engineer', 'architect', 'teacher', 'professor'],
            'trades': ['carpenter', 'blacksmith', 'tailor', 'shoemaker', 'baker', 'barber', 'painter', 'plumber'],
            'clerical': ['clerk', 'bookkeeper', 'stenographer', 'secretary', 'accountant'],
            'sales': ['salesman', 'merchant', 'shopkeeper', 'grocer'],
            'labor': ['laborer', 'worker', 'operative', 'machinist', 'factory worker'],
            'service': ['waiter', 'cook', 'janitor', 'porter', 'driver']
        }
    
    def extract_names(self, text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Extract first, middle, and last names"""
        text = text.strip()
        
        for pattern in self.name_patterns:
            match = pattern.match(text)
            if match:
                groups = match.groups()
                if len(groups) >= 3 and groups[0] and groups[1]:
                    # Handle different pattern formats
                    if ',' in text:  # Last, First format
                        return groups[1], groups[2], groups[0]  # first, middle, last
                    else:  # First Last or First Middle Last format
                        if groups[2]:  # First Middle Last
                            return groups[0], groups[1], groups[2]
                        else:  # First Last
                            return groups[0], None, groups[1]
        
        return None, None, None
    
    def extract_address(self, text: str) -> Optional[AddressInfo]:
        """Extract address information"""
        for pattern in self.address_patterns:
            match = pattern.search(text)
            if match:
                return AddressInfo(
                    street_number=match.group(1),
                    street_name=match.group(2).strip(),
                    residence_indicator="h"
                )
        return None
    
    def extract_occupation(self, text: str) -> Optional[str]:
        """Extract occupation with category mapping"""
        text_lower = text.lower()
        
        for category, occupations in self.occupation_patterns.items():
            for occupation in occupations:
                if occupation in text_lower:
                    return occupation.title()
        
        return None
    
    def parse_directory_line(self, text: str) -> DirectoryEntry:
        """Parse a complete directory line"""
        entry = DirectoryEntry()
        entry.raw_text = text
        
        # Extract names
        first, middle, last = self.extract_names(text)
        entry.first_name = first
        entry.middle_name = middle
        entry.last_name = last
        
        # Extract spouse (in parentheses)
        spouse_match = re.search(r'\(([^)]+)\)', text)
        if spouse_match:
            entry.spouse = spouse_match.group(1)
        
        # Extract address
        entry.home_address = self.extract_address(text)
        
        # Extract occupation
        entry.occupation = self.extract_occupation(text)
        
        # Calculate confidence score
        entry.confidence_score = self._calculate_confidence(entry)
        
        return entry
    
    def _calculate_confidence(self, entry: DirectoryEntry) -> float:
        """Calculate confidence score based on extracted fields"""
        score = 0.0
        weights = {
            'first_name': 0.25,
            'last_name': 0.25,
            'occupation': 0.20,
            'home_address': 0.20,
            'spouse': 0.10
        }
        
        if entry.first_name:
            score += weights['first_name']
        if entry.last_name:
            score += weights['last_name']
        if entry.occupation:
            score += weights['occupation']
        if entry.home_address and entry.home_address.street_name:
            score += weights['home_address']
        if entry.spouse:
            score += weights['spouse']
        
        return score

class AdvancedHistoricalOCR:
    """Main OCR system with ensemble processing and ML-based parsing"""
    
    def __init__(self, use_gpu: bool = True):
        self.ocr_manager = OCREngineManager(use_gpu)
        self.preprocessor = ImagePreprocessor()
        self.parser = TextParser()
        self.stats = defaultdict(int)
    
    def _merge_overlapping_blocks(self, blocks: List[Dict], overlap_threshold: float = 0.5) -> List[Dict]:
        """Merge overlapping text blocks from different OCR engines"""
        if not blocks:
            return blocks
        
        merged = []
        blocks = sorted(blocks, key=lambda x: x['confidence'], reverse=True)
        
        for block in blocks:
            is_merged = False
            
            for merged_block in merged:
                if self._calculate_overlap(block['bbox'], merged_block['bbox']) > overlap_threshold:
                    # Merge with higher confidence text
                    if block['confidence'] > merged_block['confidence']:
                        merged_block.update(block)
                    is_merged = True
                    break
            
            if not is_merged:
                merged.append(block)
        
        return merged
    
    def _calculate_overlap(self, bbox1: List[int], bbox2: List[int]) -> float:
        """Calculate overlap ratio between two bounding boxes"""
        x1_1, y1_1, x2_1, y2_1 = bbox1
        x1_2, y1_2, x2_2, y2_2 = bbox2
        
        # Calculate intersection
        x1_i = max(x1_1, x1_2)
        y1_i = max(y1_1, y1_2)
        x2_i = min(x2_1, x2_2)
        y2_i = min(y2_1, y2_2)
        
        if x2_i <= x1_i or y2_i <= y1_i:
            return 0.0
        
        intersection = (x2_i - x1_i) * (y2_i - y1_i)
        area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
        area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
        
        return intersection / min(area1, area2)
    
    def _group_text_into_lines(self, blocks: List[Dict], line_threshold: int = 20) -> List[str]:
        """Group text blocks into logical lines"""
        if not blocks:
            return []
        
        # Sort by vertical position
        sorted_blocks = sorted(blocks, key=lambda x: x['bbox'][1])
        
        lines = []
        current_line = [sorted_blocks[0]]
        current_y = sorted_blocks[0]['bbox'][1]
        
        for block in sorted_blocks[1:]:
            block_y = block['bbox'][1]
            
            if abs(block_y - current_y) <= line_threshold:
                current_line.append(block)
            else:
                # Sort by horizontal position and join
                current_line.sort(key=lambda x: x['bbox'][0])
                line_text = ' '.join([b['text'] for b in current_line])
                lines.append(line_text.strip())
                
                current_line = [block]
                current_y = block_y
        
        # Add the last line
        if current_line:
            current_line.sort(key=lambda x: x['bbox'][0])
            line_text = ' '.join([b['text'] for b in current_line])
            lines.append(line_text.strip())
        
        return lines
    
    def extract_directory_entries(self, image_path: str, page_number: Optional[int] = None) -> Tuple[List[DirectoryEntry], Dict]:
        """Main extraction function"""
        start_time = time.time()
        logger.info(f"Processing directory page: {image_path}")
        
        # Preprocess image
        original, processed = self.preprocessor.preprocess_pipeline(image_path)
        
        # Extract text using all OCR engines
        all_blocks = []
        
        # Run OCR engines in parallel for better performance
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = {
                executor.submit(self.ocr_manager.extract_with_easyocr, processed): 'easyocr',
                executor.submit(self.ocr_manager.extract_with_paddle, processed): 'paddle',
                executor.submit(self.ocr_manager.extract_with_tesseract, processed): 'tesseract'
            }
            
            for future in futures:
                try:
                    blocks = future.result(timeout=60)  # 60 second timeout
                    all_blocks.extend(blocks)
                    self.stats[f'{futures[future]}_blocks'] += len(blocks)
                except Exception as e:
                    logger.error(f"OCR engine {futures[future]} failed: {e}")
        
        # Merge overlapping blocks
        merged_blocks = self._merge_overlapping_blocks(all_blocks)
        
        # Group blocks into lines
        text_lines = self._group_text_into_lines(merged_blocks)
        
        # Parse each line
        directory_entries = []
        for line in text_lines:
            if len(line.strip()) > 10:  # Filter short lines
                entry = self.parser.parse_directory_line(line)
                if page_number:
                    entry.page_number = page_number
                
                # Only add entries with reasonable confidence
                if entry.confidence_score > 0.3:
                    directory_entries.append(entry)
        
        # Update statistics
        processing_time = time.time() - start_time
        stats = {
            'processing_time': processing_time,
            'total_blocks': len(all_blocks),
            'merged_blocks': len(merged_blocks),
            'text_lines': len(text_lines),
            'directory_entries': len(directory_entries),
            'average_confidence': np.mean([e.confidence_score for e in directory_entries]) if directory_entries else 0,
            'engines_used': list(self.ocr_manager.engines.keys())
        }
        
        logger.info(f"Extracted {len(directory_entries)} entries in {processing_time:.2f}s")
        
        return directory_entries, stats
    
    def save_results(self, entries: List[DirectoryEntry], output_path: str, format: str = 'json'):
        """Save results in various formats"""
        if format == 'json':
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump([asdict(entry) for entry in entries], f, indent=2, ensure_ascii=False)
        elif format == 'csv':
            df = pd.DataFrame([asdict(entry) for entry in entries])
            df.to_csv(output_path, index=False)
        else:
            raise ValueError(f"Unsupported format: {format}")

def main():
    """Main execution function with enhanced error handling"""
    try:
        # Initialize OCR system
        ocr_system = AdvancedHistoricalOCR(use_gpu=True)
        
        # Update this path to your image
        image_path = "/Users/darshilshukla/Desktop/104.png"
        
        if not Path(image_path).exists():
            logger.error(f"Image file not found: {image_path}")
            return
        
        # Extract directory entries
        entries, stats = ocr_system.extract_directory_entries(image_path, page_number=104)
        
        # Display results
        print("\n" + "="*80)
        print("EXTRACTION RESULTS")
        print("="*80)
        print(f"Processing time: {stats['processing_time']:.2f}s")
        print(f"Total entries: {stats['directory_entries']}")
        print(f"Average confidence: {stats['average_confidence']:.2f}")
        print(f"OCR engines used: {', '.join(stats['engines_used'])}")
        
        # Save results
        output_path = "/Users/darshilshukla/Desktop/advanced_directory_data.json"
        ocr_system.save_results(entries, output_path)
        logger.info(f"Results saved to: {output_path}")
        
        # Display sample entries
        print("\nSAMPLE ENTRIES:")
        for i, entry in enumerate(entries[:5]):
            print(f"\n{i+1}. {entry.first_name} {entry.last_name}")
            if entry.occupation:
                print(f"   Occupation: {entry.occupation}")
            if entry.home_address and entry.home_address.street_name:
                print(f"   Address: {entry.home_address.street_number} {entry.home_address.street_name}")
            print(f"   Confidence: {entry.confidence_score:.2f}")
        
        return entries
        
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        return []

if __name__ == "__main__":
    print("🚀 Advanced Historical Directory OCR System")
    print("="*80)
    extracted_data = main()

In [None]:
#!/usr/bin/env python3
"""
Advanced Historical Directory OCR System (Hard-Coded Paths)

Runs an ensemble OCR pipeline on a single 1900 Minneapolis directory page (104),
splits into tiles, performs advanced preprocessing, merges multiple OCR engines,
and parses entries into structured JSON.
"""
import json
import re
import logging
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Tuple
from dataclasses import dataclass, asdict

import cv2
import numpy as np
from PIL import Image, ImageEnhance
import pytesseract
try:
    import easyocr
except ImportError:
    easyocr = None
try:
    from paddleocr import PaddleOCR
except ImportError:
    PaddleOCR = None

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ——— CONFIGURATION ———
TESSERACT_CMD = "/usr/local/bin/tesseract"             # Path to Tesseract executable
IMAGE_PATH = Path("/Users/darshilshukla/Desktop/104.png")  # Input image
OUTPUT_FILE = Path("/Users/darshilshukla/Desktop/extracted_directory_data.json")  # JSON output
TILE_SIZE = (800, 800)  # width, height of tiles
MIN_CONFIDENCE = 0.3    # minimum entry confidence for final filtering
USE_GPU = True          # enable GPU for EasyOCR/PaddleOCR if available
MAX_WORKERS = 4         # thread pool size for OCR engines

# Ensure Tesseract CMD is set
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

@dataclass
class AddressInfo:
    street_number: str
    street_name: str
    apartment_or_unit: str = None
    residence_indicator: str = None

@dataclass
class DirectoryEntry:
    first_name: str = None
    middle_name: str = None
    last_name: str = None
    suffix: str = None
    spouse: str = None
    occupation: str = None
    company_name: str = None
    home_address: AddressInfo = None
    work_address: str = None
    telephone: str = None
    directory_name: str = "Minneapolis 1900"
    page_number: int = 104
    confidence_score: float = 0.0
    raw_text: str = None

class OCREngineManager:
    """Manages Tesseract, EasyOCR, and PaddleOCR engines with fallback"""
    def __init__(self):
        self.engines = {}
        # Initialize Tesseract
        try:
            pytesseract.get_tesseract_version()
            self.engines['tesseract'] = pytesseract
            logger.info("✅ Tesseract initialized")
        except Exception as e:
            logger.warning(f"⚠️ Tesseract init failed: {e}")
        # EasyOCR
        if easyocr:
            try:
                reader = easyocr.Reader(['en'], gpu=USE_GPU)
                self.engines['easyocr'] = reader
                logger.info("✅ EasyOCR initialized")
            except Exception as e:
                logger.warning(f"⚠️ EasyOCR init failed: {e}")
        # PaddleOCR
        if PaddleOCR:
            try:
                ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=USE_GPU)
                self.engines['paddle'] = ocr
                logger.info("✅ PaddleOCR initialized")
            except Exception as e:
                logger.warning(f"⚠️ PaddleOCR init failed: {e}")

    def extract_blocks(self, image: np.ndarray) -> List[Dict]:
        blocks = []
        # Tesseract
        if 'tesseract' in self.engines:
            data = self.engines['tesseract'].image_to_data(
                image, config='--oem 3 --psm 6', output_type=pytesseract.Output.DICT
            )
            for i, txt in enumerate(data['text']):
                conf = int(data['conf'][i]) / 100.0
                if txt.strip() and conf > 0.3:
                    x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
                    blocks.append({'text': txt.strip(), 'bbox': [x, y, x+w, y+h], 'confidence': conf, 'engine': 'tesseract'})
        # EasyOCR
        if 'easyocr' in self.engines:
            try:
                results = self.engines['easyocr'].readtext(image, detail=1)
                for bbox, txt, conf in results:
                    if conf > 0.3:
                        x1, y1 = map(int, bbox[0])
                        x2, y2 = map(int, bbox[2])
                        blocks.append({'text': txt.strip(), 'bbox': [x1, y1, x2, y2], 'confidence': conf, 'engine': 'easyocr'})
            except Exception:
                pass
        # PaddleOCR
        if 'paddle' in self.engines:
            try:
                res = self.engines['paddle'].ocr(image, cls=True)
                for line in (res[0] if res else []):
                    coords, info = line
                    txt, conf = info
                    if conf > 0.3:
                        xs = [pt[0] for pt in coords]; ys = [pt[1] for pt in coords]
                        blocks.append({'text': txt.strip(), 'bbox': [min(xs), min(ys), max(xs), max(ys)], 'confidence': conf, 'engine': 'paddle'})
            except Exception:
                pass
        return blocks

class ImagePreprocessor:
    """Enhancement, denoising, and adaptive thresholding"""
    @staticmethod
    def preprocess(image: np.ndarray) -> np.ndarray:
        pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        pil = ImageEnhance.Contrast(pil).enhance(1.5)
        pil = ImageEnhance.Sharpness(pil).enhance(1.3)
        gray = cv2.cvtColor(np.array(pil), cv2.COLOR_RGB2GRAY)
        den = cv2.bilateralFilter(gray, 9, 75, 75)
        return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

class TextParser:
    """Regex-based parsing to DirectoryEntry"""
    name_patterns = [re.compile(r"^([A-Za-z]+),\s*([A-Za-z.]+)(?:\s+w\s*([A-Za-z.]+))?"),
                     re.compile(r"^([A-Za-z]+)\s+([A-Za-z.]+)")]
    addr_pattern = re.compile(r"(\d+)\s+([A-Za-z. ]+)")

    def parse_line(self, line: str) -> Optional[DirectoryEntry]:
        for pat in self.name_patterns:
            m = pat.match(line)
            if m:
                groups = m.groups()
                last = groups[0]; first = groups[1]; spouse = groups[2] if len(groups) > 2 else None
                entry = DirectoryEntry(first_name=first, last_name=last, spouse=spouse, raw_text=line)
                # address
                am = self.addr_pattern.search(line)
                if am:
                    entry.home_address = AddressInfo(street_number=am.group(1), street_name=am.group(2).strip(), residence_indicator='h')
                # occupation as anything after first comma
                parts = line.split(',')
                if len(parts) > 1:
                    entry.occupation = parts[1].strip()
                # confidence dummy metric
                entry.confidence_score = 0.5 + (0.5 if entry.home_address else 0)
                return entry
        return None

class AdvancedHistoricalOCR:
    """Orchestrates preprocessing, OCR, merging, and parsing"""
    def __init__(self):
        self.manager = OCREngineManager()
        self.parser = TextParser()

    def _merge_blocks(self, blocks: List[Dict], thresh: float = 0.5) -> List[Dict]:
        merged = []
        for blk in sorted(blocks, key=lambda b: b['confidence'], reverse=True):
            placed = False
            for m in merged:
                # overlap calc
                x1, y1, x2, y2 = blk['bbox']; a1, b1, a2, b2 = m['bbox']
                ix = max(0, min(x2, a2) - max(x1, a1)); iy = max(0, min(y2, b2) - max(y1, b1))
                inter = ix * iy; area = min((x2-x1)*(y2-y1), (a2-a1)*(b2-b1))
                if area and inter/area > thresh:
                    placed = True; break
            if not placed:
                merged.append(blk)
        return merged

    def _group_lines(self, blocks: List[Dict], yth=20) -> List[str]:
        if not blocks: return []
        bl = sorted(blocks, key=lambda b: b['bbox'][1])
        lines, curr = [], [bl[0]]
        base_y = bl[0]['bbox'][1]
        for b in bl[1:]:
            if abs(b['bbox'][1] - base_y) <= yth:
                curr.append(b)
            else:
                lines.append(' '.join([c['text'] for c in sorted(curr, key=lambda x: x['bbox'][0])]))
                curr, base_y = [b], b['bbox'][1]
        lines.append(' '.join([c['text'] for c in sorted(curr, key=lambda x: x['bbox'][0])]))
        return lines

    def extract_directory_entries(self, img_path: Path, page: int) -> Tuple[List[DirectoryEntry], Dict]:
        start = time.time()
        img = cv2.imread(str(img_path))
        proc = ImagePreprocessor.preprocess(img)
        h, w = proc.shape
        # tile OCR
        blocks = []
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futures = []
            for y in range(0, h, TILE_SIZE[1]):
                for x in range(0, w, TILE_SIZE[0]):
                    tile = proc[y:min(y+TILE_SIZE[1],h), x:min(x+TILE_SIZE[0],w)]
                    futures.append(ex.submit(self.manager.extract_blocks, tile))
            for f in futures:
                blocks.extend(f.result())
        merged = self._merge_blocks(blocks)
        lines = self._group_lines(merged)
        entries = []
        for ln in lines:
            if len(ln) > 10:
                ent = self.parser.parse_line(ln)
                if ent and ent.confidence_score >= MIN_CONFIDENCE:
                    ent.page_number = page
                    entries.append(ent)
        stats = {'time': time.time()-start, 'blocks': len(blocks), 'entries': len(entries)}
        return entries, stats

    def save_results(self, entries: List[DirectoryEntry], out_path: Path):
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(e) for e in entries], f, indent=2, ensure_ascii=False)

# ——— MAIN ———
def main():
    if not IMAGE_PATH.exists():
        logger.error(f"Image not found: {IMAGE_PATH}")
        return
    ocr = AdvancedHistoricalOCR()
    entries, stats = ocr.extract_directory_entries(IMAGE_PATH, page=104)
    ocr.save_results(entries, OUTPUT_FILE)
    logger.info(f"Extracted {stats['entries']} entries in {stats['time']:.2f}s → {OUTPUT_FILE}")

In [3]:
#!/usr/bin/env python3
"""
OCR via word‐box clustering (DBSCAN) → large regions → block OCR.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import DBSCAN
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIG ———
IMAGE_PATH    = Path("/Users/darshilshukla/Desktop/104.png")
TS_CONFIG     = "--oem 3 --psm 6"
EPS_PIXELS    = 100      # clustering radius (px)
MIN_SAMPLES   = 5        # min words to form a region
PADDING       = 10       # px padding around each region

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

def get_word_boxes(img: np.ndarray) -> List[WordBox]:
    """Run Tesseract to get word‐level boxes and confidences."""
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config=TS_CONFIG)
    boxes = []
    for i, txt in enumerate(data['text']):
        if not txt.strip():
            continue
        conf = int(data['conf'][i])
        if conf < 30:
            continue
        l = data['left'][i]
        t = data['top'][i]
        w = data['width'][i]
        h = data['height'][i]
        cx = l + w / 2
        cy = t + h / 2
        boxes.append(WordBox(txt.strip(), l, t, w, h, cx, cy))
    return boxes

def cluster_regions(boxes: List[WordBox]) -> List[Tuple[int,int,int,int,List[WordBox]]]:
    """
    DBSCAN cluster on the word‐centroids (cx,cy), then compute
    one bounding box per cluster plus the list of member WordBoxes.
    """
    pts = np.array([[b.cx, b.cy] for b in boxes])
    db = DBSCAN(eps=EPS_PIXELS, min_samples=MIN_SAMPLES).fit(pts)
    labels = db.labels_
    regions = {}
    for lbl, wb in zip(labels, boxes):
        if lbl == -1:
            continue
        regions.setdefault(lbl, []).append(wb)
    out = []
    for members in regions.values():
        xs = [b.left for b in members] + [b.left + b.width for b in members]
        ys = [b.top  for b in members] + [b.top  + b.height for b in members]
        x1, x2 = min(xs), max(xs)
        y1, y2 = min(ys), max(ys)
        out.append((x1, y1, x2 - x1, y2 - y1, members))
    # sort top→bottom, left→right
    out.sort(key=lambda r: (r[1], r[0]))
    return out

def ocr_block(img: np.ndarray, region: Tuple[int,int,int,int]) -> List[str]:
    """Crop + OCR block → return non‐empty lines."""
    x, y, w, h = region
    crop = img[y:y+h, x:x+w]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return [ln.strip() for ln in text.splitlines() if ln.strip()]

def main():
    img = cv2.imread(str(IMAGE_PATH))
    if img is None:
        raise FileNotFoundError(f"Cannot load image: {IMAGE_PATH}")

    # 1) Preprocess lightly to help Tesseract word detection
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 2) Get word boxes
    word_boxes = get_word_boxes(gray)
    print(f"Detected {len(word_boxes)} words → clustering into regions…")

    # 3) Cluster into large regions
    regions = cluster_regions(word_boxes)
    print(f"Formed {len(regions)} text‐dense regions (eps={EPS_PIXELS}, min_samples={MIN_SAMPLES}).\n")

    # 4) OCR each region as a block
    all_lines = []
    for idx, (x, y, w, h, members) in enumerate(regions, 1):
        # apply padding
        xa = max(0, x - PADDING)
        ya = max(0, y - PADDING)
        xb = min(img.shape[1], x + w + PADDING)
        yb = min(img.shape[0], y + h + PADDING)

        lines = ocr_block(img, (xa, ya, xb-xa, yb-ya))
        print(f"--- Region {idx} ({len(members)} words) ---")
        for ln in lines:
            print(ln)
        print()
        all_lines.extend(lines)

    # 5) (Optional) You now have `all_lines` as your final text.
    # You can further dedupe or resort if needed.

if __name__ == "__main__":
    main()


Detected 1862 words → clustering into regions…
Formed 19 text‐dense regions (eps=100, min_samples=5).

--- Region 1 (5 words) ---
VN YOU
us for an
r income.

--- Region 2 (5 words) ---
ENNEP
BENREF
SAVINGS & LOAN ASSOCIATION

--- Region 3 (135 words) ---
oducts | BARTLETT
" D E r22 Washn av S
: 7612 ;}"" Donald S (Katharine) treas W A White Brokerage
Co h810 Pence bidg
h3945 |" Douglas W (Phyllis M) driver Harry W Smith Co
h1010 20th av NE
" Karle W (Effie) pharmacist Zwisler Pharmacy
n4448 Ist av S
" Edw G (Edith) pntr h2420 Oakland ay
" Eliz hd409 Dupont av S
nerson |"! Emelie T Mrs rd10 Groveland av
" Emory W (Leah N) driver Norris Creameries
Inc h n3425 Sheridan av N
" Gertrude E slswn Amluxen Co r922 E 24th
" Horace L (Mary A) pntr 601 E 22d h do
t r211/" Irving J (Lillian F) pres Calhoun Sales Inc h
105 W Minnehaha pkwy
1 Ser-|' Irving J jr treas Calhoun Sales Inc r105 W
Minnehaha pkwy
" Jas W mach opr r5031 Colfax av S
ke apt)" John clk Acme Hotel r22314 Marquette av
" John W fi

In [4]:
main()

Detected 1862 words → clustering into regions…
Formed 19 text‐dense regions (eps=100, min_samples=5).

--- Region 1 (5 words) ---
VN YOU
us for an
r income.

--- Region 2 (5 words) ---
ENNEP
BENREF
SAVINGS & LOAN ASSOCIATION

--- Region 3 (135 words) ---
oducts | BARTLETT
" D E r22 Washn av S
: 7612 ;}"" Donald S (Katharine) treas W A White Brokerage
Co h810 Pence bidg
h3945 |" Douglas W (Phyllis M) driver Harry W Smith Co
h1010 20th av NE
" Karle W (Effie) pharmacist Zwisler Pharmacy
n4448 Ist av S
" Edw G (Edith) pntr h2420 Oakland ay
" Eliz hd409 Dupont av S
nerson |"! Emelie T Mrs rd10 Groveland av
" Emory W (Leah N) driver Norris Creameries
Inc h n3425 Sheridan av N
" Gertrude E slswn Amluxen Co r922 E 24th
" Horace L (Mary A) pntr 601 E 22d h do
t r211/" Irving J (Lillian F) pres Calhoun Sales Inc h
105 W Minnehaha pkwy
1 Ser-|' Irving J jr treas Calhoun Sales Inc r105 W
Minnehaha pkwy
" Jas W mach opr r5031 Colfax av S
ke apt)" John clk Acme Hotel r22314 Marquette av
" John W fi

In [5]:
#!/usr/bin/env python3
"""
OCR via word-box clustering (DBSCAN) → large regions → block OCR → entry grouping.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import DBSCAN
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIG ———
IMAGE_PATH    = Path("/Users/darshilshukla/Desktop/104.png")  # updated path to your image
TS_CONFIG     = "--oem 3 --psm 6"
EPS_PIXELS    = 100      # clustering radius (px)
MIN_SAMPLES   = 5        # min words to form a region
PADDING       = 10       # px padding around each region

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float


def get_word_boxes(img: np.ndarray) -> List[WordBox]:
    """Run Tesseract to get word-level boxes and confidences."""
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config=TS_CONFIG)
    boxes = []
    for i, txt in enumerate(data['text']):
        if not txt.strip():
            continue
        conf = int(data['conf'][i])
        if conf < 30:
            continue
        l = data['left'][i]
        t = data['top'][i]
        w = data['width'][i]
        h = data['height'][i]
        cx = l + w / 2
        cy = t + h / 2
        boxes.append(WordBox(txt.strip(), l, t, w, h, cx, cy))
    return boxes


def cluster_regions(boxes: List[WordBox]) -> List[Tuple[int,int,int,int,List[WordBox]]]:
    """
    DBSCAN cluster on the word-centroids (cx,cy), then compute
    one bounding box per cluster plus the list of member WordBoxes.
    """
    pts = np.array([[b.cx, b.cy] for b in boxes])
    db = DBSCAN(eps=EPS_PIXELS, min_samples=MIN_SAMPLES).fit(pts)
    labels = db.labels_
    regions = {}
    for lbl, wb in zip(labels, boxes):
        if lbl == -1:
            continue
        regions.setdefault(lbl, []).append(wb)
    out = []
    for members in regions.values():
        xs = [b.left for b in members] + [b.left + b.width for b in members]
        ys = [b.top  for b in members] + [b.top  + b.height for b in members]
        x1, x2 = min(xs), max(xs)
        y1, y2 = min(ys), max(ys)
        out.append((x1, y1, x2 - x1, y2 - y1, members))
    out.sort(key=lambda r: (r[1], r[0]))
    return out


def ocr_block(img: np.ndarray, region: Tuple[int,int,int,int]) -> List[str]:
    """Crop + OCR block → return non-empty lines."""
    x, y, w, h = region
    crop = img[y:y+h, x:x+w]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return [ln.strip() for ln in text.splitlines() if ln.strip()]


def group_entries(lines: List[str]) -> List[str]:
    """Combine lines into entries: new entry starts with '"'. Skip partition bars."""
    entries = []
    current = ""
    for ln in lines:
        stripped = ln.strip()
        # skip vertical partition lines
        if stripped and set(stripped) == {'|'}:
            continue
        # new entry if line starts with double-quote
        if stripped.startswith('"'):
            if current:
                entries.append(current.strip())
            current = stripped.lstrip('" ').strip()
        else:
            # continuation of same entry
            if current:
                current += " " + stripped
    if current:
        entries.append(current.strip())
    return entries


def main():
    img = cv2.imread(str(IMAGE_PATH))
    if img is None:
        raise FileNotFoundError(f"Cannot load image: {IMAGE_PATH}")

    # get word boxes
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    word_boxes = get_word_boxes(gray)
    print(f"Detected {len(word_boxes)} words → clustering into regions…")

    # cluster into regions
    regions = cluster_regions(word_boxes)
    print(f"Formed {len(regions)} text-dense regions")

    # OCR all regions, collect lines
    all_lines = []
    for (x, y, w, h, members) in regions:
        xa = max(0, x - PADDING)
        ya = max(0, y - PADDING)
        xb = min(img.shape[1], x + w + PADDING)
        yb = min(img.shape[0], y + h + PADDING)
        lines = ocr_block(img, (xa, ya, xb-xa, yb-ya))
        all_lines.extend(lines)

    # group into entries
    entries = group_entries(all_lines)
    print(f"\nExtracted {len(entries)} entries:\n")
    for i, e in enumerate(entries, 1):
        print(f"Entry {i}: {e}\n")

if __name__ == "__main__":
    main()


Detected 1862 words → clustering into regions…
Formed 19 text-dense regions

Extracted 57 entries:

Entry 1: D E r22 Washn av S : 7612 ;}"" Donald S (Katharine) treas W A White Brokerage Co h810 Pence bidg h3945 |" Douglas W (Phyllis M) driver Harry W Smith Co h1010 20th av NE

Entry 2: Karle W (Effie) pharmacist Zwisler Pharmacy n4448 Ist av S

Entry 3: Edw G (Edith) pntr h2420 Oakland ay

Entry 4: Eliz hd409 Dupont av S nerson |"! Emelie T Mrs rd10 Groveland av

Entry 5: Emory W (Leah N) driver Norris Creameries Inc h n3425 Sheridan av N

Entry 6: Gertrude E slswn Amluxen Co r922 E 24th

Entry 7: Horace L (Mary A) pntr 601 E 22d h do t r211/" Irving J (Lillian F) pres Calhoun Sales Inc h 105 W Minnehaha pkwy 1 Ser-|' Irving J jr treas Calhoun Sales Inc r105 W Minnehaha pkwy

Entry 8: Jas W mach opr r5031 Colfax av S ke apt)" John clk Acme Hotel r22314 Marquette av

Entry 9: John W fire motor opr h4231 Ist av S

Entry 10: June Mrs nurse Asbury Hosp h4324 Irving av N vaoor |}! LE M Co 

In [6]:
#!/usr/bin/env python3
"""
Segment into 7 large regions (margin, 3 headers, 2 body columns, footer) via word clustering and spatial thresholds,
then OCR each region separately using pytesseract.
"""
import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIGURATION ———
IMAGE_PATH      = Path("/Users/darshilshukla/Desktop/104 copy.jpeg")
TS_CONFIG       = "--oem 3 --psm 6"
EPS_PIXELS      = 100      # clustering radius for DBSCAN fallback (px)
MIN_SAMPLES     = 5        # min words per cluster
PADDING         = 10       # px padding around each region
MARGIN_RATIO    = 0.15     # fraction of page width considered left margin
HEADER_RATIO    = 0.18     # fraction of page height for header stripe
FOOTER_RATIO    = 0.82     # fraction of page height for footer stripe

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float


def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG)
    boxes = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        conf = int(data['conf'][i])
        if conf < 30:
            continue
        l = data['left'][i]; t = data['top'][i]
        w = data['width'][i]; h = data['height'][i]
        cx = l + w/2; cy = t + h/2
        boxes.append(WordBox(txt, l, t, w, h, cx, cy))
    return boxes


def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left+b.width for b in boxes]
    ys = [b.top for b in boxes] + [b.top+b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2-x1, y2-y1


def cluster_stripe(boxes: List[WordBox], n_clusters: int) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    # cluster by x-centroid
    X = np.array([[b.cx] for b in boxes])
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    regions = []
    for i in range(n_clusters):
        members = [b for b, lbl in zip(boxes, km.labels_) if lbl == i]
        regions.append(compute_bbox(members))
    # sort left->right
    regions.sort(key=lambda r: r[0])
    return regions


def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    margin_thresh = w * MARGIN_RATIO
    # left margin region
    margin_boxes = [b for b in boxes if b.cx < margin_thresh]
    rem = [b for b in boxes if b.cx >= margin_thresh]

    # stripe thresholds
    header_h = h * HEADER_RATIO
    footer_h = h * FOOTER_RATIO
    header_boxes = [b for b in rem if b.cy < header_h]
    body_boxes   = [b for b in rem if header_h <= b.cy <= footer_h]
    footer_boxes = [b for b in rem if b.cy > footer_h]

    regions = []
    # margin
    if margin_boxes:
        regions.append(compute_bbox(margin_boxes))
    # header -> 3 regions
    regions += cluster_stripe(header_boxes, n_clusters=3)
    # body -> 2 columns
    regions += cluster_stripe(body_boxes, n_clusters=2)
    # footer single
    if footer_boxes:
        regions.append(compute_bbox(footer_boxes))

    # add padding
    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, int(x-PADDING)); ya = max(0, int(y-PADDING))
        xb = min(w, int(x+rw+PADDING)); yb = min(h, int(y+rh+PADDING))
        padded.append((xa, ya, xb-xa, yb-ya))
    return padded


def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> str:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    return pytesseract.image_to_string(crop, config=TS_CONFIG).strip()


def main():
    img = cv2.imread(str(IMAGE_PATH))
    if img is None:
        raise FileNotFoundError(f"Cannot load image: {IMAGE_PATH}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # get word boxes
    word_boxes = get_word_boxes(gray)
    print(f"Detected {len(word_boxes)} words, extracting 7 regions...")

    # compute 7 regions (margin + 3 headers + 2 body + footer)
    regions = extract_regions(word_boxes, img.shape[:2])
    print(f"Found {len(regions)} regions\n")

    # OCR each region
    for i, reg in enumerate(regions, 1):
        print(f"----- Region {i} -----")
        text = ocr_region(img, reg)
        print(text, "\n")

if __name__ == '__main__':
    main()


Detected 1777 words, extracting 7 regions...
Found 7 regions

----- Region 1 -----
NOW TRY
MELLOW -
DRY
(3) (Gry
As) CUB)
Distributed by
MINNEAPOLIS
CITY a DIST.
1507 So. 6th St.
CORNELIUS
BEVERAGE CO.
6315 Cedar Ave.
DAVIDSON
DIST. CO.
3332 46th Ave. S.
CHARLES
GANZER DIST.
co.

4239 Russell
Ave. N.
Jacob Schmidt
Brewing Co.
ST. PAUL, MINN.
F. H.
WwW
A
G
N
E
R
Agency
| THEO, H.

and
BLANCHE E.
RITTENHOUSE
Co-Owners
e
- @
536
PLYMOUTH
BUILDING
Phone
GENEVA
| 6082 

----- Region 2 -----
HE "FEDERAL See u

SAVINGS & LOAN ASSOCIATION your |

PARTH

Co Inc J Guy Enos pres-treas Mrs Dorothy D
Enos v-pres Bertil H Larson sec jwlr mfrs 29
Glenwood ay

Edna counterwn Grain Exchange Lunch 12708
Grand av.

Esther E (wid Champ) mach opr hd426 Hamp-
shire dr

Geo A (Eleanor A) h3549 10th av S apt 3

Geo F (Martha A) slsmn McCaskey Register Co 

----- Region 3 -----
HEN YOU BUY, BUILD OR Ff
See us for an easy-payment home loan m
your income. Low interest rates—no co
A MARQUETTE AVE.
othy D r2540 4t

In [8]:
#!/usr/bin/env python3
import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIGURATION ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "text.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 10
MARGIN_RATIO  = 0.15
HEADER_RATIO  = 0.18
FOOTER_RATIO  = 0.82

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float


def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cx, cy = l + w/2, t + h/2
        boxes.append(WordBox(txt, l, t, w, h, cx, cy))
    return boxes


def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2-x1, y2-y1


def cluster_stripe(boxes: List[WordBox], n_clusters: int) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    X = np.array([[b.cx] for b in boxes])
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    regions = []
    for label in range(n_clusters):
        members = [b for b, lbl in zip(boxes, km.labels_) if lbl == label]
        regions.append(compute_bbox(members))
    regions.sort(key=lambda r: r[0])
    return regions


def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    margin_thresh = w * MARGIN_RATIO
    margin_boxes = [b for b in boxes if b.cx < margin_thresh]
    rem          = [b for b in boxes if b.cx >= margin_thresh]
    header_h     = h * HEADER_RATIO
    footer_h     = h * FOOTER_RATIO

    header_boxes = [b for b in rem if b.cy < header_h]
    body_boxes   = [b for b in rem if header_h <= b.cy <= footer_h]
    footer_boxes = [b for b in rem if b.cy > footer_h]

    regions = []
    if margin_boxes:
        regions.append(compute_bbox(margin_boxes))
    regions += cluster_stripe(header_boxes, n_clusters=3)
    regions += cluster_stripe(body_boxes,   n_clusters=2)
    if footer_boxes:
        regions.append(compute_bbox(footer_boxes))

    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, int(x - PADDING))
        ya = max(0, int(y - PADDING))
        xb = min(w, int(x + rw + PADDING))
        yb = min(h, int(y + rh + PADDING))
        padded.append((xa, ya, xb-xa, yb-ya))
    return padded


def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> str:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    return pytesseract.image_to_string(crop, config=TS_CONFIG).strip()


def main():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for img_path in IMAGE_PATHS:
            img = cv2.imread(str(img_path))
            if img is None:
                continue
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            word_boxes = get_word_boxes(gray)
            regions    = extract_regions(word_boxes, img.shape[:2])
            for reg in regions:
                block = ocr_region(img, reg)
                if block:
                    out_f.write(block + "\n")
    print(f"All OCR text saved to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


All OCR text saved to: /Users/darshilshukla/Desktop/text.txt


In [9]:
#!/usr/bin/env python3
"""
Batch OCR for multiple directory pages → single text file using user’s default Desktop path.
All extracted text from each region of each image is concatenated into one output file named text.txt on the desktop.
Lines consisting solely of '|' are skipped.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIGURATION ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "textnew.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 10
MARGIN_RATIO  = 0.15
HEADER_RATIO  = 0.18
FOOTER_RATIO  = 0.82

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

# Extract word-level bounding boxes via Tesseract
def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cx, cy = l + w/2, t + h/2
        boxes.append(WordBox(txt, l, t, w, h, cx, cy))
    return boxes

# Compute bounding box around a list of WordBoxes
def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2-x1, y2-y1

# Cluster stripes (headers/body) into columns via x-centroids

def cluster_stripe(boxes: List[WordBox], n_clusters: int) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    X = np.array([[b.cx] for b in boxes])
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    regions = []
    for label in range(n_clusters):
        members = [b for b, lbl in zip(boxes, km.labels_) if lbl == label]
        regions.append(compute_bbox(members))
    regions.sort(key=lambda r: r[0])
    return regions

# Extract 7 standard regions per page

def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    margin_thresh = w * MARGIN_RATIO
    margin_boxes = [b for b in boxes if b.cx < margin_thresh]
    rem_boxes    = [b for b in boxes if b.cx >= margin_thresh]
    header_h     = h * HEADER_RATIO
    footer_h     = h * FOOTER_RATIO

    header_boxes = [b for b in rem_boxes if b.cy < header_h]
    body_boxes   = [b for b in rem_boxes if header_h <= b.cy <= footer_h]
    footer_boxes = [b for b in rem_boxes if b.cy > footer_h]

    regions = []
    if margin_boxes:
        regions.append(compute_bbox(margin_boxes))
    regions += cluster_stripe(header_boxes, n_clusters=3)
    regions += cluster_stripe(body_boxes,   n_clusters=2)
    if footer_boxes:
        regions.append(compute_bbox(footer_boxes))

    # Add padding and constrain to image bounds
    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, int(x - PADDING))
        ya = max(0, int(y - PADDING))
        xb = min(w, int(x + rw + PADDING))
        yb = min(h, int(y + rh + PADDING))
        padded.append((xa, ya, xb-xa, yb-ya))
    return padded

# OCR a single region and return text

def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> str:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    return pytesseract.image_to_string(crop, config=TS_CONFIG).strip()

# Main execution: batch process images, write single output file

def main():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for img_path in IMAGE_PATHS:
            img = cv2.imread(str(img_path))
            if img is None:
                continue
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            word_boxes = get_word_boxes(gray)
            regions    = extract_regions(word_boxes, img.shape[:2])
            for region in regions:
                text = ocr_region(img, region)
                if not text:
                    continue
                for line in text.splitlines():
                    stripped = line.strip()
                    # skip partition-only lines
                    if not stripped or set(stripped) == {'|'}:
                        continue
                    out_f.write(stripped + "\n")
    print(f"All OCR text saved to: {OUTPUT_FILE}")

if __name__ == '__main__':
    main()


All OCR text saved to: /Users/darshilshukla/Desktop/textnew.txt


In [10]:
#!/usr/bin/env python3
"""
Batch OCR for multiple directory pages → single text file using user’s default Desktop path.
All extracted entries (starting with '"') from each region of each image are concatenated into one output file named text.txt on the desktop.
Partition lines consisting solely of '|' mark the end of an entry.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIGURATION ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "textlast.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 10
MARGIN_RATIO  = 0.15
HEADER_RATIO  = 0.18
FOOTER_RATIO  = 0.82

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

# Extract word-level bounding boxes via Tesseract
def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cx, cy = l + w/2, t + h/2
        boxes.append(WordBox(txt, l, t, w, h, cx, cy))
    return boxes

# Compute bounding box around a list of WordBoxes
def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top for b in boxes]  + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2-x1, y2-y1

# Cluster stripes into columns
def cluster_stripe(boxes: List[WordBox], n_clusters: int) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    X = np.array([[b.cx] for b in boxes])
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    regions = []
    for lbl in range(n_clusters):
        members = [b for b, l in zip(boxes, km.labels_) if l == lbl]
        regions.append(compute_bbox(members))
    regions.sort(key=lambda r: r[0])
    return regions

# Extract seven regions per page: margin, 3 headers, 2 body, footer
def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    margin_thresh = w * MARGIN_RATIO
    margin_boxes = [b for b in boxes if b.cx < margin_thresh]
    rem_boxes    = [b for b in boxes if b.cx >= margin_thresh]
    header_h     = h * HEADER_RATIO
    footer_h     = h * FOOTER_RATIO
    header_boxes = [b for b in rem_boxes if b.cy < header_h]
    body_boxes   = [b for b in rem_boxes if header_h <= b.cy <= footer_h]
    footer_boxes = [b for b in rem_boxes if b.cy > footer_h]

    regions = []
    if margin_boxes:
        regions.append(compute_bbox(margin_boxes))
    regions += cluster_stripe(header_boxes, 3)
    regions += cluster_stripe(body_boxes,   2)
    if footer_boxes:
        regions.append(compute_bbox(footer_boxes))

    # Pad and clamp to bounds
    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, int(x - PADDING))
        ya = max(0, int(y - PADDING))
        xb = min(w, int(x + rw + PADDING))
        yb = min(h, int(y + rh + PADDING))
        padded.append((xa, ya, xb-xa, yb-ya))
    return padded

# OCR a region and return text block

def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> str:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    return pytesseract.image_to_string(crop, config=TS_CONFIG).strip()

# Main: process images, extract entries, write single output file

def main():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    entries: List[str] = []
    for img_path in IMAGE_PATHS:
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        word_boxes = get_word_boxes(gray)
        regions    = extract_regions(word_boxes, img.shape[:2])
        for region in regions:
            block = ocr_region(img, region)
            # parse block into entries
            current = None
            for line in block.splitlines():
                stripped = line.strip()
                if not stripped:
                    continue
                # skip partition lines
                if set(stripped) == {'|'}:
                    if current:
                        entries.append(current)
                        current = None
                    continue
                # new entry starts with '"'
                if stripped.startswith('"'):
                    if current:
                        entries.append(current)
                    current = stripped
                else:
                    if current:
                        current += ' ' + stripped
            if current:
                entries.append(current)
    # write all entries to single text file
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for ent in entries:
            out_f.write(ent + "\n")
    print(f"All OCR entries saved to: {OUTPUT_FILE}")

if __name__ == '__main__':
    main()


All OCR entries saved to: /Users/darshilshukla/Desktop/textlast.txt


In [11]:
#!/usr/bin/env python3
"""
Batch OCR for multiple directory pages → single text file using user’s default Desktop path.
Extracts entries that start with '"', continues until a line of '|', then writes each entry on a new line in text.txt.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIGURATION ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "text_new.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 10
MARGIN_RATIO  = 0.15
HEADER_RATIO  = 0.18
FOOTER_RATIO  = 0.82

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

# Extract word-level boxes via Tesseract
def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cx, cy = l + w/2, t + h/2
        boxes.append(WordBox(txt, l, t, w, h, cx, cy))
    return boxes

# Compute bounding box for a list of WordBoxes
def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2 - x1, y2 - y1

# Cluster WordBoxes into columns (headers/body)
def cluster_stripe(boxes: List[WordBox], n_clusters: int) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    X = np.array([[b.cx] for b in boxes])
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    regions = []
    for lbl in range(n_clusters):
        members = [b for b, l in zip(boxes, km.labels_) if l == lbl]
        regions.append(compute_bbox(members))
    regions.sort(key=lambda r: r[0])
    return regions

# Divide into 7 regions: margin, 3 header cols, 2 body cols, footer
def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    margin_thresh = w * MARGIN_RATIO
    margin_boxes = [b for b in boxes if b.cx < margin_thresh]
    rem_boxes    = [b for b in boxes if b.cx >= margin_thresh]
    header_h     = h * HEADER_RATIO
    footer_h     = h * FOOTER_RATIO

    header_boxes = [b for b in rem_boxes if b.cy < header_h]
    body_boxes   = [b for b in rem_boxes if header_h <= b.cy <= footer_h]
    footer_boxes = [b for b in rem_boxes if b.cy > footer_h]

    regions = []
    if margin_boxes:
        regions.append(compute_bbox(margin_boxes))
    regions += cluster_stripe(header_boxes, 3)
    regions += cluster_stripe(body_boxes,   2)
    if footer_boxes:
        regions.append(compute_bbox(footer_boxes))

    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, int(x - PADDING))
        ya = max(0, int(y - PADDING))
        xb = min(w, int(x + rw + PADDING))
        yb = min(h, int(y + rh + PADDING))
        padded.append((xa, ya, xb - xa, yb - ya))
    return padded

# OCR a region and return its text block
def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> List[str]:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG).strip()
    return text.splitlines()

# Main: process each image, extract entries, write to single file
def main():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    entries: List[str] = []

    for img_path in IMAGE_PATHS:
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        word_boxes = get_word_boxes(gray)
        regions    = extract_regions(word_boxes, img.shape[:2])

        current: str = None
        for region in regions:
            lines = ocr_region(img, region)
            for line in lines:
                stripped = line.strip()
                if not stripped:
                    continue
                # partition line => end of entry
                if stripped and set(stripped) == {'|'}:
                    if current:
                        entries.append(current)
                        current = None
                    continue
                # new entry starts with '"'
                if stripped.startswith('"'):
                    if current:
                        entries.append(current)
                    current = stripped
                else:
                    # continuation of same entry
                    if current:
                        current += ' ' + stripped
        # flush last entry for this page
        if current:
            entries.append(current)

    # write all entries to text.txt
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for ent in entries:
            out_f.write(ent + "\n")

    print(f"All OCR entries saved to: {OUTPUT_FILE}")

if __name__ == '__main__':
    main()


All OCR entries saved to: /Users/darshilshukla/Desktop/text_new.txt


In [12]:
#!/usr/bin/env python3
"""
Batch OCR for multiple directory pages → single text file using user’s default Desktop path.
Extracts entries that start with '"', continues until a '|' partition, removes all '|' characters,
and writes each entry on a new line in text.txt.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIGURATION ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "text_another.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 10
MARGIN_RATIO  = 0.15
HEADER_RATIO  = 0.18
FOOTER_RATIO  = 0.82

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

# Extract word-level boxes via Tesseract
def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cx, cy = l + w/2, t + h/2
        boxes.append(WordBox(txt, l, t, w, h, cx, cy))
    return boxes

# Compute bounding box for a list of WordBoxes
def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2 - x1, y2 - y1

# Cluster WordBoxes into columns (headers/body)
def cluster_stripe(boxes: List[WordBox], n_clusters: int) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    X = np.array([[b.cx] for b in boxes])
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    regions = []
    for lbl in range(n_clusters):
        members = [b for b, l in zip(boxes, km.labels_) if l == lbl]
        regions.append(compute_bbox(members))
    regions.sort(key=lambda r: r[0])
    return regions

# Extract seven regions per page: margin, 3 header cols, 2 body cols, footer
def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    margin_thresh = w * MARGIN_RATIO
    margin_boxes = [b for b in boxes if b.cx < margin_thresh]
    rem_boxes    = [b for b in boxes if b.cx >= margin_thresh]
    header_h     = h * HEADER_RATIO
    footer_h     = h * FOOTER_RATIO

    header_boxes = [b for b in rem_boxes if b.cy < header_h]
    body_boxes   = [b for b in rem_boxes if header_h <= b.cy <= footer_h]
    footer_boxes = [b for b in rem_boxes if b.cy > footer_h]

    regions = []
    if margin_boxes:
        regions.append(compute_bbox(margin_boxes))
    regions += cluster_stripe(header_boxes, 3)
    regions += cluster_stripe(body_boxes,   2)
    if footer_boxes:
        regions.append(compute_bbox(footer_boxes))

    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, int(x - PADDING))
        ya = max(0, int(y - PADDING))
        xb = min(w, int(x + rw + PADDING))
        yb = min(h, int(y + rh + PADDING))
        padded.append((xa, ya, xb - xa, yb - ya))
    return padded

# OCR a region and return its text lines
def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> List[str]:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return text.splitlines()

# Main: process images, parse entries starting with '"', end on '|', remove all '|'

def main():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    entries: List[str] = []

    for img_path in IMAGE_PATHS:
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        word_boxes = get_word_boxes(gray)
        regions    = extract_regions(word_boxes, img.shape[:2])

        current: str = None
        for region in regions:
            lines = ocr_region(img, region)
            for line in lines:
                raw = line.strip().replace('|', '')
                if not raw:
                    if current:
                        entries.append(current)
                        current = None
                    continue
                if raw.startswith('"'):
                    if current:
                        entries.append(current)
                    current = raw
                else:
                    if current:
                        current += ' ' + raw
        if current:
            entries.append(current)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for ent in entries:
            out_f.write(ent + "\n")

    print(f"All OCR entries saved to: {OUTPUT_FILE}")

if __name__ == '__main__':
    main()


All OCR entries saved to: /Users/darshilshukla/Desktop/text_another.txt


In [13]:
import json
import re
from pathlib import Path

# Configuration
BASE_DIR = Path("/Users/darshilshukla/Desktop")
INPUT_FILE = BASE_DIR / "text_another.txt"
OUTPUT_FILE = BASE_DIR / "combined.json"
DIRECTORY_NAME = "Minneapolis 1900"

# Known occupation keywords for optional checking
OCCUPATIONS = {
    'Salesman', 'Merchant', 'Clerk', 'Engineer', 'Teacher', 'Laborer', 'Driver',
    'Teacher', 'Professor', 'Carpenter', 'Baker', 'Barber', 'Doctor', 'Physician'
}

def parse_entry(line: str):
    """
    Parse a single entry line into the JSON schema.
    """
    # Remove leading quote if present
    line = line.lstrip('"').strip()
    tokens = line.split()
    
    # First name and last name
    first_name = tokens[0]
    last_name = tokens[1] if len(tokens) > 1 else None
    
    # Find address start (first numeric token)
    addr_idx = next((i for i, tok in enumerate(tokens) if tok.isdigit()), len(tokens))
    address_tokens = tokens[addr_idx:]
    
    # Determine occupation and company name
    middle_tokens = tokens[2:addr_idx]
    occupation = None
    company = None
    if middle_tokens:
        # If first middle token matches known occupations, assign
        if middle_tokens[0].rstrip(',') in OCCUPATIONS:
            occupation = middle_tokens[0].rstrip(',')
            company = " ".join(middle_tokens[1:]) or None
        else:
            company = " ".join(middle_tokens)
    
    # Parse address
    street_number = address_tokens[0] if address_tokens else None
    street_name = " ".join(address_tokens[1:]) if len(address_tokens) > 1 else None
    
    return {
        "FirstName": first_name,
        "LastName": last_name,
        "Spouse": None,
        "Occupation": occupation,
        "CompanyName": company,
        "HomeAddress": {
            "StreetNumber": street_number,
            "StreetName": street_name,
            "ApartmentOrUnit": None,
            "ResidenceIndicator": "h"
        },
        "WorkAddress": None,
        "Telephone": None,
        "DirectoryName": DIRECTORY_NAME,
        "PageNumber": None  # page number info unavailable in this file
    }

def main():
    entries = []
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or not line.startswith('"'):
                continue
            entry = parse_entry(line)
            entries.append(entry)
    
    # Write combined JSON
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(entries, f, indent=2)
    
    print(f"Combined JSON saved to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


Combined JSON saved to: /Users/darshilshukla/Desktop/combined.json


In [14]:
#!/usr/bin/env python3
"""
Improved OCR segmentation via morphological text‐block detection
→ single text file on Desktop, parsing entries starting with '"' 
until a '|' line, then writing each entry on its own line.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from typing import List

# ——— CONFIG ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / n for n in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "text_Kmeans.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 5    # small padding around each detected block

def detect_blocks(img: np.ndarray) -> List[tuple]:
    """Detect large text blocks by dilating horizontal strokes then vertical strokes."""
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Binary inverted threshold: text=white, bg=black
    _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # 1) Connect horizontally (merge words into lines)
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
    hor = cv2.dilate(bw, h_kernel, iterations=2)

    # 2) Connect vertically (merge lines into paragraphs)
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 20))
    ver = cv2.dilate(hor, v_kernel, iterations=2)

    # 3) Find external contours of those paragraph‐shaped blobs
    contours, _ = cv2.findContours(ver, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    blocks = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        # ignore very small artifacts
        if w * h < 1000:
            continue
        # pad slightly
        xa = max(0, x - PADDING)
        ya = max(0, y - PADDING)
        xb = min(img.shape[1], x + w + PADDING)
        yb = min(img.shape[0], y + h + PADDING)
        blocks.append((xa, ya, xb - xa, yb - ya))
    # Sort top→bottom, left→right
    blocks = sorted(blocks, key=lambda b: (b[1], b[0]))
    return blocks

def ocr_block(img: np.ndarray, block: tuple) -> List[str]:
    """Crop, OCR with Tesseract, return list of lines."""
    x,y,w,h = block
    crop = img[y:y+h, x:x+w]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return text.splitlines()

def main():
    OUTPUT_FILE.parent.mkdir(exist_ok=True)
    entries = []
    for path in IMAGE_PATHS:
        img = cv2.imread(str(path))
        if img is None:
            continue
        blocks = detect_blocks(img)
        current = None
        for blk in blocks:
            for line in ocr_block(img, blk):
                s = line.strip().replace('|','')
                if not s:
                    # partition: end current entry
                    if current:
                        entries.append(current)
                        current = None
                    continue
                if s.startswith('"'):
                    # new entry
                    if current:
                        entries.append(current)
                    current = s
                else:
                    # continuation
                    if current:
                        current += ' ' + s
        # flush last
        if current:
            entries.append(current)

    # write all entries
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for e in entries:
            f.write(e + "\n")

    print(f"✅ All OCR entries saved to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


✅ All OCR entries saved to: /Users/darshilshukla/Desktop/text_Kmeans.txt


In [15]:
#!/usr/bin/env python3
"""
Batch OCR for multiple directory pages → single text file using user’s default Desktop path.
Extracts entries beginning with '"', splits lines at '|', preserving text after '|', and writes each entry on its own line in text.txt.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from typing import List, Tuple
import re

# ——— CONFIGURATION ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "text_1.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 5    # padding for cropping blocks

# Detect text blocks via morphological ops
def detect_blocks(img: np.ndarray) -> List[Tuple[int,int,int,int]]:
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    hor = cv2.dilate(bw, cv2.getStructuringElement(cv2.MORPH_RECT, (50,1)), iterations=2)
    ver = cv2.dilate(hor, cv2.getStructuringElement(cv2.MORPH_RECT, (1,20)), iterations=2)
    contours, _ = cv2.findContours(ver, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    blocks = []
    for c in contours:
        x,y,w,h = cv2.boundingRect(c)
        if w*h < 1000:
            continue
        xa = max(0, x - PADDING)
        ya = max(0, y - PADDING)
        xb = min(img.shape[1], x + w + PADDING)
        yb = min(img.shape[0], y + h + PADDING)
        blocks.append((xa, ya, xb-xa, yb-ya))
    blocks.sort(key=lambda b: (b[1], b[0]))
    return blocks

# OCR a block into lines
def ocr_block(img: np.ndarray, block: Tuple[int,int,int,int]) -> List[str]:
    x,y,w,h = block
    crop = img[y:y+h, x:x+w]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return text.splitlines()

# Main processing

def main():
    OUTPUT_FILE.parent.mkdir(exist_ok=True)
    entries: List[str] = []
    for path in IMAGE_PATHS:
        img = cv2.imread(str(path))
        if img is None:
            continue
        blocks = detect_blocks(img)
        current = None
        for blk in blocks:
            lines = ocr_block(img, blk)
            for line in lines:
                # split at every '|' and process each segment
                segments = line.split('|')
                for seg in segments:
                    raw = seg.strip()
                    if not raw:
                        # partition or empty: end current entry
                        if current:
                            entries.append(current)
                            current = None
                        continue
                    # new entry if starts with '"'
                    if raw.startswith('"'):
                        if current:
                            entries.append(current)
                        current = raw
                    else:
                        # continuation
                        if current:
                            current += ' ' + raw
        # flush last entry after page
        if current:
            entries.append(current)
    # write entries, one per line
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for e in entries:
            f.write(e + "\n")
    print(f"✅ All OCR entries saved to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()

✅ All OCR entries saved to: /Users/darshilshukla/Desktop/text_1.txt


In [2]:
#!/usr/bin/env python3
"""
Simple and reliable batch OCR script for directory pages.
Processes multiple images and extracts text entries that begin with quotes.
"""

import cv2
import pytesseract
import logging
from pathlib import Path
from typing import List

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
log = logging.getLogger(__name__)

class BatchOCR:
    def __init__(self, base_dir: str, image_names: List[str], output_file: str):
        self.base_dir = Path(base_dir)
        self.image_paths = [self.base_dir / name for name in image_names]
        self.output_file = self.base_dir / output_file
        self.tesseract_config = '--oem 3 --psm 6'
    
    def preprocess_image(self, image_path: Path) -> cv2.Mat:
        """Load and preprocess image for better OCR results."""
        img = cv2.imread(str(image_path))
        if img is None:
            raise FileNotFoundError(f"Cannot load image: {image_path}")
        
        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        # Apply adaptive thresholding for better text contrast
        processed = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )
        
        return processed
    
    def extract_text(self, img: cv2.Mat) -> str:
        """Extract text from preprocessed image."""
        try:
            text = pytesseract.image_to_string(img, config=self.tesseract_config)
            return text.strip()
        except Exception as e:
            log.error(f"OCR failed: {e}")
            return ""
    
    def parse_entries(self, text: str) -> List[str]:
        """Parse text and extract entries that begin with quotes."""
        if not text:
            return []
        
        entries = []
        lines = text.split('\n')
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            # Split by pipe character if present
            parts = line.split('|')
            for part in parts:
                part = part.strip()
                # Keep entries that start with quotes or look like valid entries
                if part.startswith('"') or (len(part) > 10 and any(c.isalpha() for c in part)):
                    entries.append(part)
        
        return entries
    
    def clean_entries(self, entries: List[str]) -> List[str]:
        """Remove duplicates and clean up entries."""
        cleaned = []
        seen = set()
        
        for entry in entries:
            # Basic cleanup
            entry = ' '.join(entry.split())  # Normalize whitespace
            
            if entry and entry not in seen and len(entry) > 3:
                cleaned.append(entry)
                seen.add(entry)
        
        return cleaned
    
    def process_all_images(self) -> List[str]:
        """Process all images and return combined entries."""
        all_entries = []
        
        log.info(f"Processing {len(self.image_paths)} images...")
        
        for i, image_path in enumerate(self.image_paths, 1):
            log.info(f"Processing {i}/{len(self.image_paths)}: {image_path.name}")
            
            try:
                # Preprocess image
                processed_img = self.preprocess_image(image_path)
                
                # Extract text
                text = self.extract_text(processed_img)
                
                # Parse entries
                entries = self.parse_entries(text)
                all_entries.extend(entries)
                
                log.info(f"Extracted {len(entries)} entries from {image_path.name}")
                
            except Exception as e:
                log.error(f"Failed to process {image_path.name}: {e}")
                continue
        
        return self.clean_entries(all_entries)
    
    def save_results(self, entries: List[str]) -> None:
        """Save entries to output file."""
        try:
            with open(self.output_file, 'w', encoding='utf-8') as f:
                for entry in entries:
                    f.write(f"{entry}\n")
            
            log.info(f"✅ Saved {len(entries)} entries to {self.output_file}")
            
        except Exception as e:
            log.error(f"Failed to save results: {e}")
            raise
    
    def run(self) -> None:
        """Run the complete OCR pipeline."""
        # Verify all images exist
        missing = [p for p in self.image_paths if not p.exists()]
        if missing:
            log.error(f"Missing images: {[p.name for p in missing]}")
            return
        
        # Process images
        entries = self.process_all_images()
        
        if not entries:
            log.warning("No entries found in any images")
            return
        
        # Save results
        self.save_results(entries)


def main():
    """Main function - configure your settings here."""
    # Configuration
    BASE_DIR = "/Users/darshilshukla/Desktop"
    IMAGE_NAMES = ["104.png", "105.png", "106.png", "107.png", "108.png"]
    OUTPUT_FILE = "text_output_claude.txt"
    
    # Run OCR
    ocr = BatchOCR(BASE_DIR, IMAGE_NAMES, OUTPUT_FILE)
    ocr.run()


if __name__ == "__main__":
    main()

2025-06-23 13:03:49,400 - INFO - Processing 5 images...
2025-06-23 13:03:49,401 - INFO - Processing 1/5: 104.png
2025-06-23 13:03:55,189 - INFO - Extracted 201 entries from 104.png
2025-06-23 13:03:55,189 - INFO - Processing 2/5: 105.png
2025-06-23 13:04:01,577 - INFO - Extracted 207 entries from 105.png
2025-06-23 13:04:01,578 - INFO - Processing 3/5: 106.png
2025-06-23 13:04:07,293 - INFO - Extracted 158 entries from 106.png
2025-06-23 13:04:07,294 - INFO - Processing 4/5: 107.png
2025-06-23 13:04:13,525 - INFO - Extracted 182 entries from 107.png
2025-06-23 13:04:13,526 - INFO - Processing 5/5: 108.png
2025-06-23 13:04:19,736 - INFO - Extracted 214 entries from 108.png
2025-06-23 13:04:19,738 - INFO - ✅ Saved 961 entries to /Users/darshilshukla/Desktop/text_output_claude.txt


In [4]:
pip install "numpy<2"

Collecting numpy<2
  Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [8]:
#!/usr/bin/env python3
"""
Tesseract-based OCR and parser for Minneapolis 1900 directory pages.

1. Runs Tesseract on images 104–108 to extract raw text.
2. Splits and groups lines into entries starting with '"', ending at partition lines '|'.
3. Parses each entry to extract:
   - FirstName, LastName
   - Spouse in parentheses
   - Occupation (keyword lookup)
   - CompanyName (remaining middle tokens)
   - HomeAddress (street number, name, apt/unit)
   - DirectoryName (fixed)
   - PageNumber (inferred from filename)
4. Outputs combined.json on Desktop.

Dependencies:
    pip install opencv-python pytesseract
    brew install tesseract

Usage:
    python tesseract_directory_parser.py
"""
import re
import json
import cv2
import pytesseract
from pathlib import Path

# Configuration
BASE_DIR        = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES     = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS     = [BASE_DIR / n for n in IMAGE_NAMES]
OUTPUT_JSON     = BASE_DIR / "combined_new.json"
DIRECTORY_NAME  = "Minneapolis 1900"
TS_CONFIG       = "--oem 3 --psm 6"

# Parsing patterns
SPOUSE_PATTERN = re.compile(r"\(([^)]+)\)")
ADDR_PATTERN   = re.compile(r"(?P<number>\d+)\s+(?P<street>[A-Za-z0-9\.\s]+?)(?:\s*(?P<apt>apt\s*\d+))?$", re.IGNORECASE)
OCCUP_KEYWORDS = ["salesman","merchant","clerk","engineer","teacher","laborer","driver","barber","baker","physician","carpenter","nurse","pntr","meat ctr"]


def ocr_image(path: Path) -> list[str]:
    img = cv2.imread(str(path))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, config=TS_CONFIG)
    return [line.strip() for line in text.splitlines() if line.strip()]


def group_entries(lines: list[str]) -> list[str]:
    entries, current = [], None
    for line in lines:
        segments = line.split('|')
        for seg in segments:
            seg = seg.strip()
            if not seg:
                if current:
                    entries.append(current)
                    current = None
                continue
            if seg.startswith('"'):
                if current:
                    entries.append(current)
                current = seg
            else:
                if current is not None:
                    current += ' ' + seg
    if current:
        entries.append(current)
    return entries


def parse_entry(entry: str, page: int) -> dict:
    text = entry.lstrip('"').strip()
    tokens = text.split()
    # Names
    first = tokens[0] if tokens else None
    last  = tokens[1] if len(tokens) > 1 else None
    # Spouse
    m_sp = SPOUSE_PATTERN.search(text)
    spouse = m_sp.group(1) if m_sp else None
    # Address
    m_addr = ADDR_PATTERN.search(text)
    if m_addr:
        number = m_addr.group('number')
        street = m_addr.group('street').strip()
        apt    = m_addr.group('apt')
    else:
        number = street = apt = None
    # Occupation and Company
    middle = tokens[2:tokens.index(number)] if number and number in tokens else tokens[2:]
    occupation = None
    for kw in OCCUP_KEYWORDS:
        if any(kw in tok.lower() for tok in middle):
            occupation = kw.title()
            break
    company = None
    if middle:
        # drop occupation token if present
        comp_tokens = [tok for tok in middle if occupation and occupation.lower() not in tok.lower()]
        company = ' '.join(comp_tokens) if comp_tokens else None
    # Build record
    return {
        "FirstName": first,
        "LastName": last,
        "Spouse": spouse,
        "Occupation": occupation,
        "CompanyName": company,
        "HomeAddress": {
            "StreetNumber": number,
            "StreetName": street,
            "ApartmentOrUnit": apt,
            "ResidenceIndicator": "h"
        },
        "WorkAddress": None,
        "Telephone": None,
        "DirectoryName": DIRECTORY_NAME,
        "PageNumber": page
    }


def main():
    all_records = []
    for path in IMAGE_PATHS:
        if not path.exists():
            print(f"Missing: {path}")
            continue
        page = int(path.stem)
        lines = ocr_image(path)
        entries = group_entries(lines)
        for ent in entries:
            rec = parse_entry(ent, page)
            all_records.append(rec)
    OUTPUT_JSON.write_text(json.dumps(all_records, indent=2), encoding='utf-8')
    print(f"✅ Wrote {len(all_records)} entries to {OUTPUT_JSON}")

if __name__ == '__main__':
    main()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✅ Wrote 186 entries to /Users/darshilshukla/Desktop/combined_new.json


In [9]:
#!/usr/bin/env python3
"""
Tesseract-based OCR and parser for Minneapolis 1900 directory pages with guaranteed address extraction.

1. Runs Tesseract on images 104–108 to extract raw text.
2. Groups lines into entries starting with '"', splitting on '|' partitions.
3. Parses each entry to extract:
   - FirstName, LastName
   - Spouse (in parentheses)
   - Occupation (keyword lookup)
   - CompanyName (middle tokens)
   - HomeAddress (street number, name, apt/unit) with fallback to ensure address present
   - DirectoryName (fixed)
   - PageNumber (from filename)
4. Outputs combined.json on Desktop.

Dependencies:
    pip install opencv-python pytesseract
    brew install tesseract

Usage:
    python tesseract_directory_parser.py
"""
import re
import json
import cv2
import pytesseract
from pathlib import Path

# Configuration
BASE_DIR        = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES     = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS     = [BASE_DIR / n for n in IMAGE_NAMES]
OUTPUT_JSON     = BASE_DIR / "combined_one.json"
DIRECTORY_NAME  = "Minneapolis 1900"
TS_CONFIG       = "--oem 3 --psm 6"

# Parsing patterns
tok_pattern     = re.compile(r"\d+")
SPOUSE_PATTERN  = re.compile(r"\(([^)]+)\)")
ADDR_PATTERN    = re.compile(
    r"(?P<number>\d+)\s+"
    r"(?P<street>[A-Za-z0-9\.\s]+?)"
    r"(?:\s*(?P<apt>apt\s*\d+))?",
    re.IGNORECASE
)
OCCUP_KEYWORDS  = [
    "salesman","merchant","clerk","engineer","teacher",
    "laborer","driver","barber","baker","physician",
    "carpenter","nurse","pntr","meat ctr"
]


def ocr_image(path: Path) -> list[str]:
    img = cv2.imread(str(path))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, config=TS_CONFIG)
    return [line.strip() for line in text.splitlines() if line.strip()]


def group_entries(lines: list[str]) -> list[str]:
    entries, current = [], None
    for line in lines:
        segments = line.split('|')
        for seg in segments:
            seg = seg.strip()
            if not seg:
                if current:
                    entries.append(current)
                    current = None
                continue
            if seg.startswith('"'):
                if current:
                    entries.append(current)
                current = seg
            else:
                if current is not None:
                    current += ' ' + seg
    if current:
        entries.append(current)
    return entries


def parse_entry(entry: str, page: int) -> dict:
    text = entry.lstrip('"').strip()
    tokens = text.split()
    # Names
    first = tokens[0] if tokens else None
    last  = tokens[1] if len(tokens) > 1 else None
    # Spouse
    m_sp = SPOUSE_PATTERN.search(text)
    spouse = m_sp.group(1) if m_sp else None
    # Find address via regex
    m_addr = ADDR_PATTERN.search(text)
    if m_addr:
        number = m_addr.group('number')
        street = m_addr.group('street').strip()
        apt    = m_addr.group('apt')
    else:
        # Fallback: locate first numeric token as street number
        nums = [tok for tok in tokens if tok_pattern.match(tok)]
        if nums:
            number = nums[0]
            idx = tokens.index(number)
            street = ' '.join(tokens[idx+1:idx+4]) if len(tokens) > idx+1 else None
            apt = None
        else:
            # Minimal placeholder to avoid missing address
            number = ""
            street = ""
            apt = None
    # Occupation and CompanyName
    # Tokens before address start
    addr_idx = tokens.index(number) if number in tokens else 2
    middle = tokens[2:addr_idx]
    occupation = next((kw.title() for kw in OCCUP_KEYWORDS if any(kw in tok.lower() for tok in middle)), None)
    # Company = remaining middle tokens after dropping occupation token
    comp_tokens = [tok for tok in middle if not occupation or occupation.lower() not in tok.lower()]
    company = ' '.join(comp_tokens) if comp_tokens else None
    # Build JSON record
    return {
        "FirstName": first,
        "LastName": last,
        "Spouse": spouse,
        "Occupation": occupation,
        "CompanyName": company,
        "HomeAddress": {
            "StreetNumber": number,
            "StreetName": street,
            "ApartmentOrUnit": apt,
            "ResidenceIndicator": "h"
        },
        "WorkAddress": None,
        "Telephone": None,
        "DirectoryName": DIRECTORY_NAME,
        "PageNumber": page
    }


def main():
    all_records = []
    for path in IMAGE_PATHS:
        if not path.exists():
            print(f"Missing: {path}")
            continue
        page = int(path.stem)
        lines = ocr_image(path)
        entries = group_entries(lines)
        for ent in entries:
            rec = parse_entry(ent, page)
            # Ensure address fields not empty
            if not rec["HomeAddress"]["StreetName"]:
                print(f"Warning: no address parsed in entry '{ent}' on page {page}")
            all_records.append(rec)
    OUTPUT_JSON.write_text(json.dumps(all_records, indent=2), encoding='utf-8')
    print(f"✅ Wrote {len(all_records)} entries to {OUTPUT_JSON}")

if __name__ == '__main__':
    main()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Wrote 186 entries to /Users/darshilshukla/Desktop/combined_one.json


In [10]:
#!/usr/bin/env python3
"""
Full pipeline: Image → text.txt → combined.json
1) Runs pytesseract on images 104–108 to produce text entries (text.txt)
2) Parses text.txt into structured JSON (combined.json)
"""

import re
import json
import cv2
import pytesseract
import spacy
from pathlib import Path

# ——— CONFIGURATION ———
BASE_DIR        = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES     = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS     = [BASE_DIR / n for n in IMAGE_NAMES]
TEXT_FILE       = BASE_DIR / "text_final.txt"
OUTPUT_JSON     = BASE_DIR / "combined_final.json"
DIRECTORY_NAME  = "Minneapolis 1900"
TS_CONFIG       = "--oem 3 --psm 6"

# ——— PARSING PATTERNS ———
SPOUSE_PATTERN = re.compile(r"\(([^)]+)\)")
ADDR_PATTERN   = re.compile(
    r"(?P<number>\d+)\s+"
    r"(?P<street>[A-Za-z0-9\.\s]+?)"
    r"(?:\s*(?P<apt>apt\s*\d+))?",
    re.IGNORECASE
)
OCCUP_KEYWORDS = [
    "salesman","merchant","clerk","engineer","teacher",
    "laborer","driver","barber","baker","physician",
    "carpenter","nurse","pntr","meat ctr"
]

# Load spaCy for fallback entity detection (optional)
nlp = spacy.load("en_core_web_sm")

def ocr_image(path: Path):
    """Run pytesseract OCR and return cleaned lines."""
    img = cv2.imread(str(path))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, config=TS_CONFIG)
    return [ln.strip() for ln in text.splitlines() if ln.strip()]

def group_entries(lines: list[str]):
    """Group lines into entries starting with '"' and splitting on '|'."""
    entries, current = [], None
    for line in lines:
        for seg in line.split("|"):
            seg = seg.strip()
            if not seg:
                if current:
                    entries.append(current)
                    current = None
                continue
            if seg.startswith('"'):
                if current:
                    entries.append(current)
                current = seg
            else:
                if current is not None:
                    current += " " + seg
    if current:
        entries.append(current)
    return entries

def parse_entry(entry: str, page: int):
    """Parse a single entry string into the JSON schema."""
    text = entry.lstrip('"').strip()
    tokens = text.split()
    # Names
    first = tokens[0] if tokens else None
    last  = tokens[1] if len(tokens) > 1 else None
    # Spouse
    m = SPOUSE_PATTERN.search(text)
    spouse = m.group(1) if m else None
    # Address
    m2 = ADDR_PATTERN.search(text)
    if m2:
        number = m2.group("number")
        street = m2.group("street").strip()
        apt    = m2.group("apt")
    else:
        # fallback: look for first numeric token
        number = None
        for tok in tokens:
            if tok.isdigit():
                number = tok
                break
        idx = tokens.index(number) if number and number in tokens else 2
        street = " ".join(tokens[idx+1:idx+4]) if len(tokens) > idx+1 else ""
        apt = None
    # Occupation
    occupation = None
    for kw in OCCUP_KEYWORDS:
        if re.search(rf"\b{kw}\b", text, re.IGNORECASE):
            occupation = kw.title()
            break
    # CompanyName: what's between names and address, minus occupation
    addr_idx = tokens.index(number) if number in tokens else len(tokens)
    middle = tokens[2:addr_idx]
    if occupation:
        middle = [tok for tok in middle if occupation.lower() not in tok.lower()]
    company = " ".join(middle) if middle else None
    # Build record
    return {
        "FirstName": first,
        "LastName": last,
        "Spouse": spouse,
        "Occupation": occupation,
        "CompanyName": company,
        "HomeAddress": {
            "StreetNumber": number or "",
            "StreetName": street or "",
            "ApartmentOrUnit": apt,
            "ResidenceIndicator": "h"
        },
        "WorkAddress": None,
        "Telephone": None,
        "DirectoryName": DIRECTORY_NAME,
        "PageNumber": page
    }

def main():
    # 1) OCR → text.txt
    all_lines = []
    for img in IMAGE_PATHS:
        if img.exists():
            print(f"OCRing {img.name}…")
            all_lines += ocr_image(img)
        else:
            print(f"⚠️ Missing image: {img}")
    entries = group_entries(all_lines)
    TEXT_FILE.write_text("\n".join(entries), encoding="utf-8")
    print(f"✅ Wrote {len(entries)} raw entries to {TEXT_FILE}")

    # 2) Parse → combined.json
    records = []
    for ent in entries:
        # infer page from leading filename segment if present, else None
        # here we just cycle pages in order
        # simpler: record page = None
        rec = parse_entry(ent, page=None)
        # warn if address empty
        if not rec["HomeAddress"]["StreetName"]:
            print(f"⚠️ No address parsed in entry: {ent}")
        records.append(rec)
    OUTPUT_JSON.write_text(json.dumps(records, indent=2), encoding="utf-8")
    print(f"✅ Wrote {len(records)} parsed entries to {OUTPUT_JSON}")

if __name__ == "__main__":
    main()


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [11]:
#!/usr/bin/env python3
"""
Full pipeline: Image → text.txt → combined.json using only pytesseract.

1) Runs pytesseract on images 104–108 to produce text entries (text.txt)
2) Parses text.txt into structured JSON (combined.json)
"""

import re
import json
import cv2
import pytesseract
from pathlib import Path

# ——— CONFIGURATION ———
BASE_DIR        = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES     = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS     = [BASE_DIR / n for n in IMAGE_NAMES]
TEXT_FILE       = BASE_DIR / "text_upodatedddd.txt"
OUTPUT_JSON     = BASE_DIR / "combined_newwww.json"
DIRECTORY_NAME  = "Minneapolis 1900"
TS_CONFIG       = "--oem 3 --psm 6"

# ——— PARSING PATTERNS ———
SPOUSE_PATTERN = re.compile(r"\(([^)]+)\)")
ADDR_PATTERN   = re.compile(
    r"(?P<number>\d+)\s+"
    r"(?P<street>[A-Za-z0-9\.\s]+?)"
    r"(?:\s*(?P<apt>apt\s*\d+))?",
    re.IGNORECASE
)
OCCUP_KEYWORDS = [
    "salesman","merchant","clerk","engineer","teacher",
    "laborer","driver","barber","baker","physician",
    "carpenter","nurse","pntr","meat ctr"
]

def ocr_image(path: Path):
    """Run pytesseract OCR and return cleaned lines."""
    img = cv2.imread(str(path))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, config=TS_CONFIG)
    return [ln.strip() for ln in text.splitlines() if ln.strip()]

def group_entries(lines: list[str]):
    """Group lines into entries starting with '"' and splitting on '|'."""
    entries, current = [], None
    for line in lines:
        for seg in line.split("|"):
            seg = seg.strip()
            if not seg:
                if current:
                    entries.append(current)
                    current = None
                continue
            if seg.startswith('"'):
                if current:
                    entries.append(current)
                current = seg
            else:
                if current is not None:
                    current += " " + seg
    if current:
        entries.append(current)
    return entries

def parse_entry(entry: str, page: int):
    """Parse a single entry string into the JSON schema."""
    text = entry.lstrip('"').strip()
    tokens = text.split()
    # Names
    first = tokens[0] if tokens else ""
    last  = tokens[1] if len(tokens) > 1 else ""
    # Spouse
    m = SPOUSE_PATTERN.search(text)
    spouse = m.group(1) if m else None
    # Address
    m2 = ADDR_PATTERN.search(text)
    if m2:
        number = m2.group("number")
        street = m2.group("street").strip()
        apt    = m2.group("apt")
    else:
        # Fallback: take first numeric token
        number = next((tok for tok in tokens if tok.isdigit()), "")
        idx = tokens.index(number) if number in tokens else 2
        street = " ".join(tokens[idx+1:idx+4]) if len(tokens) > idx+1 else ""
        apt = None
    # Occupation
    occupation = next(
        (kw.title() for kw in OCCUP_KEYWORDS if re.search(rf"\b{kw}\b", text, re.IGNORECASE)),
        None
    )
    # CompanyName: tokens between names and address, minus occupation
    addr_idx = tokens.index(number) if number in tokens else len(tokens)
    middle = tokens[2:addr_idx]
    if occupation:
        middle = [tok for tok in middle if occupation.lower() not in tok.lower()]
    company = " ".join(middle) if middle else None

    return {
        "FirstName": first,
        "LastName": last,
        "Spouse": spouse,
        "Occupation": occupation,
        "CompanyName": company,
        "HomeAddress": {
            "StreetNumber": number,
            "StreetName": street,
            "ApartmentOrUnit": apt,
            "ResidenceIndicator": "h"
        },
        "WorkAddress": None,
        "Telephone": None,
        "DirectoryName": DIRECTORY_NAME,
        "PageNumber": page
    }

def main():
    # Step 1: OCR → text.txt
    all_lines = []
    for img in IMAGE_PATHS:
        if img.exists():
            print(f"OCRing {img.name}…")
            all_lines += ocr_image(img)
        else:
            print(f"⚠️ Missing image: {img}")
    entries = group_entries(all_lines)
    TEXT_FILE.write_text("\n".join(entries), encoding="utf-8")
    print(f"✅ Wrote {len(entries)} entries to {TEXT_FILE}")

    # Step 2: Parse → combined.json
    records = []
    for ent in entries:
        rec = parse_entry(ent, page=None)
        if not rec["HomeAddress"]["StreetName"]:
            print(f"⚠️ No address parsed for entry: {ent}")
        records.append(rec)
    OUTPUT_JSON.write_text(json.dumps(records, indent=2), encoding="utf-8")
    print(f"✅ Wrote {len(records)} parsed entries to {OUTPUT_JSON}")

if __name__ == "__main__":
    main()


OCRing 104.png…


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OCRing 105.png…


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OCRing 106.png…


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OCRing 107.png…


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OCRing 108.png…


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Wrote 186 entries to /Users/darshilshukla/Desktop/text_upodatedddd.txt
⚠️ No address parsed for entry: " allen (Agnes) driver Widholm Transfer 1612
✅ Wrote 186 parsed entries to /Users/darshilshukla/Desktop/combined_newwww.json


In [12]:
#!/usr/bin/env python3
"""
Improved Batch OCR → single text file.
Features:
 - Deskew + denoise + adaptive threshold
 - 2‑D clustering for robust region segmentation
 - Parallel processing of pages
 - Detailed logging and error handling
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from typing import List, Tuple, Optional
from sklearn.cluster import AgglomerativeClustering
import concurrent.futures
import logging

# ——— CONFIGURATION ———
BASE_DIR       = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES    = ["104.png", "105.png", "106.png", "107.png", "108.png"]
OUTPUT_FILE    = BASE_DIR / "text_another_improved.txt"
TS_CONFIG      = "--oem 3 --psm 6"
PADDING        = 10
HEADER_RATIO   = 0.18
FOOTER_RATIO   = 0.82
NOISE_KERNEL   = (3, 3)

# setup logging
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s")

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

def deskew(gray: np.ndarray) -> np.ndarray:
    # estimate rotation angle
    coords = np.column_stack(np.where(gray < 255))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = gray.shape
    M = cv2.getRotationMatrix2D((w/2, h/2), angle, 1.0)
    return cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_LINEAR)

def preprocess(img: np.ndarray) -> np.ndarray:
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = deskew(gray)
    # adaptive threshold
    th = cv2.adaptiveThreshold(gray, 255,
                               cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY_INV,
                               15, 10)
    # morphological opening to remove speckles
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, NOISE_KERNEL)
    clean = cv2.morphologyEx(th, cv2.MORPH_OPEN, kernel)
    # invert back
    return cv2.bitwise_not(clean)

def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes: List[WordBox] = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        boxes.append(WordBox(txt, l, t, w, h, l + w/2, t + h/2))
    return boxes

def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2 - x1, y2 - y1

def cluster_regions(boxes: List[WordBox],
                    n_clusters: int,
                    dims: Tuple[np.ndarray, np.ndarray]) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    coords = np.column_stack([dims[0], dims[1]])
    model = AgglomerativeClustering(n_clusters=n_clusters).fit(coords)
    regions = []
    for lbl in range(n_clusters):
        members = [b for b, l in zip(boxes, model.labels_) if l == lbl]
        regions.append(compute_bbox(members))
    regions.sort(key=lambda r: (r[1], r[0]))  # sort top→down, left→right
    return regions

def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    header_h = int(h * HEADER_RATIO)
    footer_h = int(h * FOOTER_RATIO)
    # split into header/body/footer
    header = [b for b in boxes if b.cy < header_h]
    body   = [b for b in boxes if header_h <= b.cy <= footer_h]
    footer = [b for b in boxes if b.cy > footer_h]

    regions: List[Tuple[int,int,int,int]] = []
    regions += cluster_regions(header, n_clusters=3, dims=(np.array([b.cx for b in header]), np.array([b.cy for b in header])))
    regions += cluster_regions(body,   n_clusters=2, dims=(np.array([b.cx for b in body]),   np.array([b.cy for b in body])))
    if footer:
        regions.append(compute_bbox(footer))

    # pad
    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, x - PADDING)
        ya = max(0, y - PADDING)
        xb = min(w, x + rw + PADDING)
        yb = min(h, y + rh + PADDING)
        padded.append((xa, ya, xb - xa, yb - ya))
    return padded

def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> List[str]:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return text.splitlines()

def process_page(img_path: Path) -> List[str]:
    logging.info(f"Processing {img_path.name}")
    img = cv2.imread(str(img_path))
    if img is None:
        logging.warning(f"Could not read {img_path}")
        return []
    prep = preprocess(img)
    boxes = get_word_boxes(prep)
    regions = extract_regions(boxes, img.shape[:2])

    entries: List[str] = []
    current: Optional[str] = None

    for region in regions:
        for line in ocr_region(img, region):
            raw = line.strip().replace('|', '')
            if not raw:
                if current:
                    entries.append(current)
                    current = None
                continue
            if raw.startswith('"'):
                if current:
                    entries.append(current)
                current = raw
            elif current:
                current += ' ' + raw

    if current:
        entries.append(current)
    logging.info(f"Found {len(entries)} entries in {img_path.name}")
    return entries

def main():
    OUTPUT_FILE.parent.mkdir(exist_ok=True)
    all_entries: List[str] = []

    with concurrent.futures.ProcessPoolExecutor() as exe:
        futures = {exe.submit(process_page, BASE_DIR / name): name for name in IMAGE_NAMES}
        for fut in concurrent.futures.as_completed(futures):
            all_entries.extend(fut.result())

    # deduplicate & write
    seen = set()
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for ent in all_entries:
            if ent not in seen:
                out_f.write(ent + "\n")
                seen.add(ent)

    logging.info(f"Saved {len(seen)} unique entries to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [13]:
#!/usr/bin/env python3
"""
Improved Batch OCR → single text file.
Features:
 - Deskew + denoise + adaptive threshold
 - 2D clustering for robust region segmentation
 - Parallel processing of pages with process→thread fallback
 - Detailed logging and error handling
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from typing import List, Tuple, Optional
from sklearn.cluster import AgglomerativeClustering
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import logging

# ——— CONFIGURATION ———
BASE_DIR       = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES    = ["104.png", "105.png", "106.png", "107.png", "108.png"]
OUTPUT_FILE    = BASE_DIR / "text_another_improved.txt"
TS_CONFIG      = "--oem 3 --psm 6"
PADDING        = 10
HEADER_RATIO   = 0.18
FOOTER_RATIO   = 0.82
NOISE_KERNEL   = (3, 3)

# setup logging
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s")

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

def deskew(gray: np.ndarray) -> np.ndarray:
    coords = np.column_stack(np.where(gray < 255))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = gray.shape
    M = cv2.getRotationMatrix2D((w/2, h/2), angle, 1.0)
    return cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_LINEAR)

def preprocess(img: np.ndarray) -> np.ndarray:
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = deskew(gray)
    # adaptive threshold (inverse for noise removal)
    th = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        15, 10
    )
    # morphological opening to remove speckles
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, NOISE_KERNEL)
    clean = cv2.morphologyEx(th, cv2.MORPH_OPEN, kernel)
    # invert back for OCR
    return cv2.bitwise_not(clean)

def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes: List[WordBox] = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        boxes.append(WordBox(txt, l, t, w, h, l + w/2, t + h/2))
    return boxes

def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2 - x1, y2 - y1

def cluster_regions(
    boxes: List[WordBox],
    n_clusters: int,
    dims: Tuple[np.ndarray, np.ndarray]
) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    coords = np.column_stack([dims[0], dims[1]])
    model = AgglomerativeClustering(n_clusters=n_clusters).fit(coords)
    regions = []
    for lbl in range(n_clusters):
        members = [b for b, l in zip(boxes, model.labels_) if l == lbl]
        regions.append(compute_bbox(members))
    # sort by vertical then horizontal position
    regions.sort(key=lambda r: (r[1], r[0]))
    return regions

def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    header_h = int(h * HEADER_RATIO)
    footer_h = int(h * FOOTER_RATIO)

    header = [b for b in boxes if b.cy < header_h]
    body   = [b for b in boxes if header_h <= b.cy <= footer_h]
    footer = [b for b in boxes if b.cy > footer_h]

    regions: List[Tuple[int,int,int,int]] = []
    regions += cluster_regions(header, n_clusters=3,
                               dims=(np.array([b.cx for b in header]), np.array([b.cy for b in header])))
    regions += cluster_regions(body,   n_clusters=2,
                               dims=(np.array([b.cx for b in body]), np.array([b.cy for b in body])))
    if footer:
        regions.append(compute_bbox(footer))

    # pad regions
    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, x - PADDING)
        ya = max(0, y - PADDING)
        xb = min(w, x + rw + PADDING)
        yb = min(h, y + rh + PADDING)
        padded.append((xa, ya, xb - xa, yb - ya))
    return padded

def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> List[str]:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return text.splitlines()

def process_page(img_path: Path) -> List[str]:
    try:
        logging.info(f"Processing {img_path.name}")
        img = cv2.imread(str(img_path))
        if img is None:
            logging.warning(f"Could not read {img_path}")
            return []
        prep = preprocess(img)
        boxes = get_word_boxes(prep)
        regions = extract_regions(boxes, img.shape[:2])

        entries: List[str] = []
        current: Optional[str] = None

        for region in regions:
            for line in ocr_region(img, region):
                raw = line.strip().replace('|', '')
                if not raw:
                    if current:
                        entries.append(current)
                        current = None
                    continue
                if raw.startswith('"'):
                    if current:
                        entries.append(current)
                    current = raw
                elif current:
                    current += ' ' + raw

        if current:
            entries.append(current)

        logging.info(f"Found {len(entries)} entries in {img_path.name}")
        return entries

    except Exception as e:
        logging.error(f"Error processing {img_path.name}: {e}", exc_info=True)
        return []

def main():
    OUTPUT_FILE.parent.mkdir(exist_ok=True)
    all_entries: List[str] = []

    # first try process pool
    try:
        with ProcessPoolExecutor() as exe:
            futures = {exe.submit(process_page, BASE_DIR / name): name for name in IMAGE_NAMES}
            for fut in as_completed(futures):
                all_entries.extend(fut.result())
    except Exception as pool_err:
        logging.warning(f"ProcessPool failed ({pool_err}), falling back to ThreadPool")
        with ThreadPoolExecutor() as exe:
            futures = {exe.submit(process_page, BASE_DIR / name): name for name in IMAGE_NAMES}
            for fut in as_completed(futures):
                all_entries.extend(fut.result())

    # dedupe & write
    seen = set()
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for ent in all_entries:
            if ent not in seen:
                out_f.write(ent + "\n")
                seen.add(ent)

    logging.info(f"Saved {len(seen)} unique entries to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Process SpawnProcess-6:
Traceback (most recent call last):
  File "/Users/darshilshukla/anaconda3/lib/python3.12/multiprocessing/process.py", line 314,

In [1]:
#!/usr/bin/env python3
"""
Batch OCR for multiple directory pages → single text file using user’s default Desktop path.
Extracts entries that start with '"', continues until a '|' partition, removes all '|' characters,
and writes each entry on a new line in text.txt.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIGURATION ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "text_another.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 10
MARGIN_RATIO  = 0.15
HEADER_RATIO  = 0.18
FOOTER_RATIO  = 0.82

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

# Extract word-level boxes via Tesseract
def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cx, cy = l + w/2, t + h/2
        boxes.append(WordBox(txt, l, t, w, h, cx, cy))
    return boxes

# Compute bounding box for a list of WordBoxes
def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2 - x1, y2 - y1

# Cluster WordBoxes into columns (headers/body)
def cluster_stripe(boxes: List[WordBox], n_clusters: int) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    X = np.array([[b.cx] for b in boxes])
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    regions = []
    for lbl in range(n_clusters):
        members = [b for b, l in zip(boxes, km.labels_) if l == lbl]
        regions.append(compute_bbox(members))
    regions.sort(key=lambda r: r[0])
    return regions

# Extract seven regions per page: margin, 3 header cols, 2 body cols, footer
def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    margin_thresh = w * MARGIN_RATIO
    margin_boxes = [b for b in boxes if b.cx < margin_thresh]
    rem_boxes    = [b for b in boxes if b.cx >= margin_thresh]
    header_h     = h * HEADER_RATIO
    footer_h     = h * FOOTER_RATIO

    header_boxes = [b for b in rem_boxes if b.cy < header_h]
    body_boxes   = [b for b in rem_boxes if header_h <= b.cy <= footer_h]
    footer_boxes = [b for b in rem_boxes if b.cy > footer_h]

    regions = []
    if margin_boxes:
        regions.append(compute_bbox(margin_boxes))
    regions += cluster_stripe(header_boxes, 3)
    regions += cluster_stripe(body_boxes,   2)
    if footer_boxes:
        regions.append(compute_bbox(footer_boxes))

    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, int(x - PADDING))
        ya = max(0, int(y - PADDING))
        xb = min(w, int(x + rw + PADDING))
        yb = min(h, int(y + rh + PADDING))
        padded.append((xa, ya, xb - xa, yb - ya))
    return padded

# OCR a region and return its text lines
def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> List[str]:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return text.splitlines()

# Main: process images, parse entries starting with '"', end on '|', remove all '|'

def main():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    entries: List[str] = []

    for img_path in IMAGE_PATHS:
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        word_boxes = get_word_boxes(gray)
        regions    = extract_regions(word_boxes, img.shape[:2])

        current: str = None
        for region in regions:
            lines = ocr_region(img, region)
            for line in lines:
                raw = line.strip().replace('|', '')
                if not raw:
                    if current:
                        entries.append(current)
                        current = None
                    continue
                if raw.startswith('"'):
                    if current:
                        entries.append(current)
                    current = raw
                else:
                    if current:
                        current += ' ' + raw
        if current:
            entries.append(current)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for ent in entries:
            out_f.write(ent + "\n")

    print(f"All OCR entries saved to: {OUTPUT_FILE}")

if __name__ == '__main__':
    main()

All OCR entries saved to: /Users/darshilshukla/Desktop/text_another.txt


In [3]:
#!/usr/bin/env python3
"""
High‑robustness batch OCR for Minneapolis‑directory pages.
  • Deskew, adaptive‑threshold, morphological denoise
  • 2‑D agglomerative clustering to segment header/body/footer columns
  • Concurrency: ProcessPool → ThreadPool → single‑thread fallback
  • Bullet‑proof: every page wrapped in try/except, so workers never crash
Output: “text_another_improved.txt” on your Desktop, one entry per line
"""

import cv2
import pytesseract
import numpy as np
import shutil                 # ← added
import logging
import os
from pathlib import Path
from dataclasses import dataclass
from typing import List, Tuple, Optional
from sklearn.cluster import AgglomerativeClustering
from concurrent.futures import (
    ProcessPoolExecutor, ThreadPoolExecutor, as_completed, Executor
)

# ───────── CONFIG ─────────
BASE_DIR       = Path.home() / "Desktop"
IMAGE_NAMES    = ["104.png", "105.png", "106.png", "107.png", "108.png"]
OUTPUT_FILE    = BASE_DIR / "text_another_improved.txt"

TS_CONFIG      = "--oem 3 --psm 6"
PADDING        = 10
HEADER_RATIO   = 0.18
FOOTER_RATIO   = 0.82
NOISE_KERNEL   = (3, 3)
CONF_THRESH    = 30.0          # ignore low‑confidence words

# ───────── LOGGING ─────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

@dataclass
class WordBox:
    text:   str
    left:   int
    top:    int
    width:  int
    height: int
    cx:     float
    cy:     float

# ───────── PRE‑PROCESSING ─────────
def deskew(gray: np.ndarray) -> np.ndarray:
    coords = np.column_stack(np.where(gray < 255))
    if coords.size == 0:
        return gray
    angle = cv2.minAreaRect(coords)[-1]
    angle = -(90 + angle) if angle < -45 else -angle
    h, w  = gray.shape
    M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
    return cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_LINEAR)

def preprocess(bgr: np.ndarray) -> np.ndarray:
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    gray = deskew(gray)
    thr = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,
        15, 10
    )
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, NOISE_KERNEL)
    clean  = cv2.morphologyEx(thr, cv2.MORPH_OPEN, kernel)
    return cv2.bitwise_not(clean)

# ───────── WORD BOX EXTRACTION ─────────
def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes: List[WordBox] = []
    for i, txt in enumerate(data["text"]):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data["conf"][i])
        except ValueError:
            continue
        if conf < CONF_THRESH:
            continue
        l, t  = data["left"][i], data["top"][i]
        w, h  = data["width"][i], data["height"][i]
        boxes.append(WordBox(txt, l, t, w, h, l + w / 2.0, t + h / 2.0))
    return boxes

# ───────── REGION SEGMENTATION ─────────
def compute_bbox(boxes: List[WordBox]) -> Tuple[int, int, int, int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    return min(xs), min(ys), max(xs) - min(xs), max(ys) - min(ys)

def cluster_regions(boxes: List[WordBox], n_clusters: int) \
        -> List[Tuple[int, int, int, int]]:
    if not boxes:
        return []
    coords = np.column_stack([[b.cx for b in boxes], [b.cy for b in boxes]])
    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(coords)
    reg = []
    for lbl in range(n_clusters):
        members = [b for b, l in zip(boxes, labels) if l == lbl]
        reg.append(compute_bbox(members))
    reg.sort(key=lambda r: (r[1], r[0]))  # sort top→bottom, left→right
    return reg

def extract_regions(boxes: List[WordBox], shape: Tuple[int, int]) \
        -> List[Tuple[int, int, int, int]]:
    h, w = shape
    header_cut = int(h * HEADER_RATIO)
    footer_cut = int(h * FOOTER_RATIO)

    header = [b for b in boxes if b.cy < header_cut]
    body   = [b for b in boxes if header_cut <= b.cy <= footer_cut]
    footer = [b for b in boxes if b.cy > footer_cut]

    regions: List[Tuple[int, int, int, int]] = []
    regions += cluster_regions(header, 3)
    regions += cluster_regions(body,   2)
    if footer:
        regions.append(compute_bbox(footer))

    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, x - PADDING)
        ya = max(0, y - PADDING)
        xb = min(w, x + rw + PADDING)
        yb = min(h, y + rh + PADDING)
        padded.append((xa, ya, xb - xa, yb - ya))
    return padded

# ───────── REGION OCR ─────────
def ocr_region(bgr: np.ndarray, region: Tuple[int, int, int, int]) -> List[str]:
    x, y, rw, rh = region
    crop = bgr[y : y + rh, x : x + rw]
    return pytesseract.image_to_string(crop, config=TS_CONFIG).splitlines()

# ───────── PER‑PAGE PIPELINE ─────────
def process_page(img_path: Path) -> List[str]:
    try:
        logging.info(f"→ {img_path.name}")
        img = cv2.imread(str(img_path))
        if img is None:
            logging.warning(f"Cannot read {img_path}")
            return []

        prep    = preprocess(img)
        boxes   = get_word_boxes(prep)
        regions = extract_regions(boxes, img.shape[:2])

        entries, current = [], None
        for reg in regions:
            for line in ocr_region(img, reg):
                raw = line.strip().replace("|", "")
                if not raw:
                    if current:
                        entries.append(current)
                        current = None
                    continue
                if raw.startswith('"'):
                    if current:
                        entries.append(current)
                    current = raw
                elif current:
                    current += " " + raw
        if current:
            entries.append(current)

        logging.info(f"  {len(entries)} entries")
        return entries

    except Exception as exc:
        logging.error(f"[{img_path.name}] {exc}", exc_info=True)
        return []

# ───────── EXECUTION HELPERS ─────────
def run_with_executor(executor_cls: type[Executor]) -> List[str]:
    entries: List[str] = []
    with executor_cls() as ex:
        futs = {ex.submit(process_page, BASE_DIR / n): n for n in IMAGE_NAMES}
        for fut in as_completed(futs):
            entries.extend(fut.result())
    return entries

def main() -> None:
    OUTPUT_FILE.parent.mkdir(exist_ok=True)

    for executor in (ProcessPoolExecutor, ThreadPoolExecutor):
        try:
            all_entries = run_with_executor(executor)
            break
        except Exception as e:
            logging.warning(f"{executor.__name__} failed → {e}")
    else:  # if both pools fail, run single‑threaded
        logging.warning("Falling back to single‑thread mode.")
        all_entries = [e for n in IMAGE_NAMES
                       for e in process_page(BASE_DIR / n)]

    # deduplicate + write
    seen = set()
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for ent in all_entries:
            if ent not in seen:
                f.write(ent + "\n")
                seen.add(ent)

    logging.info(f"✓ Saved {len(seen)} unique entries → {OUTPUT_FILE}")

# ───────── RUN ─────────
if __name__ == "__main__":
    if not shutil.which("tesseract"):
        logging.warning("Tesseract binary not found on PATH; install or add it.")
    main()


Process SpawnProcess-2:
Process SpawnProcess-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/darshilshukla/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/darshilshukla/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/darshilshukla/anaconda3/lib/python3.12/concurrent/futures/process.py", line 251, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/darshilshukla/anaconda3/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'process_page' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
  File "/Users/darshilshukla/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.ru

In [4]:
#!/usr/bin/env python3
"""
Full pipeline: Image → text.txt → combined.json using only pytesseract.

1) Runs pytesseract on images 104–108 to produce text entries (text.txt)
2) Parses text.txt into structured JSON (combined.json)
"""

import re
import json
import cv2
import pytesseract
from pathlib import Path

# ——— CONFIGURATION ———
BASE_DIR        = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES     = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS     = [BASE_DIR / n for n in IMAGE_NAMES]
TEXT_FILE       = BASE_DIR / "text.txt"
OUTPUT_JSON     = BASE_DIR / "combined.json"
DIRECTORY_NAME  = "Minneapolis 1900"
TS_CONFIG       = "--oem 3 --psm 6"

# ——— PARSING PATTERNS ———
SPOUSE_PATTERN = re.compile(r"\(([^)]+)\)")
ADDR_PATTERN   = re.compile(
    r"(?P<number>\d+)\s+"
    r"(?P<street>[A-Za-z0-9\.\s]+?)"
    r"(?:\s*(?P<apt>apt\s*\d+))?",
    re.IGNORECASE
)
OCCUP_KEYWORDS = [
    "salesman","merchant","clerk","engineer","teacher",
    "laborer","driver","barber","baker","physician",
    "carpenter","nurse","pntr","meat ctr"
]

def ocr_image(path: Path):
    """Run pytesseract OCR and return cleaned lines."""
    img = cv2.imread(str(path))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, config=TS_CONFIG)
    return [ln.strip() for ln in text.splitlines() if ln.strip()]

def group_entries(lines: list[str]):
    """Group lines into entries starting with '"' and splitting on '|'."""
    entries, current = [], None
    for line in lines:
        for seg in line.split("|"):
            seg = seg.strip()
            if not seg:
                if current:
                    entries.append(current)
                    current = None
                continue
            if seg.startswith('"'):
                if current:
                    entries.append(current)
                current = seg
            else:
                if current is not None:
                    current += " " + seg
    if current:
        entries.append(current)
    return entries

def parse_entry(entry: str, page: int):
    """Parse a single entry string into the JSON schema."""
    text = entry.lstrip('"').strip()
    tokens = text.split()
    # Names
    first = tokens[0] if tokens else ""
    last  = tokens[1] if len(tokens) > 1 else ""
    # Spouse
    m = SPOUSE_PATTERN.search(text)
    spouse = m.group(1) if m else None
    # Address
    m2 = ADDR_PATTERN.search(text)
    if m2:
        number = m2.group("number")
        street = m2.group("street").strip()
        apt    = m2.group("apt")
    else:
        # Fallback: take first numeric token
        number = next((tok for tok in tokens if tok.isdigit()), "")
        idx = tokens.index(number) if number in tokens else 2
        street = " ".join(tokens[idx+1:idx+4]) if len(tokens) > idx+1 else ""
        apt = None
    # Occupation
    occupation = next(
        (kw.title() for kw in OCCUP_KEYWORDS if re.search(rf"\b{kw}\b", text, re.IGNORECASE)),
        None
    )
    # CompanyName: tokens between names and address, minus occupation
    addr_idx = tokens.index(number) if number in tokens else len(tokens)
    middle = tokens[2:addr_idx]
    if occupation:
        middle = [tok for tok in middle if occupation.lower() not in tok.lower()]
    company = " ".join(middle) if middle else None

    return {
        "FirstName": first,
        "LastName": last,
        "Spouse": spouse,
        "Occupation": occupation,
        "CompanyName": company,
        "HomeAddress": {
            "StreetNumber": number,
            "StreetName": street,
            "ApartmentOrUnit": apt,
            "ResidenceIndicator": "h"
        },
        "WorkAddress": None,
        "Telephone": None,
        "DirectoryName": DIRECTORY_NAME,
        "PageNumber": page
    }

def main():
    # Step 1: OCR → text.txt
    all_lines = []
    for img in IMAGE_PATHS:
        if img.exists():
            print(f"OCRing {img.name}…")
            all_lines += ocr_image(img)
        else:
            print(f"⚠️ Missing image: {img}")
    entries = group_entries(all_lines)
    TEXT_FILE.write_text("\n".join(entries), encoding="utf-8")
    print(f"✅ Wrote {len(entries)} entries to {TEXT_FILE}")

    # Step 2: Parse → combined.json
    records = []
    for ent in entries:
        rec = parse_entry(ent, page=None)
        if not rec["HomeAddress"]["StreetName"]:
            print(f"⚠️ No address parsed for entry: {ent}")
        records.append(rec)
    OUTPUT_JSON.write_text(json.dumps(records, indent=2), encoding="utf-8")
    print(f"✅ Wrote {len(records)} parsed entries to {OUTPUT_JSON}")

if __name__ == "__main__":
    main()


OCRing 104.png…
OCRing 105.png…
OCRing 106.png…
OCRing 107.png…
OCRing 108.png…
✅ Wrote 186 entries to /Users/darshilshukla/Desktop/text.txt
⚠️ No address parsed for entry: " allen (Agnes) driver Widholm Transfer 1612
✅ Wrote 186 parsed entries to /Users/darshilshukla/Desktop/combined.json


In [5]:
pip install pillow pytesseract opencv-python numpy

Note: you may need to restart the kernel to use updated packages.


In [12]:
#!/usr/bin/env python3
"""
Pillow + OpenCV Box‑based OCR → text.txt (final newline fix)
"""
from PIL import Image, ImageOps, ImageFilter
import pytesseract
import cv2
import numpy as np
from pathlib import Path

# CONFIG
BASE_DIR = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE = BASE_DIR / "text.txt"
TS_CONFIG = "--oem 3 --psm 6"
PADDING = 5

# Preprocess image ➜ binary numpy array
def preprocess(img_pil: Image.Image) -> np.ndarray:
    gray = ImageOps.grayscale(img_pil).filter(ImageFilter.SHARPEN)
    bw = gray.point(lambda x: 0 if x < 128 else 255, mode="1")
    return np.array(bw, dtype=np.uint8)

# Detect block boxes
def detect_boxes(bw: np.ndarray):
    inv = cv2.bitwise_not(bw)
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
    hor = cv2.dilate(inv, hor_kernel, iterations=2)
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 20))
    ver = cv2.dilate(hor, ver_kernel, iterations=2)
    contours, _ = cv2.findContours(ver, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    h, w = bw.shape
    for c in contours:
        x, y, cw, ch = cv2.boundingRect(c)
        if cw * ch < 1000:
            continue
        xa = max(0, x - PADDING)
        ya = max(0, y - PADDING)
        xb = min(w, x + cw + PADDING)
        yb = min(h, y + ch + PADDING)
        boxes.append((xa, ya, xb - xa, yb - ya))
    boxes.sort(key=lambda b: (b[1], b[0]))
    return boxes

# OCR a single box
def ocr_box(img_pil: Image.Image, box):
    x, y, w, h = box
    crop = img_pil.crop((x, y, x + w, y + h))
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return [ln.strip() for ln in text.splitlines() if ln.strip()]

# Group lines into directory entries
def group_entries(lines):
    entries, current = [], None
    for line in lines:
        for seg in line.split("|"):
            seg = seg.strip()
            if not seg:
                if current:
                    entries.append(current)
                    current = None
                continue
            if seg.startswith('"'):
                if current:
                    entries.append(current)
                current = seg
            else:
                if current is not None:
                    current += " " + seg
    if current:
        entries.append(current)
    return entries

# MAIN
def main():
    all_lines: list[str] = []
    for img_path in IMAGE_PATHS:
        if not img_path.exists():
            print(f"⚠️ Missing image {img_path}")
            continue
        print(f"Processing {img_path.name} …")
        img_pil = Image.open(img_path)
        bw = preprocess(img_pil)
        boxes = detect_boxes(bw)
        for box in boxes:
            all_lines.extend(ocr_box(img_pil, box))

    entries = group_entries(all_lines)
    OUTPUT_FILE.parent.mkdir(exist_ok=True)
    with OUTPUT_FILE.open('w', encoding='utf-8') as f:
        for ent in entries:
            f.write(ent + "\n")
    print(f"✅ Saved {len(entries)} entries → {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


Processing 104.png …
Processing 105.png …
Processing 106.png …
Processing 107.png …
Processing 108.png …
✅ Saved 186 entries → /Users/darshilshukla/Desktop/text.txt


In [14]:
import pytesseract
from pytesseract import Output
from PIL import Image
from pathlib import Path
import re

INPUT_DIR = Path("/Users/darshilshukla/Desktop/")
OUTPUT_FILE = Path("/Users/darshilshukla/Desktop/output/combined_entries.txt")

# Create output folder if it doesn't exist
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

def extract_sorted_text_boxes(image_path):
    """Run OCR and return boxes sorted top-down, left-right"""
    image = Image.open(image_path).convert("RGB")
    data = pytesseract.image_to_data(image, output_type=Output.DICT)
    n_boxes = len(data['text'])

    boxes = []
    for i in range(n_boxes):
        if int(data['conf'][i]) > 20 and data['text'][i].strip():
            left, top, width, height = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
            word = data['text'][i].strip()
            boxes.append((left, top, word))

    # Sort top-to-bottom, then left-to-right
    boxes.sort(key=lambda b: (b[1], b[0]))
    return boxes

def assemble_entries(boxes):
    """Assemble OCR text into entries starting with `"` and ignoring text after `|`"""
    lines = []
    line = ""
    current_y = None

    for i, (x, y, word) in enumerate(boxes):
        if current_y is None or abs(y - current_y) > 15:
            if line.strip():
                lines.append(line.strip())
            line = word
            current_y = y
        else:
            line += " " + word
    if line.strip():
        lines.append(line.strip())

    # Now clean and extract full entries
    entries = []
    current_entry = ""

    for line in lines:
        line = line.split("|")[0].strip()  # stop reading at "|"
        if not line:
            continue
        if line.startswith('"'):
            if current_entry:
                entries.append(current_entry.strip())
            current_entry = line
        else:
            current_entry += " " + line
    if current_entry:
        entries.append(current_entry.strip())
    return entries

def main():
    all_entries = []
    for i in range(104, 108):
        img_path = INPUT_DIR / f"{i}.png"
        if not img_path.exists():
            print(f"⚠️ Image not found: {img_path}")
            continue
        boxes = extract_sorted_text_boxes(img_path)
        entries = assemble_entries(boxes)
        all_entries.extend(entries)
        print(f"✅ Processed: {img_path.name} → {len(entries)} entries")

    # Write to text file
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for ent in all_entries:
            f.write(ent + "\n")

    print(f"\n📄 Output written to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


✅ Processed: 104.png → 31 entries
✅ Processed: 105.png → 18 entries
✅ Processed: 106.png → 23 entries
✅ Processed: 107.png → 37 entries

📄 Output written to: /Users/darshilshukla/Desktop/output/combined_entries.txt


In [15]:
#!/usr/bin/env python3
"""
Batch OCR for multiple directory pages → single text file using user’s default Desktop path.
Extracts entries that start with '"', continues until a '|' partition, removes all '|' characters,
and writes each entry on a new line in text.txt.
"""

import cv2
import pytesseract
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from dataclasses import dataclass
from typing import List, Tuple

# ——— CONFIGURATION ———
BASE_DIR      = Path("/Users/darshilshukla/Desktop")
IMAGE_NAMES   = ["104.png", "105.png", "106.png", "107.png", "108.png"]
IMAGE_PATHS   = [BASE_DIR / name for name in IMAGE_NAMES]
OUTPUT_FILE   = BASE_DIR / "text_another_one.txt"
TS_CONFIG     = "--oem 3 --psm 6"
PADDING       = 10
MARGIN_RATIO  = 0.15
HEADER_RATIO  = 0.18
FOOTER_RATIO  = 0.82

@dataclass
class WordBox:
    text: str
    left: int
    top: int
    width: int
    height: int
    cx: float
    cy: float

# Extract word-level boxes via Tesseract
def get_word_boxes(gray: np.ndarray) -> List[WordBox]:
    data = pytesseract.image_to_data(
        gray, output_type=pytesseract.Output.DICT, config=TS_CONFIG
    )
    boxes = []
    for i, txt in enumerate(data['text']):
        txt = txt.strip()
        if not txt:
            continue
        try:
            conf = float(data['conf'][i])
        except ValueError:
            continue
        if conf < 30:
            continue
        l, t = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cx, cy = l + w/2, t + h/2
        boxes.append(WordBox(txt, l, t, w, h, cx, cy))
    return boxes

# Compute bounding box for a list of WordBoxes
def compute_bbox(boxes: List[WordBox]) -> Tuple[int,int,int,int]:
    xs = [b.left for b in boxes] + [b.left + b.width for b in boxes]
    ys = [b.top  for b in boxes] + [b.top  + b.height for b in boxes]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return x1, y1, x2 - x1, y2 - y1

# Cluster WordBoxes into columns (headers/body)
def cluster_stripe(boxes: List[WordBox], n_clusters: int) -> List[Tuple[int,int,int,int]]:
    if not boxes:
        return []
    X = np.array([[b.cx] for b in boxes])
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    regions = []
    for lbl in range(n_clusters):
        members = [b for b, l in zip(boxes, km.labels_) if l == lbl]
        regions.append(compute_bbox(members))
    regions.sort(key=lambda r: r[0])
    return regions

# Extract seven regions per page: margin, 3 header cols, 2 body cols, footer
def extract_regions(boxes: List[WordBox], img_shape: Tuple[int,int]) -> List[Tuple[int,int,int,int]]:
    h, w = img_shape
    margin_thresh = w * MARGIN_RATIO
    margin_boxes = [b for b in boxes if b.cx < margin_thresh]
    rem_boxes    = [b for b in boxes if b.cx >= margin_thresh]
    header_h     = h * HEADER_RATIO
    footer_h     = h * FOOTER_RATIO

    header_boxes = [b for b in rem_boxes if b.cy < header_h]
    body_boxes   = [b for b in rem_boxes if header_h <= b.cy <= footer_h]
    footer_boxes = [b for b in rem_boxes if b.cy > footer_h]

    regions = []
    if margin_boxes:
        regions.append(compute_bbox(margin_boxes))
    regions += cluster_stripe(header_boxes, 3)
    regions += cluster_stripe(body_boxes,   2)
    if footer_boxes:
        regions.append(compute_bbox(footer_boxes))

    padded = []
    for x, y, rw, rh in regions:
        xa = max(0, int(x - PADDING))
        ya = max(0, int(y - PADDING))
        xb = min(w, int(x + rw + PADDING))
        yb = min(h, int(y + rh + PADDING))
        padded.append((xa, ya, xb - xa, yb - ya))
    return padded

# OCR a region and return its text lines
def ocr_region(img: np.ndarray, region: Tuple[int,int,int,int]) -> List[str]:
    x, y, rw, rh = region
    crop = img[y:y+rh, x:x+rw]
    text = pytesseract.image_to_string(crop, config=TS_CONFIG)
    return text.splitlines()

# Main: process images, parse entries starting with '"', end on '|', remove all '|'

def main():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    entries: List[str] = []

    for img_path in IMAGE_PATHS:
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        word_boxes = get_word_boxes(gray)
        regions    = extract_regions(word_boxes, img.shape[:2])

        current: str = None
        for region in regions:
            lines = ocr_region(img, region)
            for line in lines:
                raw = line.strip().replace('|', '')
                if not raw:
                    if current:
                        entries.append(current)
                        current = None
                    continue
                if raw.startswith('"'):
                    if current:
                        entries.append(current)
                    current = raw
                else:
                    if current:
                        current += ' ' + raw
        if current:
            entries.append(current)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_f:
        for ent in entries:
            out_f.write(ent + "\n")

    print(f"All OCR entries saved to: {OUTPUT_FILE}")

if __name__ == '__main__':
    main()


All OCR entries saved to: /Users/darshilshukla/Desktop/text_another_one.txt


In [18]:
#!/usr/bin/env python3
"""
Parser: Text file → structured JSON (paths in code) with auto-install of spaCy model

Reads a text file where each line starts with '"' and represents a directory entry.
Extracts:
  - FirstName, LastName
  - Spouse (in parentheses)
  - Occupation (keyword lookup + NER fallback)
  - CompanyName (remaining ORG entities)
  - HomeAddress (number, street, apt/unit)
  - DirectoryName (fixed)
  - PageNumber (fixed or None)

Outputs combined JSON file.

Usage:
  pip install spacy
  python text_to_json.py
"""
import re
import json
from pathlib import Path
import subprocess
import sys

# ——— CONFIGURATION: set file paths here ———
INPUT_FILE = Path("/Users/darshilshukla/Desktop/text_another_one.txt")
OUTPUT_FILE = Path("/Users/darshilshukla/Desktop/combined_lastone.json")

DIRECTORY_NAME = "Minneapolis 1900"
PAGE_NUMBER = None  # or set to a specific page if known
OCCUP_KEYWORDS = [
    "salesman","merchant","clerk","engineer","teacher",
    "laborer","driver","barber","baker","physician",
    "carpenter","nurse","pntr","meat ctr"
]
ADDR_RE = re.compile(
    r"(?P<number>\d+)\s+"
    r"(?P<street>[A-Za-z0-9\.\s]+?)"
    r"(?:\s*(?P<apt>apt\s*\d+))?$",
    re.IGNORECASE
)
SPOUSE_RE = re.compile(r"\(([^)]+)\)")

# Ensure spaCy and model installed
try:
    import spacy
    from spacy.cli import download as spacy_download
    nlp = spacy.load("en_core_web_sm")
except (ImportError, OSError):
    # Install spaCy if missing
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"] )
    except subprocess.CalledProcessError:
        pass
    # Download model
    try:
        subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"] )
    except subprocess.CalledProcessError:
        pass
    import spacy
    nlp = spacy.load("en_core_web_sm")


def parse_line(line: str) -> dict:
    text = line.lstrip('"').strip()
    doc = nlp(text)
    # FirstName / LastName
    first = last = None
    persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    if persons:
        parts = persons[0].split()
        first, last = parts[0], parts[-1] if len(parts)>1 else None
    else:
        toks = text.split()
        first = toks[0] if toks else None
        last = toks[1] if len(toks)>1 else None
    # Spouse
    spouse = None
    m_sp = SPOUSE_RE.search(text)
    if m_sp:
        spouse = m_sp.group(1)
    # Address
    number = street = apt = None
    m_addr = ADDR_RE.search(text)
    if m_addr:
        number = m_addr.group('number')
        street = m_addr.group('street').strip()
        apt = m_addr.group('apt')
    # Occupation
    occupation = None
    for kw in OCCUP_KEYWORDS:
        if re.search(rf"\b{kw}\b", text, re.IGNORECASE):
            occupation = kw.title()
            break
    # CompanyName from ORG
    orgs = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
    company = orgs[-1] if orgs else None

    return {
        "FirstName": first,
        "LastName": last,
        "Spouse": spouse,
        "Occupation": occupation,
        "CompanyName": company,
        "HomeAddress": {
            "StreetNumber": number,
            "StreetName": street,
            "ApartmentOrUnit": apt,
            "ResidenceIndicator": "h"
        },
        "WorkAddress": None,
        "Telephone": None,
        "DirectoryName": DIRECTORY_NAME,
        "PageNumber": PAGE_NUMBER
    }


def main():
    if not INPUT_FILE.exists():
        print(f"Input file not found: {INPUT_FILE}")
        sys.exit(1)
    lines = INPUT_FILE.read_text(encoding='utf-8').splitlines()
    records = []
    for line in lines:
        if line.strip().startswith('"'):
            rec = parse_line(line)
            records.append(rec)
    OUTPUT_FILE.write_text(json.dumps(records, indent=2), encoding='utf-8')
    print(f"✅ Parsed {len(records)} entries to {OUTPUT_FILE}")

if __name__ == '__main__':
    main()


Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.3.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Using cached numpy-2.3.1-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.3.1 which is incompatible.
scipy 1.13.1 requires numpy<2.3,>=1.22.4, but you have numpy 2.3.1 which is incompatible.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.3.1 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.1 which is incompatible.[0m[31m
[0m

Successfully installed numpy-2.3.1
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
✅ Parsed 248 entries to /Users/darshilshukla/Desktop/combined_lastone.json
