In [None]:
import requests
import re
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image

OCR setup - Install dependencies

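pip install requests numpy pytesseract pillow opencv-python pandas

Note: pytesseract is only a Python wrapper; the Tesseract OCR engine itself must also be installed and available on the PATH.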

Get image URLs from the NARA API

https://catalog.archives.gov/api/v1?naIds=110154899&objects=true

Configuration

In [None]:
# National Archives Identifier (NAID) of the catalog record to fetch.
NAID = "110154899"

# Endpoint of the National Archives Catalog API that is queried for the record.
API_URL = "https://catalog.archives.gov/api/v1"

# Folder that receives the downloaded page scans.
# It is created if missing and reused as-is when it already exists.
IMG_DIR = Path("images")
IMG_DIR.mkdir(exist_ok=True)

# Name of the CSV file the OCR results are written to.
CSV_OUTPUT = "census_cards.csv"

In [None]:
# Output schema. Every scanned image holds two pages, and each page contributes
# the same six fields; the suffix (_1 / _2) tells the pages apart.
HEADERS = [
    # first page
    "No_1",
    "Name_1",
    "Age_1",
    "Sex_1",
    "Blood_1",
    "CensusCardNo_1",
    # second page
    "No_2",
    "Name_2",
    "Age_2",
    "Sex_2",
    "Blood_2",
    "CensusCardNo_2",
]

# Positions (0-based, into HEADERS) of the two Blood columns.
BLOOD_COL_INDICES = [4, 10]

# Share of a single page's width taken by each of its six columns,
# in the same order as the fields above; the shares should add up to ~1.0.
COL_FRACS = [0.07, 0.33, 0.07, 0.07, 0.10, 0.36]

Fetch image URLs from the API response

In [None]:
def _as_list(value):
    # Normalize a JSON child to a list: None -> [], list -> itself, anything else -> [value].
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def get_jpg_urls(timeout=30):
    """
    Query the catalog API for NAID and collect the URLs of its JPEG files.

    Args:
        timeout: Seconds to wait for the API before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of URL strings, in the order they appear in the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        KeyError: If the response lacks the expected
            opaResponse -> results -> result structure.
    """
    params = {"naIds": NAID, "objects": "true"}
    response = requests.get(API_URL, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    urls = []

    # Walk result -> objects.object -> file. Each level is passed through
    # _as_list so that a lone mapping (instead of a one-element list) is
    # handled the same way as a list; iterating a bare dict would yield its
    # keys and break the .get() calls below.
    results = data["opaResponse"]["results"]["result"]
    for result in _as_list(results):
        objects = (result.get("objects") or {}).get("object")
        for obj in _as_list(objects):
            for file_info in _as_list(obj.get("file")):
                mime = (file_info.get("@mime") or "").lower()
                url = file_info.get("@url")
                if mime == "image/jpeg" and url:
                    urls.append(url)

    return urls

Download each image

In [None]:
def download_image(url, out_path, timeout=60):
    """
    Download a single image and save it to disk.

    Args:
        url: Address of the image to fetch.
        out_path: pathlib.Path of the destination file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Write to a sibling temp file and rename it into place afterwards.
    # main() skips any page whose file already exists and OCRs everything
    # matching "page_*.jpg", so an interrupted write directly to out_path
    # would leave a truncated image that is never re-downloaded.
    tmp_path = out_path.with_name(out_path.name + ".part")
    tmp_path.write_bytes(response.content)
    tmp_path.replace(out_path)

Preprocess + OCR

In [None]:
def preprocess(img, threshold=150):
    """
    Convert a PIL image to a black-and-white image to improve OCR accuracy.

    Args:
        img: PIL image in any mode (RGB, RGBA, L, ...).
        threshold: Gray level cut-off; pixels brighter than this become
            white (255), all others black (0).

    Returns:
        A single-channel PIL image containing only the values 0 and 255.
    """
    # PIL stores channels in RGB order, so the matching OpenCV conversion is
    # RGB2GRAY (BGR2GRAY would swap the red and blue weights). Converting to
    # "RGB" first also guarantees a 3-channel array for grayscale, palette
    # and RGBA inputs, which cvtColor would otherwise reject.
    rgb = np.array(img.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
    return Image.fromarray(thresh)

Split image into columns

In [None]:
def split_columns(img, col_fracs=None):
    """
    Split a two-page image into vertical column strips.

    The image is cut in half at its horizontal midpoint (one page per half)
    and every page is divided into columns whose widths follow col_fracs.

    Args:
        img: Image-like object exposing .size -> (width, height) and
            .crop((left, upper, right, lower)).
        col_fracs: Relative column widths for one page. Defaults to the
            module-level COL_FRACS. The values are normalized by their sum.

    Returns:
        A list of cropped column images: all columns of the left page
        followed by all columns of the right page.

    Raises:
        ValueError: If the fractions do not sum to a positive value.
    """
    fracs = list(COL_FRACS if col_fracs is None else col_fracs)
    if not fracs:
        return []

    total = float(sum(fracs))
    if total <= 0:
        raise ValueError("column fractions must sum to a positive value")

    width, height = img.size
    mid = width // 2  # vertical divider line between pages

    columns = []
    for left, right in ((0, mid), (mid, width)):
        page_width = right - left

        # Derive every column edge from the cumulative fraction instead of
        # adding up individually truncated widths: truncating each width
        # shifts later columns to the left and leaves the last few pixels
        # of the page outside any column.
        edges = [left]
        running = 0.0
        for frac in fracs[:-1]:
            running += frac
            edges.append(left + int(round(page_width * running / total)))
        edges.append(right)  # the last column always ends at the page edge

        for x0, x1 in zip(edges, edges[1:]):
            columns.append(img.crop((x0, 0, x1, height)))

    return columns

OCR Functions

In [None]:
def ocr_general(img):
    """
    Run Tesseract on a column image that holds ordinary text
    (names, numbers, etc.) and return the recognized string.
    """
    # Page segmentation mode 6, with spacing between words kept as recognized.
    tesseract_config = "--psm 6 -c preserve_interword_spaces=1"
    return pytesseract.image_to_string(img, config=tesseract_config)

In [None]:
def ocr_blood(img):
    """
    Run Tesseract on a Blood column image and return the recognized string.

    Recognition is limited to the characters of "Full", the digits 0-9 and
    the slash, i.e. the only symbols a Blood entry is expected to contain.
    """
    tesseract_config = "--psm 6 -c tessedit_char_whitelist=Full0123456789/"
    return pytesseract.image_to_string(img, config=tesseract_config)

Clean up OCR text

In [None]:
def clean_text(text):
    """Collapse runs of consecutive newlines into one and trim both ends."""
    collapsed = re.sub(r"\n{2,}", "\n", text)
    return collapsed.strip()

In [None]:
def _normalize_blood_line(line):
    # Normalize one OCR line to 'Full', a fraction 'n/d', or '' if unrecognized.
    t = line.strip().lower()

    # Map characters that OCR commonly confuses with the digit 1.
    t = t.replace("i", "1").replace("l", "1").replace("|", "1")
    t = t.replace(" ", "")

    # The substitution above has already turned 'full' into 'fu11', so that is
    # the form to look for here; testing for 'full' at this point can never
    # match and would let the fraction regex turn the entry into '1/1'.
    if "fu11" in t:
        return "Full"

    # Fractions like 1/2, 3/4, 5/8; the slash is optional because OCR may drop it.
    m = re.search(r"(\d{1,2})/?(\d{1,2})", t)
    if m:
        return f"{m.group(1)}/{m.group(2)}"

    return ""


def clean_blood(text):
    """
    Normalize the OCR text of a Blood column.

    Each non-blank line is normalized on its own:
    - characters commonly misread for '1' (i, l, |) are corrected
    - 'Full' (in any case, including misreadings such as 'Fu11') -> 'Full'
    - a fraction n/d (slash optional) -> 'n/d'
    - anything else -> '' (kept as an empty line so later entries keep
      their position)

    Args:
        text: Raw OCR output; may contain one entry or one entry per line.

    Returns:
        The normalized entries joined with newlines, or '' when no line
        contains a recognizable value.
    """
    values = [
        _normalize_blood_line(line)
        for line in text.splitlines()
        if line.strip()
    ]
    if not any(values):
        return ""
    return "\n".join(values)

Process Image

In [None]:
def process_image(img_path):
    """
    Process one scanned image into a single row of column texts.

    Steps:
    - Split the image into columns
    - Preprocess and OCR each column
    - Apply column-specific cleanup (Blood columns vs. everything else)

    Args:
        img_path: Path of the image file to read.

    Returns:
        A list with one cleaned text string per column, in column order.
    """
    row = []

    # Open the file in a context manager so its handle is released after each
    # image instead of staying open while the remaining files are processed.
    # All work on the image happens inside the block, while the file is open.
    with Image.open(img_path) as img:
        columns = split_columns(img)

        for idx, col in enumerate(columns):
            pre = preprocess(col)

            if idx in BLOOD_COL_INDICES:
                txt = clean_blood(ocr_blood(pre))
            else:
                txt = clean_text(ocr_general(pre))

            row.append(txt)

    return row

MAIN

In [None]:
def main():
    """
    Run the whole pipeline: look up the image URLs, download any images not
    yet on disk, OCR every page image in IMG_DIR and write the rows to
    CSV_OUTPUT.
    """
    print("Fetching image URLs from API...")
    urls = get_jpg_urls()
    print(f"Found {len(urls)} images")

    print("Downloading images...")
    for i, url in enumerate(urls, start=1):
        target = IMG_DIR / f"page_{i:04}.jpg"
        if target.exists():
            # Already present from an earlier run; nothing to fetch.
            continue
        download_image(url, target)

    print("Running OCR on images...")
    ocr_rows = []
    page_files = sorted(IMG_DIR.glob("page_*.jpg"))
    for img_path in page_files:
        print(f"OCR {img_path.name}")
        ocr_rows.append(process_image(img_path))

    print("Writing CSV...")
    table = pd.DataFrame(ocr_rows, columns=HEADERS)
    table.to_csv(CSV_OUTPUT, index=False)

    print(f"Done. Output saved to {CSV_OUTPUT}")

Run Script

In [None]:
# Entry-point guard: start the pipeline only when this code runs as the main module.
if __name__ == "__main__":
    main()