In [13]:
import os
import glob
from pathlib import Path
from scripts.extract import process_img, get_file_id
from scripts.mapIndex import bulk_index

In [None]:
# Extract from image: run every *.TIF page of one book through process_img and
# write the results to output_dir. Pages whose <file_id>.json is already present
# in output_dir are skipped, so the cell can be re-run after an interruption.
model = "claude-3-7-sonnet-20250219" # or replace with other model
api_key = os.getenv("API_KEY") # or replace with a str api_key (os.getenv returns None when the variable is unset)

book_number = 1
input_dir = Path(f"/projectnb/sparkgrp/mass-sec-state-deeds-data/1720-1780/{book_number}") # replace with path to your image folder
output_dir = f"/projectnb/sparkgrp/dd4g-reg-of-deeds/Carrie/Storage/{book_number}" # replace with path to your output folder
os.makedirs(output_dir, exist_ok=True)

# Collect the ids of already processed files to avoid reprocessing.
# Path.stem drops only the trailing ".json"; str.replace would also remove a
# ".json" occurring earlier in the file name.
# NOTE(review): assumes get_file_id() returns the same id that names the JSON file -- confirm.
processed_files = {Path(json_path).stem for json_path in glob.glob(f"{output_dir}/*.json")}
print(f"Found {len(processed_files)} already processed files")

# Extract the information from images. Sorting makes the processing order
# (and therefore the log output) reproducible between runs.
failed_files = []
for tif_path in sorted(input_dir.glob('*.TIF')):
    try:
        file_id = get_file_id(tif_path)
        if file_id in processed_files:
            continue

        process_img(tif_path, output_dir, model, api_key)
        processed_files.add(file_id)  # Add to processed set
    except Exception as e:
        # Best effort: one bad page must not stop the rest of the book,
        # but remember it so the failures are visible in the final summary.
        failed_files.append(tif_path.name)
        print(f"Error processing {tif_path.name}: {e}")

if failed_files:
    print(f"{len(failed_files)} file(s) failed: {', '.join(failed_files)}")
print("Done.")

In [None]:
# sample output from book 1, first deed:
# Uses the same positional order as the batch-extraction call
# (image path, output dir, model, api_key); `model` and `api_key` come from the
# extraction cell. Omitting `model` would pass the API key in the model position.
process_img(Path('../dataset/sample-images/000001-0001.TIF'), 'sample-output/', model, api_key)

JSON saved in: sample-output/000001-0001.json


In [None]:
# bulk index from csv file into elasticsearch cloud
cloud_id = os.getenv("CLOUD_ID")
# Kept under its own name so it does not overwrite `api_key` (the model API key
# used by the extraction cells) when cells are re-run out of order.
elastic_api_key = os.getenv("ELASTIC_API_KEY")
index_name = "land_deeds"
csv_path = "../dataset/sample-output/land_deeds_data.csv"

bulk_index(cloud_id, elastic_api_key, index_name, csv_path)

Connected to Elasticsearch successfully!
Index 'land_deeds' already exists.
Data indexed successfully!


In [None]:
# run the demo search
%cd scripts
!streamlit run app.py
# demo at 'https://huggingface.co/spaces/carrief0908/LandDeeds'