In [22]:
#!pip install --upgrade pymupdf
#!pip install --upgrade pytesseract

In [23]:
import shutil

# Input PDF and where the results are written.
pdf_path = "./sample.pdf"
output_dir = "./output"
output_filename = "result"

# Prefer a Tesseract executable found on PATH so the notebook is not tied to
# one machine; otherwise fall back to the default Windows installation path.
tesseract_path = shutil.which("tesseract") or "C:/Program Files/Tesseract-OCR/tesseract.exe"

In [24]:
# Create output directories
from pathlib import Path
# Intermediate artifacts go into a "debug" folder inside the output folder.
debug_dir = f"{output_dir}/debug"

# parents=True also creates the output folder itself when it is missing;
# exist_ok=True makes re-running this cell a no-op.
Path(debug_dir).mkdir(parents=True, exist_ok=True)

### Convert pdf to image
PDFs can have incorrect text encoding or contain images within them, so it is more reliable to
convert the PDF pages to images and then perform OCR to extract the text.

We convert only the first page to an image since we assume that it's the title page.

In [25]:
import fitz
from PIL import Image
from io import BytesIO

dpi = 300
# PyMuPDF renders at 72 dpi by default, so scale by dpi / 72 to get the
# requested resolution.
zoom_factor = dpi / 72

# Open the document in a context manager so the file handle is released even
# if rendering fails.
with fitz.open(pdf_path) as doc:
    if doc.page_count == 0:
        raise ValueError(f"{pdf_path} contains no pages")

    # Read first page
    page = doc[0]

    # Convert to image; the PNG bytes are extracted while the document is
    # still open because the page is unusable after the document closes.
    mat = fitz.Matrix(zoom_factor, zoom_factor)
    pix = page.get_pixmap(matrix=mat)
    image_data = pix.tobytes("png")

# Decode the PNG bytes into a PIL image and keep a copy on disk for debugging.
image = Image.open(BytesIO(image_data))
image.save(f"{debug_dir}/first_page.png", format="png")

### Prepare OCR

Tesseract must be installed before running this notebook. Windows installers are available at https://github.com/UB-Mannheim/tesseract/wiki

In [27]:
import pytesseract

# Tell pytesseract which Tesseract executable to launch.
pytesseract.pytesseract.tesseract_cmd = tesseract_path

# Run OCR on the rendered page and request the result as a dict
# (pytesseract.Output.DICT) instead of the default output type.
ocr_result = pytesseract.image_to_data(
    image,
    output_type=pytesseract.Output.DICT,
)

In [None]:
def get_all_strings(json_obj, strings=None):
    """Collect every string value found in a nested JSON-like object.

    Dicts are traversed through their values and lists through their items,
    depth first and in iteration order. Values that are not dict, list or str
    (numbers, booleans, None, ...) are ignored.

    Args:
        json_obj: The object to walk (dict, list, str or any other value).
        strings: Optional list to append the found strings to. When omitted,
            a new list is created for this call.

    Returns:
        The list holding the collected strings; this is the same object as
        ``strings`` when one was passed in.
    """
    # A mutable default ([]) would be shared between calls and make results
    # accumulate every time the function is called without an accumulator.
    if strings is None:
        strings = []

    if isinstance(json_obj, dict):
        for value in json_obj.values():
            get_all_strings(value, strings)
    elif isinstance(json_obj, list):
        for item in json_obj:
            get_all_strings(item, strings)
    elif isinstance(json_obj, str):
        strings.append(json_obj)
    return strings

def get_lowest_level_keys(json_doc):
    """Return the key associated with every leaf value of a JSON-like object.

    A leaf is any value that is neither a dict nor a list. A leaf stored
    directly in a dict contributes its own key. A leaf stored in a list
    contributes the key under which that list (or its enclosing lists) was
    stored, once per leaf; list leaves without such a key, or with a falsy
    one, contribute nothing. Keys are returned in depth-first traversal order.
    """
    def leaf_keys(node, inherited_key=None):
        # Normalise both container kinds to (key, child) pairs: dict children
        # carry their own key, list children inherit the key of the list.
        if isinstance(node, dict):
            in_list = False
            pairs = node.items()
        elif isinstance(node, list):
            in_list = True
            pairs = ((inherited_key, element) for element in node)
        else:
            return

        for key, child in pairs:
            if isinstance(child, (dict, list)):
                yield from leaf_keys(child, key)
            elif not in_list or key:
                # Dict leaves always report their key; list leaves only when
                # the inherited key is truthy.
                yield key

    return list(leaf_keys(json_doc))

In [None]:
import json
from PIL import ImageDraw

# Load the extracted features. JSON is read as UTF-8 explicitly so the result
# does not depend on the platform's default encoding.
with open("./gpt_output.json", encoding="utf-8") as file:
    gpt_json_dict = json.load(file)

labeled_image = image.copy()
draw = ImageDraw.Draw(labeled_image)

# get_all_strings appends into the list it is given, so hand it a fresh one
# on every run of this cell.
word_list = [value.split() for value in get_all_strings(gpt_json_dict, [])]
# NOTE(review): feature_list[j] is used as the label for word_list[j]; this
# assumes both helpers visit the same leaves in the same order, which only
# holds when every leaf of the JSON is a string stored under a key - confirm.
feature_list = get_lowest_level_keys(gpt_json_dict)

output_json = {}
# For every phrase, the OCR indices of the words matched so far.
matched_indices = [[] for _ in word_list]

# Walk the OCR words once and advance each phrase's partial match.
for i, ocr_word in enumerate(ocr_result["text"]):
    # Skip entries without text (they carry no word to compare against).
    if not ocr_word.strip():
        continue
    ocr_word_lower = ocr_word.lower()

    for j, match_word_arr in enumerate(word_list):
        if not match_word_arr:
            continue

        progress = len(matched_indices[j])
        if match_word_arr[progress].lower() not in ocr_word_lower:
            # The partial match is broken. Do not throw the current OCR word
            # away: it may itself be the first word of the phrase.
            matched_indices[j] = []
            if progress == 0 or match_word_arr[0].lower() not in ocr_word_lower:
                continue

        matched_indices[j].append(i)
        if len(matched_indices[j]) < len(match_word_arr):
            continue

        # Whole phrase matched: the bounding box is the union of the boxes of
        # the matched words only. Slicing the OCR arrays between the first and
        # last index would also pull in the empty-text layout entries that sit
        # between words, and summing widths ignores the gaps between words.
        indices = matched_indices[j]
        left = min(ocr_result["left"][k] for k in indices)
        top = min(ocr_result["top"][k] for k in indices)
        right = max(ocr_result["left"][k] + ocr_result["width"][k] for k in indices)
        bottom = max(ocr_result["top"][k] + ocr_result["height"][k] for k in indices)

        bbox = (left, top, right, bottom)
        draw.rectangle(bbox, outline="green", width=2)

        output_json.setdefault(feature_list[j], []).append({"text": " ".join(match_word_arr), "bounding_box": bbox})

        # Start looking for the next occurrence of this phrase.
        matched_indices[j] = []

output_path = f"{output_dir}/{output_filename}"

with open(f"{output_path}.json", 'w') as json_file:
    json.dump(output_json, json_file)

labeled_image.save(f"{output_path}.png")

print(f"Extracted features JSON saved to: {output_path}.json")
print(f"Annotated image saved to: {output_path}.png")