In [1]:
#!pip install --upgrade pymupdf
#!pip install --upgrade pytesseract
#!pip install --upgrade spacy

In [2]:
import shutil

# Configuration read by the cells further down.

# PDF whose first page is rendered and analysed.
pdf_path = "./sample.pdf"
# Folder that receives the result JSON, the annotated image and the debug images.
output_dir = "./output"
# Name of the spaCy pipeline used for named-entity recognition.
spacy_model = "en_core_web_md"
# Tesseract executable: prefer whatever is on PATH so the notebook is not tied
# to one machine, and fall back to the default Windows installation path.
tesseract_path = shutil.which("tesseract") or "C:/Program Files/Tesseract-OCR/tesseract.exe"

In [3]:
# Ensure the debug folder exists; parents=True also creates the output
# folder above it, and exist_ok=True makes re-running this cell harmless.
from pathlib import Path

debug_dir = f"{output_dir}/debug"
Path(debug_dir).mkdir(parents=True, exist_ok=True)

### Convert pdf to image
PDFs can have incorrect text encoding or contain text embedded as images, so it is more reliable to
convert the PDF pages to images and then perform OCR to extract the text.

We convert only the first page to an image since we assume that it's the title page.

In [4]:
# Render the first page of the PDF to a PIL image and keep a copy on disk
# for debugging. `doc` and `image` are left in the notebook namespace for the
# later cells (`doc` is only closed at the end of the entity-extraction cell).
import fitz
from PIL import Image
from io import BytesIO

# Target rendering resolution. The zoom is expressed relative to 72
# (presumably the PDF default of 72 points per inch — confirm against the
# PyMuPDF documentation).
dpi = 300
zoom_factor = dpi / 72

# Read first page
doc = fitz.open(pdf_path)
page = doc[0]

# Convert to image
# The same zoom factor is used on both axes, so the aspect ratio is preserved.
mat = fitz.Matrix(zoom_factor, zoom_factor)
pix = page.get_pixmap(matrix=mat)
# Round-trip through in-memory PNG bytes to hand the rendered page to PIL.
image_data = pix.tobytes("png")
image = Image.open(BytesIO(image_data))
# Debug copy of the untouched render.
image.save(f"{debug_dir}/first_page.png", format="png")

### Prepare NLP model

In [5]:
import importlib

import spacy

# Load the spaCy pipeline named by `spacy_model`, downloading it on first use.
try:
    nlp = spacy.load(spacy_model)
except OSError as e:
    # Only a missing model (spaCy error E050) is recoverable here. Any other
    # failure is re-raised instead of being swallowed, which would leave `nlp`
    # undefined and surface later as a confusing NameError.
    if not str(e).startswith("[E050] Can't find model"):
        raise
    # Download through spaCy's own API so the model is installed for the
    # interpreter running this kernel, not whichever `python` is first on PATH.
    spacy.cli.download(spacy_model)
    # Make sure the freshly installed package is visible to the import system.
    importlib.invalidate_caches()
    nlp = spacy.load(spacy_model)

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
                                              0.0/42.8 MB ? eta -:--:--
                                              0.0/42.8 MB ? eta -:--:--
                                             0.0/42.8 MB 262.6 kB/s eta 0:02:43
                                             0.0/42.8 MB 245.8 kB/s eta 0:02:54
                                             0.1/42.8 MB 357.2 kB/s eta 0:02:00
                                             0.1/42.8 MB 467.6 kB/s eta 0:01:32
                                             0.1/42.8 MB 500.5 kB/s eta 0:01:26
                                             0.2/42.8 MB 523.5 kB/s eta 0:01:22
                                             0.2/42.8 MB 535.8 kB/s eta 0:01:20
                                             0.2/42.8 MB 550.0 kB/s eta 0:01:18
                                        

### Prepare OCR

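Tesseract must be installed separately. Windows installers are available from the UB Mannheim wiki: https://github.com/UB-Mannheim/tesseract/wiki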

In [6]:
import pytesseract

# Tell pytesseract which Tesseract executable to launch.
pytesseract.pytesseract.tesseract_cmd = tesseract_path

# OCR the rendered page; Output.DICT asks for the result as a dict.
ocr_result = pytesseract.image_to_data(
    image,
    output_type=pytesseract.Output.DICT,
)

In [7]:
from PIL import ImageDraw
import json

result_filename = "result"

# spaCy entity label -> (key in the output JSON, outline colour of the drawn box).
# Add further labels here to extract other kinds of entities.
entity_styles = {
    "PERSON": ("authors", "blue"),
    "ORG": ("authors", "blue"),
    "DATE": ("dates", "red"),
}


def _group_words_by_line(ocr):
    """Group the non-empty OCR words into text lines, keeping their order.

    Args:
        ocr: dict of parallel lists (the `ocr_result` of the OCR cell). The
            keys "text", "page_num", "block_num", "par_num", "line_num",
            "left", "top", "width" and "height" are read.

    Returns:
        A list of lines; every line is a list of
        ``(word, (x0, y0, x1, y1))`` tuples.
    """
    lines = {}
    for i, word in enumerate(ocr["text"]):
        if not word.strip():  # Skip empty strings
            continue
        line_key = (
            ocr["page_num"][i],
            ocr["block_num"][i],
            ocr["par_num"][i],
            ocr["line_num"][i],
        )
        x, y = ocr["left"][i], ocr["top"][i]
        box = (x, y, x + ocr["width"][i], y + ocr["height"][i])
        lines.setdefault(line_key, []).append((word, box))
    # dicts keep insertion order, so lines come out in OCR order.
    return list(lines.values())


def _entity_boxes(line_words, nlp_model):
    """Yield ``(entity, bbox)`` for every entity found in one OCR line.

    The words are joined with single spaces and passed to `nlp_model` as one
    text, so entities may span several words. `bbox` is the union of the boxes
    of all words that overlap the entity's character range.

    Args:
        line_words: list of ``(word, (x0, y0, x1, y1))`` tuples for one line.
        nlp_model: callable returning an object with an ``ents`` sequence
            whose items expose ``start_char`` and ``end_char``.
    """
    # Character range of every word inside the joined line text.
    spans = []
    cursor = 0
    for word, box in line_words:
        spans.append((cursor, cursor + len(word), box))
        cursor += len(word) + 1  # +1 for the joining space
    line_text = " ".join(word for word, _ in line_words)

    for ent in nlp_model(line_text).ents:
        boxes = [
            box
            for start, end, box in spans
            if start < ent.end_char and end > ent.start_char
        ]
        if not boxes:
            continue
        yield ent, (
            min(b[0] for b in boxes),
            min(b[1] for b in boxes),
            max(b[2] for b in boxes),
            max(b[3] for b in boxes),
        )


# Draw on a copy so the rendered page in `image` stays clean and re-running
# this cell does not stack rectangles from earlier runs.
annotated = image.copy()
draw = ImageDraw.Draw(annotated)

# Run NER line by line rather than word by word: single words give spaCy no
# context, and multi-word names or dates could never be found as one entity.
features = {}
for line_words in _group_words_by_line(ocr_result):
    for ent, bbox in _entity_boxes(line_words, nlp):
        style = entity_styles.get(ent.label_)
        if style is None:
            continue  # Entity type we are not interested in
        feature_key, colour = style
        features.setdefault(feature_key, []).append({"text": ent.text, "bounding_box": bbox})
        draw.rectangle(bbox, outline=colour, width=2)

output_path = f"{output_dir}/{result_filename}"

# Save the features and their bounding boxes to a JSON file
with open(f"{output_path}.json", "w", encoding="utf-8") as json_file:
    json.dump(features, json_file)

# Save the annotated image
annotated.save(f"{output_path}.png")

print(f"Extracted features JSON saved to: {output_path}.json")
print(f"Annotated image saved to: {output_path}.png")

# Close the document (guarded so that re-running this cell does not fail on
# an already closed document)
if not doc.is_closed:
    doc.close()

NameError: name 'nlp' is not defined

In [None]:
# import fitz  # PyMuPDF
# import pytesseract
# from PIL import Image, ImageDraw
# import io
# import json

# # Load a spaCy model for Named Entity Recognition
# import spacy
# nlp = spacy.load("en_core_web_sm")

# # Define the path to the PDF
# pdf_path = './sample.pdf'

# # Open the PDF with PyMuPDF
# doc = fitz.open(pdf_path)
# page = doc[0]

# # Set the zoom factor for rendering the page
# zoom_factor = 2  # Increase this factor for a higher resolution image
# mat = fitz.Matrix(zoom_factor, zoom_factor)

# # Convert the PDF page to a high-resolution image
# pix = page.get_pixmap(matrix=mat)
# image_bytes = pix.tobytes("png")
# image = Image.open(io.BytesIO(image_bytes))

# # Use PyTesseract to do OCR on the image
# ocr_result = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

# # Prepare to draw rectangles
# draw = ImageDraw.Draw(image)

# # Function to calculate the bounding box for an entity
# def calculate_entity_bbox(entity, words):
#     # Find the words that make up the entity
#     entity_words = [word for word in words if entity.start_char >= word['start_char'] and entity.end_char <= word['end_char']]
#     if not entity_words:
#         return None

#     # Calculate the bounding box for the entity
#     x_min = min(word['bbox'][0] for word in entity_words)
#     y_min = min(word['bbox'][1] for word in entity_words)
#     x_max = max(word['bbox'][2] for word in entity_words)
#     y_max = max(word['bbox'][3] for word in entity_words)
#     return (x_min, y_min, x_max, y_max)

# # Extract words along with their bounding boxes
# words = []
# features = {}
# current_block_num = -1
# current_line_num = -1
# current_block_text = ""
# current_block_words = []
# for i, text in enumerate(ocr_result['text']):
#     if text.strip():  # Non-empty string
#         if ocr_result['block_num'][i] != current_block_num or ocr_result['line_num'][i] != current_line_num:
#             # New block or line
#             if current_block_words:
#                 # Process the previous block of text
#                 doc = nlp(current_block_text)
#                 for entity in doc.ents:
#                     bbox = calculate_entity_bbox(entity, current_block_words)
#                     if bbox:
#                         draw.rectangle(bbox, outline="green", width=2)
#                         features.setdefault(entity.label_, []).append({'text': entity.text, 'bounding_box': bbox})
#             # Reset for the new block or line
#             current_block_num = ocr_result['block_num'][i]
#             current_line_num = ocr_result['line_num'][i]
#             current_block_text = ""
#             current_block_words = []

#         # Append the word to the current block's text
#         current_block_text += " " + text if current_block_text else text
#         # Store the word's bounding box and its position in the block's text
#         word_bbox = (ocr_result['left'][i], ocr_result['top'][i], ocr_result['width'][i] + ocr_result['left'][i], ocr_result['height'][i] + ocr_result['top'][i])
#         current_block_words.append({'text': text, 'bbox': word_bbox, 'start_char': len(current_block_text) - len(text), 'end_char': len(current_block_text)})

# # Don't forget to process the last block of text
# if current_block_words:
#     doc = nlp(current_block_text)
#     for entity in doc.ents:
#         bbox = calculate_entity_bbox(entity, current_block_words)
#         if bbox:
#             draw.rectangle(bbox, outline="green", width=2)
#             features.setdefault(entity.label_, []).append({'text': entity.text, 'bounding_box': bbox})

# # Save the annotated image
# annotated_image_path = './output/result.png'
# image.save(annotated_image_path)

# # Save the extracted features to a JSON file
# json_path = './output/result.json'
# with open(json_path, 'w') as json_file:
#     json.dump(features, json_file)

# # Print out the paths to the JSON and image files
# print(f"Extracted features JSON saved to: {json_path}")
# print(f"Annotated image saved to: {annotated_image_path}")

# # Close the document
# doc.close()