In [3]:
!pip install PyMuPDF Pillow opencv-python

Collecting opencv-python
  Downloading opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (62.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.2/62.2 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.9.0.80


In [5]:
import os
import base64
import csv
import fitz
import io
import requests
from PIL import Image, ImageDraw
import cv2
import numpy as np
import re

In [6]:
# Function to encode image to base64
def encode_image(image_bytes):
    return base64.b64encode(image_bytes).decode()

# Function to call Google Vision API
def call_vision_api(image_bytes):
    api_key = 'AIzaSyCtiICfcEwJ9YiRnLx7xba714lIlOHPRwQ'  # Replace 'YOUR_API_KEY' with your actual API key
    url = f"https://vision.googleapis.com/v1/images:annotate?key={api_key}"
    headers = {'Content-Type': 'application/json'}
    encoded_image = encode_image(image_bytes)
    body = {
        "requests": [{
            "image": {
                "content": encoded_image
            },
            "features": [{
                "type": "TEXT_DETECTION"
            }]
        }]
    }
    response = requests.post(url, headers=headers, json=body)
    return response.json()

In [7]:
# Function to write data to CSV
def write_to_csv(data, output_csv_filename):
    with open(output_csv_filename, 'w', newline='') as csvfile:
        fieldnames = ['text', 'vertices', 'page']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for item in data:
            writer.writerow({'text': item['text'], 'vertices': item['vertices'], 'page': item['page']})

In [8]:
# Function to extract data from PDF using OCR
def extract_data_from_pdf(pdf_path, output_csv_filename):
    extracted_data = []

    pdf_document = fitz.open(pdf_path)

    for page_number in range(pdf_document.page_count):
        page = pdf_document[page_number]
        pix = page.get_pixmap()

        pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        with io.BytesIO() as output:
            pil_image.save(output, format="JPEG")
            image_bytes = output.getvalue()

        api_response = call_vision_api(image_bytes)
        if 'responses' in api_response:
            for resp in api_response['responses']:
                if 'textAnnotations' in resp:
                    annotations = resp['textAnnotations'][1:]  # Skip the first annotation
                    for annotation in annotations:
                        text = annotation['description']
                        vertices = annotation.get('boundingPoly', {}).get('vertices', [])
                        extracted_data.append({'text': text, 'vertices': vertices, 'page': page_number + 1})
                        print(extracted_data)

    write_to_csv(extracted_data, output_csv_filename)
    return extracted_data

In [9]:
def find_word_coordinates(words, coordinates):
    word_coordinates = []
    # Split words on the basis of space, comma, or period if it's a string
    if isinstance(words, str):
        words = re.split(r'[ ,.]', words)
    for word in words:
        for coord in coordinates:
            if coord["text"].lower() == word.lower():
                word_coordinates.append({
                    "text": coord["text"],
                    "vertices": coord["vertices"],
                    "page": coord["page"]
                })
    return word_coordinates

In [10]:
def draw_bounding_box(pdf_path, search_word):
    """Draw bounding box around the specified word in the PDF."""
    output_csv_filename = os.path.splitext(os.path.basename(pdf_path))[0] + "_coordinates.csv"
    print(f"Extracting data from PDF and storing in '{output_csv_filename}'...")
    extracted_data = extract_data_from_pdf(pdf_path, output_csv_filename)
    print("Data extraction complete.")
    
    print("Finding coordinates of the search word...")
    word_coordinates = find_word_coordinates(search_word, extracted_data)
    if word_coordinates:
         print("Coordinates found.")
    else:
        print("Coordinates not found.")
    
    pdf_document = fitz.open(pdf_path)
    for page_number in range(pdf_document.page_count):
        page = pdf_document[page_number]
        pix = page.get_pixmap()
        pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        with io.BytesIO() as output:
            pil_image.save(output, format="JPEG")
            image_bytes = output.getvalue()
        image_base64 = encode_image(image_bytes)
        image = base64.b64decode(image_base64)
        pil_image = Image.open(io.BytesIO(image))
        draw = ImageDraw.Draw(pil_image)
        for coord in word_coordinates:
            if isinstance(coord, dict) and coord['page'] == page_number + 1:
                vertices = coord['vertices']
                for i in range(4):
                    draw.line([(vertices[i]['x'], vertices[i]['y']), (vertices[(i + 1) % 4]['x'], vertices[(i + 1) % 4]['y'])], fill=(255, 0, 0), width=2)
        output_image_path = f"output_page_{page_number + 1}.jpg"
        if any(isinstance(coord, dict) and coord['page'] == page_number + 1 for coord in word_coordinates):
            print(f"Saving bounding box on page {page_number + 1}...")
            pil_image.save(output_image_path)
            print(f"Bounding box saved as '{output_image_path}'.")
    pdf_document.close()

In [11]:
pdf_path = "test.pdf"
search_word = "abbott, james"
draw_bounding_box(pdf_path, search_word)

Extracting data from PDF and storing in 'test_coordinates.csv'...


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[{'text': 'The', 'vertices': [{'x': 106, 'y': 163}, {'x': 145, 'y': 163}, {'x': 145, 'y': 181}, {'x': 106, 'y': 181}], 'page': 1}, {'text': 'Men', 'vertices': [{'x': 154, 'y': 163}, {'x': 201, 'y': 163}, {'x': 201, 'y': 181}, {'x': 154, 'y': 181}], 'page': 1}, {'text': 'of', 'vertices': [{'x': 212, 'y': 163}, {'x': 230, 'y': 163}, {'x': 230, 'y': 181}, {'x': 212, 'y': 181}], 'page': 1}, {'text': 'Greenock', 'vertices': [{'x': 239, 'y': 163}, {'x': 320, 'y': 163}, {'x': 320, 'y': 181}, {'x': 239, 'y': 181}], 'page': 1}, {'text': 'who', 'vertices': [{'x': 329, 'y': 163}, {'x': 362, 'y': 163}, {'x': 362, 'y': 181}, {'x': 329, 'y': 181}], 'page': 1}, {'text': 'fell', 'vertices': [{'x': 370, 'y': 164}, {'x': 401, 'y': 164}, {'x': 401, 'y': 182}, {'x': 370, 'y': 182}], 'page': 1}, {'text': 'in', 'vertices': [{'x': 410, 'y': 164}, {'x': 428, 'y': 164}, {'x': 428, 'y': 182}, {'x': 410, 'y': 182}], 'page': 1}, {'text': 'the', 'vertices': [{'x': 437, 'y': 164}, {'x': 463, 'y': 164}, {'x': 463, '

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Saving bounding box on page 2...
Bounding box saved as 'output_page_2.jpg'.
