In [1]:
pip install paddleocr

Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install paddlepaddle paddleocr opencv-python

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install protobuf==3.20.*

Collecting protobuf==3.20.*
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
   ---------------------------------------- 0.0/162.1 kB ? eta -:--:--
   ------- -------------------------------- 30.7/162.1 kB 1.3 MB/s eta 0:00:01
   -------------- ------------------------ 61.4/162.1 kB 648.1 kB/s eta 0:00:01
   ----------------------------------- ---- 143.4/162.1 kB 1.1 MB/s eta 0:00:01
   -------------------------------------- 162.1/162.1 kB 967.4 kB/s eta 0:00:00
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.4
    Uninstalling protobuf-4.25.4:
      Successfully uninstalled protobuf-4.25.4
Successfully installed protobuf-3.20.3
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.30.0 requires tenacity<9,>=8.1.0, which is not installed.
streamlit 1.30.0 requires watchdog>=2.1.5; platform_system != "Darwin", which is not installed.
grpcio-status 1.62.2 requires protobuf>=4.21.6, but you have protobuf 3.20.3 which is incompatible.
paddlepaddle 2.6.1 requires protobuf<=3.20.2,>=3.1.0; platform_system == "Windows", but you have protobuf 3.20.3 which is incompatible.
streamlit 1.30.0 requires packaging<24,>=16.8, but you have packaging 24.1 which is incompatible.
tensorflow-metadata 1.15.0 requires protobuf<5,>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.3 which is incompatible.


In [3]:
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"


In [1]:
import os
import cv2
import json
from paddleocr import PaddleOCR

def get_coords_and_text(image_path, pdf_name, output_json_path):
    """
    This function extracts text and coordinates from an image,
    stores it in JSON format, and draws bounding boxes on the image.
    Args:
        image_path: path to the image file e.g. "C:/Users/ASUS/Downloads/page_7.png"
        pdf_name: name of the PDF e.g. "filename.pdf"
        output_json_path: path to save the resulting JSON file
    Returns:
        NONE
    """
    try:
        list_of_pdfs_coor_txt = []
        pages_list = []

        # Initialize PaddleOCR
        ocr = PaddleOCR(
            use_angle_cls=True,  # Use text orientation classification
            lang='en',  # Language model to use
            use_gpu=False  # Ensure GPU is not used to avoid crashes
        )

        # Read image
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Unable to read image from path: {image_path}")

        # Perform OCR using PaddleOCR
        result = ocr.ocr(image_path, cls=True)

        # Check if result is empty
        if not result:
            raise ValueError("No text detected in the image.")

        coor_txt_list = []
        for line in result[0]:
            bbox, (text, conf) = line[:4], line[1]

            # Prepare bounding box in the format required by cv2.rectangle
            bbox_coords = [
                int(bbox[0][0]), int(bbox[0][1]),  # Top-left corner
                int(bbox[2][0]), int(bbox[2][1])   # Bottom-right corner
            ]

            coor_txt_dict = {
                "name": text,
                "shape": "rect",
                "coords": bbox_coords,
                "strokeColor": "#6AFD09",
                "lineWidth": 1,
                "conf_value": conf
            }
            coor_txt_list.append(coor_txt_dict)

            # Draw bounding box on the image
            cv2.rectangle(image, (bbox_coords[0], bbox_coords[1]), (bbox_coords[2], bbox_coords[3]), (0, 255, 0), 2)

        pdf_page_number = os.path.basename(image_path).split(".png")[0]
        image_url = os.path.join(pdf_name + '_' + pdf_page_number)
        pages_list.append({
            "name": pdf_page_number,
            "pageNo": pdf_page_number,
            "imageUrl": image_url,
            "areas": coor_txt_list,
            "height": image.shape[0],
            "width": image.shape[1]
        })

        list_of_pdfs_coor_txt.append({
            "pdfName": pdf_name,
            "pages": pages_list,
            "pageCount": len(pages_list)
        })

        # Save the extracted data as JSON
        with open(output_json_path, 'w') as fp:
            json.dump(list_of_pdfs_coor_txt, fp, indent=4)

        # Save the image with bounding boxes
        output_image_path = os.path.join(os.path.dirname(output_json_path), f"{pdf_page_number}_boxed.png")
        cv2.imwrite(output_image_path, image)
        print(f"Output written to {output_json_path}")
        print(f"Image with bounding boxes saved to {output_image_path}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
image_path = r"C:\PaddleOCR_TextExtraction\page_7.png"
pdf_name = "trial.pdf"
output_json_path = r"C:\PaddleOCR_TextExtraction\page_7.json"

get_coords_and_text(image_path, pdf_name, output_json_path)


[2024/08/11 00:19:35] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ASUS/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ASUS/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6,

: 

In [2]:
from paddleocr import __version__

print(f"PaddleOCR version: {__version__}")


TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates