## Project Directory Structure

- pdf_images/  
  - page_1.png  
  - page_2.png  
  - ...

- pdf_texts/  
  - page_1_text.txt  
  - page_2_text.txt  
  - ...

- output/
  - annotations/
    - page_1_annotation.png
    - page_2_annotation.png
    - ...
  - json/
    - page_1.json
    - page_2.json
    - ...
  - page_1_result.jpg  
  - page_2_result.jpg  
  - ...
  - page_1/  
    - [39, 215, 553, 332]_0.jpg  
    - [132, 44, 621, 222]_1.jpg  
    - ...
  - page_2/  
    - [51, 198, 537, 345]_0.jpg  
    - ...
  
- UnrealText.pdf

In [16]:
# Install the PaddlePaddle framework and the PaddleOCR package into the kernel's environment.
%pip install paddlepaddle paddleocr 

python(11901) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Note: you may need to restart the kernel to use updated packages.


In [17]:
# Install the dependencies listed in the PaddleOCR repository's requirements file.
# The path is relative to the working directory, so a local `PaddleOCR/` checkout
# must already exist when this cell runs.
%pip install -r PaddleOCR/requirements.txt

python(11932) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Note: you may need to restart the kernel to use updated packages.


In [18]:
%git clone https://github.com/PaddlePaddle/PaddleOCR.git

UsageError: Line magic function `%git` not found.


In [11]:
%pip install matplotlib openpyxl tablepyxl fitz PyMuPDF paddleclas

Collecting paddleclas
  Downloading paddleclas-2.5.2-py3-none-any.whl.metadata (29 kB)
Collecting prettytable (from paddleclas)
  Downloading prettytable-3.11.0-py3-none-any.whl.metadata (30 kB)
Collecting ujson (from paddleclas)
  Using cached ujson-5.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.3 kB)
Collecting opencv-python==4.6.0.66 (from paddleclas)
  Downloading opencv_python-4.6.0.66-cp37-abi3-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting visualdl>=2.2.0 (from paddleclas)
  Downloading visualdl-2.5.3-py3-none-any.whl.metadata (25 kB)
Collecting gast==0.3.3 (from paddleclas)
  Downloading gast-0.3.3-py2.py3-none-any.whl.metadata (1.1 kB)
Collecting faiss-cpu (from paddleclas)
  Downloading faiss_cpu-1.8.0.post1-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.7 kB)
Collecting easydict (from paddleclas)
  Downloading easydict-1.13-py3-none-any.whl.metadata (4.2 kB)
Collecting bce-python-sdk (from visualdl>=2.2.0->paddleclas)
  Downloading bce_python_sdk-0.9.23-py3-none-any.

In [12]:
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pymupdf
# Open the source document once; the export cells iterate over `pdf`.
source_pdf_path = 'UnrealText.pdf'
pdf = pymupdf.open(source_pdf_path)

# Displayed as the cell output: number of pages in the document.
pdf.page_count

13

#### Save individual pages as `.png` files in `pdf_images` folder

In [6]:
import os

# Render every page of `pdf` to pdf_images/page_<n>.png (pages are numbered from 1).
output_folder = "pdf_images"
# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(output_folder, exist_ok=True)

for index, page in enumerate(pdf, start=1):
    # NOTE(review): get_pixmap() is called with its default settings; if the OCR
    # cells need sharper input, consider passing a resolution here — confirm first.
    pix = page.get_pixmap()
    img_out_path = os.path.join(output_folder, f"page_{index}.png")
    pix.save(img_out_path)

    print(f"Saved page {index} as {img_out_path}")

Saved page 1 as pdf_images/page_1.png
Saved page 2 as pdf_images/page_2.png
Saved page 3 as pdf_images/page_3.png
Saved page 4 as pdf_images/page_4.png
Saved page 5 as pdf_images/page_5.png
Saved page 6 as pdf_images/page_6.png
Saved page 7 as pdf_images/page_7.png
Saved page 8 as pdf_images/page_8.png
Saved page 9 as pdf_images/page_9.png
Saved page 10 as pdf_images/page_10.png
Saved page 11 as pdf_images/page_11.png
Saved page 12 as pdf_images/page_12.png
Saved page 13 as pdf_images/page_13.png


#### Save the text extracted from each page as a `.txt` file in the `pdf_texts` folder

In [7]:
import os

# Write the text of every page of `pdf` to pdf_texts/page_<n>_text.txt (UTF-8 bytes).
output_text_folder = "pdf_texts"
# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(output_text_folder, exist_ok=True)

for page_num, page in enumerate(pdf, start=1):
    # Encode explicitly and write in binary mode so the bytes on disk are
    # exactly the UTF-8 encoding of the extracted text.
    page_text = page.get_text().encode("utf8")

    output_file_path = os.path.join(output_text_folder, f"page_{page_num}_text.txt")
    with open(output_file_path, "wb") as out_file:
        out_file.write(page_text)

    print(f"Extracted text from page {page_num} and saved as '{output_file_path}'")

Extracted text from page 1 and saved as 'pdf_texts/page_1_text.txt'
Extracted text from page 2 and saved as 'pdf_texts/page_2_text.txt'
Extracted text from page 3 and saved as 'pdf_texts/page_3_text.txt'
Extracted text from page 4 and saved as 'pdf_texts/page_4_text.txt'
Extracted text from page 5 and saved as 'pdf_texts/page_5_text.txt'
Extracted text from page 6 and saved as 'pdf_texts/page_6_text.txt'
Extracted text from page 7 and saved as 'pdf_texts/page_7_text.txt'
Extracted text from page 8 and saved as 'pdf_texts/page_8_text.txt'
Extracted text from page 9 and saved as 'pdf_texts/page_9_text.txt'
Extracted text from page 10 and saved as 'pdf_texts/page_10_text.txt'
Extracted text from page 11 and saved as 'pdf_texts/page_11_text.txt'
Extracted text from page 12 and saved as 'pdf_texts/page_12_text.txt'
Extracted text from page 13 and saved as 'pdf_texts/page_13_text.txt'


#### OCR processing and annotation of all `.png` images in the `pdf_images` folder, saving annotated outputs to `output/annotations`.

In [1]:
import os
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image

# Run OCR on every .png in `img_folder`, print the recognised lines, and save an
# annotated copy of each page to `output_folder` as <stem>_annotation.png.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

img_folder = './pdf_images'
font_path = './PaddleOCR/doc/fonts/Ubuntu-L.ttf'
output_folder = './output/annotations'

# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(output_folder, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort for reproducible processing.
for img_file in sorted(os.listdir(img_folder)):
    if not img_file.endswith('.png'):
        continue
    image_path = os.path.join(img_folder, img_file)

    ocr_result = ocr.ocr(image_path, cls=True)

    # An entry is None when no text was detected in the image; `or []` keeps the
    # loops from raising TypeError in that case.
    for page_result in ocr_result or []:
        for line in page_result or []:
            print(line)

    result = ocr_result[0] if ocr_result else None
    if not result:
        print(f"No text detected in {image_path}; skipping annotation")
        continue

    image = Image.open(image_path).convert('RGB')

    # Each element is indexed as [box, (text, score)] below.
    boxes = [elements[0] for elements in result]
    txts = [elements[1][0] for elements in result]
    scores = [elements[1][1] for elements in result]

    im_show = draw_ocr(image, boxes, txts, scores, font_path=font_path)
    im_show = Image.fromarray(im_show)

    # splitext keeps the full stem even if the file name contains extra dots.
    stem = os.path.splitext(img_file)[0]
    output_image_path = os.path.join(output_folder, f"{stem}_annotation.png")
    im_show.save(output_image_path)

[2024/10/01 22:12:28] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/dhushanthankumararatnam/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/dhushanthankumararatnam/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48

#### Extract table structure from the `.png` images in `pdf_images` using PaddleOCR's PPStructure, saving the results as JSON in `output/json` and the annotated images in `output/`.

In [1]:
import os
import cv2
import json
from paddleocr import PPStructure, draw_structure_result, save_structure_res
from PIL import Image

# Run PPStructure on every .png in `img_folder`. For each page this saves the
# structure results under `save_folder`, a JSON dump under `json_folder`, and an
# annotated image as <stem>_result.jpg in `save_folder`.
table_engine = PPStructure(show_log=True)

save_folder = './output'
img_folder = 'pdf_images'
json_folder = './output/json'
font_path = 'PaddleOCR/doc/fonts/simfang.ttf'

# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(save_folder, exist_ok=True)
os.makedirs(json_folder, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort for reproducible processing.
for img_file in sorted(os.listdir(img_folder)):
    if not img_file.endswith('.png'):
        continue
    img_path = os.path.join(img_folder, img_file)
    # splitext keeps the full stem even if the file name contains extra dots.
    page_name = os.path.splitext(img_file)[0]

    img = cv2.imread(img_path)
    # cv2.imread signals an unreadable file by returning None instead of raising.
    if img is None:
        print(f"Could not read {img_path}; skipping")
        continue

    result = table_engine(img)

    # Called before the 'img' entries are removed below — presumably it relies on
    # them to write the cropped region images; TODO confirm.
    save_structure_res(result, save_folder, page_name)

    # Drop the per-region 'img' entry before serialising to JSON.
    for region in result:
        region.pop('img', None)

    json_file_path = os.path.join(json_folder, f'{page_name}.json')
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must be
    # opened as UTF-8 rather than with the platform's default encoding.
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(result, json_file, ensure_ascii=False, indent=4)

    image = Image.open(img_path).convert('RGB')
    im_show = draw_structure_result(image, result, font_path=font_path)
    im_show = Image.fromarray(im_show)
    im_show.save(os.path.join(save_folder, f'{page_name}_result.jpg'))

[2024/10/01 21:26:05] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/dhushanthankumararatnam/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/dhushanthankumararatnam/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48