In [None]:
!pip install -U layoutparser
!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip install layoutparser[ocr]
!apt-get update
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
!apt-get install -y poppler-utils
!pip install pytesseract
!pip install pdf2image
!pip install Pillow==9.5.0

import cv2
import os
import layoutparser as lp
from PIL import Image
import pytesseract
import numpy as np
from pdf2image import convert_from_path

# Input: rasterise the PDF with pdf2image; `images` is iterated page by page
# by the extraction loop further down.
pdf_path = "/content/TTA1.pdf"
images = convert_from_path(pdf_path)

# Output tree: <output_dir>/<category>/ receives the cropped regions of that
# category, and <output_dir>/main_document.txt receives the running text.
output_dir = "/content/extracted_document"
os.makedirs(output_dir, exist_ok=True)

categories = ["Table", "Figure"]
for category in categories:
    category_dir = os.path.join(output_dir, category)
    os.makedirs(category_dir, exist_ok=True)

main_txt_file = os.path.join(output_dir, "main_document.txt")

# Layout detector: PubLayNet Mask R-CNN (X-101 backbone) loaded through
# layoutparser. The extra_config pair sets detectron2's test-time score
# threshold (MODEL.ROI_HEADS.SCORE_THRESH_TEST) to 0.65.
model = lp.Detectron2LayoutModel(
    'lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config',
    # Class index -> label name: 0 Text, 1 Title, 2 List, 3 Table, 4 Figure.
    label_map=dict(enumerate(["Text", "Title", "List", "Table", "Figure"])),
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.65],
)
# Running number of saved crops per category. The counter spans the whole
# document (it is not reset per page) so file names stay unique.
element_counter = {"Table": 0, "Figure": 0}

# UTF-8 is requested explicitly: OCR output can contain non-ASCII characters
# and the platform default encoding is not guaranteed to accept them.
with open(main_txt_file, 'w', encoding='utf-8') as main_doc:
    for image in images:
        # Keep an RGB copy of the page for cropping: both PIL's
        # Image.fromarray and pytesseract interpret arrays as RGB, so cropping
        # from the BGR array would save images with red and blue swapped.
        page_rgb = np.array(image)
        # The detector keeps receiving the same BGR array as before.
        image_np = cv2.cvtColor(page_rgb, cv2.COLOR_RGB2BGR)
        page_height, page_width = page_rgb.shape[:2]

        layout = model.detect(image_np)

        # Process blocks top-to-bottom (by upper y coordinate). sorted() is
        # used because Layout.sort() returns a new object unless inplace=True,
        # so calling it and discarding the result leaves the order unchanged.
        ordered_blocks = sorted(layout, key=lambda b: b.coordinates[1])

        for element in ordered_blocks:
            category = element.type
            x1, y1, x2, y2 = map(int, element.coordinates)

            # Clamp the box to the page: a negative index would wrap around
            # in NumPy slicing and select the wrong region.
            x1, x2 = max(x1, 0), min(x2, page_width)
            y1, y2 = max(y1, 0), min(y2, page_height)
            if x2 <= x1 or y2 <= y1:
                # Degenerate box: nothing to save or OCR.
                continue

            cropped_image = page_rgb[y1:y2, x1:x2]

            if category in ("Table", "Figure"):
                # Tables and figures are stored as images and referenced from
                # the text file by path.
                element_counter[category] += 1
                img_filename = f'{category}_{element_counter[category]}.jpg'
                img_path = os.path.join(output_dir, category, img_filename)
                Image.fromarray(cropped_image).save(img_path)

                main_doc.write(f'[Link to {category} {element_counter[category]}: {img_path}]\n')

            elif category in ("Text", "Title", "List"):
                # Textual regions are OCR'd and written inline.
                extracted_text = pytesseract.image_to_string(cropped_image)
                main_doc.write(f'\n{extracted_text.strip()}\n')


Collecting detectron2
  Cloning https://github.com/facebookresearch/detectron2.git (to revision v0.4) to /tmp/pip-install-ujnpi3jb/detectron2_b4b0bb1f71c84ef6ab58e5dfd6c93c8d
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-install-ujnpi3jb/detectron2_b4b0bb1f71c84ef6ab58e5dfd6c93c8d
  Running command git checkout -q 4aca4bdaa9ad48b8e91d7520e0d0815bb8ca0fb1
  Resolved https://github.com/facebookresearch/detectron2.git to commit 4aca4bdaa9ad48b8e91d7520e0d0815bb8ca0fb1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Ign:6 https://r2u.stat.illinois.edu/ubun

config.yaml?dl=1: 8.19kB [00:01, 4.99kB/s]
model_final.pth?dl=1: 856MB [00:56, 15.1MB/s]                           
  return torch.load(f, map_location=torch.device("cpu"))
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
