In [None]:
!apt-get install -y tesseract-ocr
!apt-get install -y poppler-utils
!pip install pdf2image opencv-python pytesseract

!git clone https://github.com/ultralytics/yolov5
%cd yolov5
!pip install -r requirements.txt

import os
import torch
import cv2
import math
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# Output directories: rendered pages, one folder per detected element type,
# and the OCR text. exist_ok=True keeps re-runs of the cell from failing.
for _output_dir in (
    '/content/pdf_images',
    '/content/detected_images',
    '/content/detected_tables',
    '/content/detected_forms',
    '/content/detected_charts',
    '/content/extracted_text',
):
    os.makedirs(_output_dir, exist_ok=True)

# Fetch the pre-trained YOLOv5 "small" checkpoint through torch.hub.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Function to convert PDF to images
def convert_pdf_to_images(pdf_file):
    """Render each page of ``pdf_file`` to a PNG under /content/pdf_images.

    Pages are written as ``page_<n>.png`` with ``n`` starting at 1.

    Returns:
        The list of written image paths, in page order.
    """
    saved_paths = []
    for page_number, page in enumerate(convert_from_path(pdf_file), start=1):
        target = f"/content/pdf_images/page_{page_number}.png"
        page.save(target, 'PNG')
        print(f"Converted page {page_number} to image: {target}")
        saved_paths.append(target)
    return saved_paths

def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at ``image_path`` and return the text.

    Args:
        image_path: Path of an image file readable by PIL.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until garbage
    # collection.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    print(f"Extracted text from {image_path}")
    return text

def is_table(image):
    """Heuristic: treat a BGR crop as a table if it has many line segments.

    Runs Canny edge detection followed by a probabilistic Hough transform and
    returns True only when more than 10 line segments are found.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(grayscale, 50, 150, apertureSize=3)
    segments = cv2.HoughLinesP(
        edge_map,
        1,
        math.pi / 180,
        threshold=100,
        minLineLength=50,
        maxLineGap=10,
    )
    # HoughLinesP returns None (not an empty array) when nothing is found.
    if segments is None:
        return False
    return len(segments) > 10

def is_chart(image):
    """Heuristic: treat a BGR crop as a chart if any circle is detected.

    Uses the Hough gradient circle detector on the grayscale crop, limited to
    radii between 10 and 100 pixels.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    found_circles = cv2.HoughCircles(
        grayscale,
        cv2.HOUGH_GRADIENT,
        1,
        20,
        param1=50,
        param2=30,
        minRadius=10,
        maxRadius=100,
    )
    # The detector yields None when no circle is found.
    return found_circles is not None

def is_form(image):
    """Heuristic: treat a BGR crop as a form if it has a four-sided contour.

    External contours of the Canny edge map are simplified to polygons; the
    crop counts as a form as soon as one polygon has exactly four vertices.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(grayscale, 50, 150)
    outlines, _hierarchy = cv2.findContours(
        edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    def _has_four_vertices(outline):
        # Approximation tolerance is 2% of the closed contour's perimeter.
        tolerance = 0.02 * cv2.arcLength(outline, True)
        return len(cv2.approxPolyDP(outline, tolerance, True)) == 4

    # any() stops at the first matching contour, like an early return.
    return any(_has_four_vertices(outline) for outline in outlines)

def save_detected_elements(results_df, image_path):
    """Crop each detection out of a page image and save it by heuristic type.

    Each bounding box is cropped from the image at ``image_path`` and
    classified with ``is_table``, ``is_form`` and ``is_chart`` (checked in
    that order; anything else is saved as a generic image). The crop is
    written under the matching ``/content/detected_*`` directory.

    Output file names include the page image's base name as well as the
    detection index, so detections from different pages no longer overwrite
    each other (the index alone restarts for every page's results).

    Args:
        results_df: DataFrame with one row per detection and numeric
            ``xmin``, ``ymin``, ``xmax``, ``ymax`` columns in pixel units.
        image_path: Path of the page image the detections refer to.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        print(f"Could not read {image_path}; skipping its detections")
        return

    height, width = img.shape[:2]
    page_name = os.path.splitext(os.path.basename(image_path))[0]

    for idx, row in results_df.iterrows():
        # Clamp the box to the image so a negative coordinate cannot wrap
        # around as a negative slice index.
        xmin = max(int(row['xmin']), 0)
        ymin = max(int(row['ymin']), 0)
        xmax = min(int(row['xmax']), width)
        ymax = min(int(row['ymax']), height)
        if xmax <= xmin or ymax <= ymin:
            # An empty crop would make cv2.cvtColor raise in the classifiers.
            continue
        cropped_img = img[ymin:ymax, xmin:xmax]

        if is_table(cropped_img):
            kind, folder = 'table', 'detected_tables'
        elif is_form(cropped_img):
            kind, folder = 'form', 'detected_forms'
        elif is_chart(cropped_img):
            kind, folder = 'chart', 'detected_charts'
        else:
            kind, folder = 'image', 'detected_images'

        filename = f'{kind}_{page_name}_{idx}.png'
        cv2.imwrite(f'/content/{folder}/{filename}', cropped_img)
        print(f"Saved {kind} from {image_path} as {folder}/{filename}")
pdf_file = '/content/TTA1.pdf'

# Render the PDF to page images, then OCR each page and run detection on it.
image_paths = convert_pdf_to_images(pdf_file)
for image_path in image_paths:
    text = extract_text_from_image(image_path)

    # Reuse the page image's base name for the text file instead of
    # substring-replacing parts of the full path.
    page_name = os.path.splitext(os.path.basename(image_path))[0]
    text_file_path = os.path.join('/content/extracted_text', f'{page_name}.txt')
    # OCR output can contain non-ASCII characters; pin the encoding so the
    # write does not depend on the platform default.
    with open(text_file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(text)
    print(f"Saved extracted text to {text_file_path}")

    img = cv2.imread(image_path)
    # cv2.imread yields BGR channel order, while the YOLOv5 hub model expects
    # RGB for numpy input, so convert before inference.
    results = model(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    results_df = results.pandas().xyxy[0]
    save_detected_elements(results_df, image_path)

    print(f"Processed {image_path}")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Cloning into 'yolov5'...
remote: Enumerating objects: 16965, done.[K
remote: Counting objects: 100% (160/160), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 16965 (delta 79), reused 102 (delta 50), pack-reused 16805 (from 1)[K
Receiving objects: 100% (16965/16965), 15.71 MiB | 21.30 MiB/s, done.
Resolving deltas: 100% (11617/11617), done.
/content/yolov5/yolov5/yolov5/yolov5/yolov5


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-9-23 Python-3.10.12 torch-2.4.1+cu121 CPU

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|██████████| 14.1M/14.1M [00:00<00:00, 347MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Converted page 1 to image: /content/pdf_images/page_1.png
Converted page 2 to image: /content/pdf_images/page_2.png
Converted page 3 to image: /content/pdf_images/page_3.png
Extracted text from /content/pdf_images/page_1.png
Saved extracted text to /content/extracted_text/page_1.txt


  with amp.autocast(autocast):


Processed /content/pdf_images/page_1.png
Extracted text from /content/pdf_images/page_2.png
Saved extracted text to /content/extracted_text/page_2.txt


  with amp.autocast(autocast):


Processed /content/pdf_images/page_2.png
Extracted text from /content/pdf_images/page_3.png
Saved extracted text to /content/extracted_text/page_3.txt


  with amp.autocast(autocast):


Saved form from /content/pdf_images/page_3.png as detected_forms/form_0.png
Saved form from /content/pdf_images/page_3.png as detected_forms/form_1.png
Saved form from /content/pdf_images/page_3.png as detected_forms/form_2.png
Saved form from /content/pdf_images/page_3.png as detected_forms/form_3.png
Saved form from /content/pdf_images/page_3.png as detected_forms/form_4.png
Processed /content/pdf_images/page_3.png
