In [None]:
# Python dependencies. %pip (rather than !pip) installs into the environment of
# the running kernel; pymupdf and pytesseract are pinned to the versions this
# notebook was last run with so a re-run resolves the same packages.
%pip install pymupdf==1.24.10 pytesseract==0.3.13 opencv-python
# Tesseract OCR engine and development headers. apt-get has a stable scripting
# interface, and -y answers the confirmation prompt so the cell runs unattended.
!sudo apt-get install -y tesseract-ocr libtesseract-dev

import os
import zipfile

import cv2
import fitz
import pytesseract
from google.colab import files

def pdf_to_images(pdf_path, output_folder, dpi=None):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).
        output_folder: Directory that receives the page images; it is created
            if it does not exist yet.
        dpi: Optional rendering resolution forwarded to ``Page.get_pixmap``.
            ``None`` (the default) keeps PyMuPDF's default resolution, which
            is what this function always used before the parameter existed.

    Returns:
        list[str]: Paths of the written images in page order. Files are named
        ``page_<n>.png`` with ``n`` starting at 1.
    """
    page_image_paths = []
    # The context manager closes the document (and its underlying file) even
    # when rendering or saving one of the pages raises.
    with fitz.open(pdf_path) as pdf:
        os.makedirs(output_folder, exist_ok=True)

        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)
            # Only pass dpi when the caller asked for it, so the default call
            # is exactly the one the function made originally.
            pix = page.get_pixmap() if dpi is None else page.get_pixmap(dpi=dpi)
            image_path = os.path.join(output_folder, f"page_{page_num + 1}.png")
            pix.save(image_path)
            page_image_paths.append(image_path)

    return page_image_paths

def extract_text_from_images(image_paths, output_folder):
    """Run Tesseract OCR on each page image and save the text per page.

    Args:
        image_paths: Iterable of image file paths whose base names look like
            ``page_<n>.<ext>``; ``<n>`` is reused in the output file name.
        output_folder: Directory that receives ``page_<n>_text.txt`` files;
            it is created if it does not exist yet.

    Returns:
        None. Progress is reported with ``print``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for image_path in image_paths:
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread reports a missing or undecodable file by returning
            # None instead of raising; skip it so the other pages still run.
            print(f"Skipping {image_path}: could not be read as an image")
            continue

        text = pytesseract.image_to_string(img)
        # "page_12.png" -> "12": the part between the first "_" and the first "."
        page_num = os.path.basename(image_path).split('_')[1].split('.')[0]
        text_file_path = os.path.join(output_folder, f'page_{page_num}_text.txt')
        # OCR output can contain non-ASCII characters; an explicit encoding
        # keeps the write independent of the platform's default locale.
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(text)
        print(f"Text extracted from {image_path} and saved to {text_file_path}")
def detect_and_save_images(image_paths, output_folder, min_size=50):
    """Crop large edge-bounded regions out of each page image and save them.

    Each page is converted to grayscale, blurred, and passed through Canny
    edge detection; the bounding box of every external contour whose width and
    height both exceed ``min_size`` pixels is written as its own PNG.

    Args:
        image_paths: Iterable of image file paths whose base names look like
            ``page_<n>.<ext>``; ``<n>`` names the per-page output folder.
        output_folder: Root directory for the crops; created if missing. Crops
            of page ``n`` go to ``<output_folder>/page_<n>/image_<k>.png``.
        min_size: Exclusive lower bound, in pixels, applied to both the width
            and the height of a bounding box. Defaults to 50.

    Returns:
        None. Each saved crop is reported with ``print``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for image_path in image_paths:
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread reports a missing or undecodable file by returning
            # None instead of raising; skip it so the other pages still run.
            print(f"Skipping {image_path}: could not be read as an image")
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        edged = cv2.Canny(blurred, 50, 150)

        # findContours returns (contours, hierarchy) in OpenCV 4 and
        # (image, contours, hierarchy) in OpenCV 3; [-2] is the contour list
        # in both layouts.
        contours = cv2.findContours(edged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
        # "page_12.png" -> "12": the part between the first "_" and the first "."
        page_num = os.path.basename(image_path).split('_')[1].split('.')[0]
        page_folder = os.path.join(output_folder, f'page_{page_num}')
        os.makedirs(page_folder, exist_ok=True)

        for idx, contour in enumerate(contours):
            x, y, w, h = cv2.boundingRect(contour)
            if w > min_size and h > min_size:
                cropped_image = img[y:y+h, x:x+w]
                # The file number is the contour's index among ALL contours,
                # so numbering has gaps wherever small contours were dropped.
                cropped_image_path = os.path.join(page_folder, f'image_{idx+1}.png')
                cv2.imwrite(cropped_image_path, cropped_image)
                print(f"Image saved at: {cropped_image_path}")
def _zip_folders(archive_path, folders):
    """Write a fresh zip at ``archive_path`` holding each folder tree in ``folders``."""
    # Mode "w" truncates an existing archive, so entries left over from an
    # earlier run cannot linger in the download.
    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for folder in folders:
            for dirpath, _dirnames, filenames in os.walk(folder):
                # Store the directory entry too so empty folders are kept.
                archive.write(dirpath)
                for filename in sorted(filenames):
                    archive.write(os.path.join(dirpath, filename))


def process_pdf(pdf_path):
    """Run the whole extraction pipeline on one PDF and download the results.

    Steps, all relative to the current working directory:
      1. render the pages to ``pdf_pages/``;
      2. OCR each page into ``extracted_text/``;
      3. crop detected image regions into ``extracted_images/``;
      4. pack ``extracted_text`` and ``extracted_images`` into
         ``extracted_results.zip`` and pass it to ``files.download``.

    The output folders are reused, not emptied, between calls, so files
    written by an earlier call that are not overwritten remain in them and
    are included in the archive.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        None.
    """
    image_output_folder = 'pdf_pages'
    text_output_folder = 'extracted_text'
    image_save_folder = 'extracted_images'
    archive_name = 'extracted_results.zip'

    page_image_paths = pdf_to_images(pdf_path, image_output_folder)
    extract_text_from_images(page_image_paths, text_output_folder)
    detect_and_save_images(page_image_paths, image_save_folder)

    # Built with the standard library instead of a `!zip` shell escape so the
    # function is plain Python and does not depend on a zip binary.
    _zip_folders(archive_name, [text_output_folder, image_save_folder])
    files.download(archive_name)
# Entry point of the notebook: run the complete pipeline on the PDF that was
# placed in the Colab workspace.
pdf_path = '/content/TTA1.pdf'
process_pdf(pdf_path=pdf_path)


Collecting pymupdf
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDFb==1.24.10 (from pymupdf)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m87.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.10 pymupdf-1.24.10 pytesseract-0.3.13
Reading package lists... Done
Building dependency

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>