## Extracting text from medical books (PDFs with images)

In [1]:
#Install the required libraries to extract text from PDF files containing images
!pip install -q langchain langchain-community pymupdf openai python-dotenv numpy paddleocr

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.7/544.7 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.9/161.9 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.8/297.8 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m969.6/969.6 kB[0m [31m46.7 MB/s[0m eta [36

In [2]:
#Check GPU availability with CUDA
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
#Install paddle library compatible with GPU
!pip install -q paddlepaddle-gpu==2.6.2 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/cu118.html


In [4]:
# Sanity check: confirm the installed paddle build has CUDA support and
# select the GPU as the active device before any OCR model is created.
import paddle
# Prints whether this paddle build reports being compiled with CUDA.
print("Paddle Compiled with CUDA:", paddle.device.is_compiled_with_cuda())
# Select the GPU device for subsequent paddle work.
paddle.set_device('gpu')
# Echo the device paddle reports as active.
print("Using device:", paddle.get_device())

Paddle Compiled with CUDA: True
Using device: gpu:0


In [None]:
import fitz  # PyMuPDF
import os
import io
import threading
import time
import concurrent.futures
from datetime import datetime
from PIL import Image, UnidentifiedImageError
import numpy as np
from paddleocr import PaddleOCR

# Initialize PaddleOCR once at module level (English, angle classifier on, GPU enabled).
# NOTE(review): this single instance is used by every worker thread started
# below — confirm that concurrent ocr.ocr() calls on one instance are safe.
ocr = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=True)  # No batch size is set here; library defaults apply

# Input folder holding the PDFs and output folder for the extracted .txt files
pdf_folder = "/content/drive/MyDrive/LLM_RAG_MED/dat_rag/med_books"
output_folder = "/content/drive/MyDrive/LLM_RAG_MED/dat_rag/medbooks_txt"
# Per-file summary report and periodic progress log, both inside output_folder
report_filepath = os.path.join(output_folder, "medbooks_extraction_report.txt")
progress_filepath = os.path.join(output_folder, "extraction_progress_log.txt")

# In-flight progress per PDF: {pdf_name: {"num_pages": int, "num_images": int}}
progress_data = {}

# Guards progress_data and the appends to the report / progress log files
progress_lock = threading.Lock()

# Function to log progress every 3 minutes for all active files
def log_progress():
    """Append a snapshot of ``progress_data`` to the progress log every 3 minutes.

    Runs forever; intended to be the target of a daemon thread. Nothing is
    written for a cycle in which no PDF is currently being tracked.
    """
    while True:
        time.sleep(180)  # Log every 3 minutes
        with progress_lock:
            # Nothing in flight: skip this cycle without touching the log file.
            if not progress_data:
                continue
            with open(progress_filepath, "a", encoding="utf-8") as log_file:
                snapshot = [
                    f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Progress Update:\n"
                ]
                snapshot.extend(
                    f"- {name}: {stats['num_pages']} pages, {stats['num_images']} images extracted\n"
                    for name, stats in progress_data.items()
                )
                log_file.writelines(snapshot)

# Start the progress logger in the background. It is a daemon thread, so the
# endless loop in log_progress does not keep the interpreter alive on exit.
logging_thread = threading.Thread(target=log_progress, daemon=True)
logging_thread.start()

# Helper: OCR a single embedded image, isolating its failures from the rest of the PDF
def _ocr_embedded_image(doc, xref, pdf_name):
    """Return the OCR text for the image stored at *xref* in *doc*.

    Args:
        doc: The open PyMuPDF document the image belongs to.
        xref: Cross-reference number of the image (first item of a
            ``page.get_images(full=True)`` entry).
        pdf_name: File name used in the printed diagnostics.

    Returns:
        The recognised lines joined with newlines, ``""`` when OCR found no
        text, or ``None`` when the image was skipped (unsupported format or a
        load/processing error).
    """
    try:
        # Extraction happens inside the try so a bad xref skips only this
        # image instead of aborting the whole PDF.
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]

        # The context manager releases the decoder once the pixels are copied.
        with Image.open(io.BytesIO(image_bytes)) as image:
            # Only PNG and JPEG payloads are sent to OCR.
            if image.format not in ('PNG', 'JPEG', 'JPG'):
                print(f"Skipping image in {pdf_name} with invalid format: {image.format}")
                return None

            # Palette ("P"), bilevel ("1"), CMYK and similar modes turn into
            # index/bool/4-channel arrays that do not represent RGB pixels.
            # Normalise those; leave RGB, RGBA and greyscale untouched.
            # NOTE(review): assumes the OCR call accepts greyscale and
            # 3/4-channel uint8 arrays as before — confirm.
            if image.mode not in ("RGB", "RGBA", "L"):
                pixels = image.convert("RGB")
            else:
                pixels = image
            image_np = np.array(pixels)  # Convert PIL image to NumPy array

        ocr_result = ocr.ocr(image_np, cls=True)

        if ocr_result and ocr_result[0]:
            return "\n".join(line[1][0] for line in ocr_result[0])
        return ""
    except (UnidentifiedImageError, IOError, OSError, ValueError, MemoryError, TypeError, KeyError) as e:
        # KeyError covers an extraction result that has no "image" entry.
        print(f"Error processing image in {pdf_name}: {e}. Skipping this image.")
        return None


# Function to extract text, including from images, from a PDF
def extract_pdf_text(pdf_path, pdf_name):
    """Extract the page text of a PDF plus OCR text from its PNG/JPEG images.

    Progress is published to ``progress_data[pdf_name]`` after every page so
    the background logger can report it.

    Args:
        pdf_path: Filesystem path of the PDF to open.
        pdf_name: Key used for this file in ``progress_data`` and in messages.

    Returns:
        A ``(text, num_pages, num_images)`` tuple. ``text`` is all page text
        and OCR text joined with newlines; ``num_pages`` is the page count of
        the document (0 if it could not be opened); ``num_images`` counts
        every image reference encountered, including skipped ones.

    Errors raised while reading the PDF are printed, not propagated; the text
    collected up to that point is still returned.
    """
    text_content = []
    num_pages = 0
    num_images = 0

    try:
        # Using the document as a context manager closes it on every exit
        # path, including a failure part-way through the pages.
        with fitz.open(pdf_path) as doc:
            num_pages = len(doc)

            # Initialize tracking for this file
            with progress_lock:
                progress_data[pdf_name] = {"num_pages": 0, "num_images": 0}

            for page_num, page in enumerate(doc):
                text_content.append(page.get_text("text"))  # Extract standard text

                # Extract images and apply OCR
                for img in page.get_images(full=True):
                    num_images += 1
                    ocr_text = _ocr_embedded_image(doc, img[0], pdf_name)
                    if ocr_text is not None:
                        text_content.append(ocr_text)

                # Update progress per file
                with progress_lock:
                    progress_data[pdf_name]["num_pages"] = page_num + 1
                    progress_data[pdf_name]["num_images"] = num_images

    except Exception as e:  # Catch PyMuPDF-related errors
        print(f"Error processing PDF {pdf_name}: {e}. Skipping this PDF.")

    return "\n".join(text_content), num_pages, num_images

# Function to process a single PDF and update the report
def process_pdf(pdf_file):
    """Extract one PDF to a ``.txt`` file and append its stats to the report.

    Args:
        pdf_file: File name (not path) of a PDF inside ``pdf_folder``.

    Returns:
        None. A PDF whose ``.txt`` already exists in ``output_folder`` is
        skipped.
    """
    pdf_path = os.path.join(pdf_folder, pdf_file)
    txt_filename = os.path.splitext(pdf_file)[0] + ".txt"
    txt_filepath = os.path.join(output_folder, txt_filename)

    # Skip already processed PDFs
    if os.path.exists(txt_filepath):
        print(f"Skipping {pdf_file} (already processed).")
        return

    print(f"Processing {pdf_file}...")

    try:
        # Extract text from the PDF
        extracted_text, num_pages, num_images = extract_pdf_text(pdf_path, pdf_file)

        # Write to a temporary sibling and rename it into place, so an
        # interrupted write never leaves a partial .txt that the
        # "already processed" check above would accept on the next run.
        tmp_filepath = txt_filepath + ".tmp"
        with open(tmp_filepath, "w", encoding="utf-8") as text_file:
            text_file.write(extracted_text)
        os.replace(tmp_filepath, txt_filepath)

        # Append extraction details to the report
        with progress_lock:
            with open(report_filepath, "a", encoding="utf-8") as report:
                report.write(f"File: {pdf_file}\n")
                report.write(f"Pages: {num_pages}\n")
                report.write(f"Images Extracted: {num_images}\n")
                report.write("---------------------\n")
    finally:
        # Remove the file from progress tracking on every exit path. pop()
        # tolerates a missing key: extract_pdf_text never registers the file
        # when the PDF cannot be opened.
        with progress_lock:
            progress_data.pop(pdf_file, None)

    print(f"Completed {pdf_file}: {num_pages} pages, {num_images} images.")

# Make sure the output folder exists before the report or any .txt is written
os.makedirs(output_folder, exist_ok=True)

# Ensure the report file has a header
if not os.path.exists(report_filepath) or os.stat(report_filepath).st_size == 0:
    with open(report_filepath, "w", encoding="utf-8") as report:
        report.write("Med Books PDF Extraction Report\n")
        report.write("=====================\n\n")

# Get list of PDF files
pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]

# Process PDFs in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:  # Adjust worker count based on system
    # Keep the futures so that an exception raised inside a worker is
    # reported; a discarded executor.map() iterator would hide it.
    futures = {executor.submit(process_pdf, pdf_file): pdf_file for pdf_file in pdf_files}
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as e:
            # One failing PDF must not stop the remaining ones.
            print(f"Failed to process {futures[future]}: {e}")

print("PDF Extraction Completed!")