In [1]:
import os
import io
import numpy as np
from matplotlib import pyplot as plt

import fitz
from PIL import Image

from tqdm import tqdm

In [2]:
import pytesseract
# Location of the Tesseract executable. The TESSERACT_CMD environment variable
# overrides it; when unset, the original default Windows install path is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

import easyocr
# Initialize EasyOCR reader for English, French, German, Italian and Spanish
reader = easyocr.Reader(['en', 'fr', 'de', 'it', 'es'], gpu=True)

In [3]:
# Banking dataset path
dataset_path = "documents_banking"

# os.listdir returns every directory entry in arbitrary order, so keep only
# the PDF files and sort them to get a stable, reproducible processing order.
pdf_documents_paths = sorted(
    os.path.join(dataset_path, pdf_doc)
    for pdf_doc in os.listdir(dataset_path)
    if pdf_doc.lower().endswith(".pdf")
)

print(pdf_documents_paths[:2])

['documents_banking\\doc-01.pdf', 'documents_banking\\doc-02.pdf']


In [4]:
# Function to check and extract text w/ Tesseract
def extract_text_from_pdf(pdf_document_path):
    """Extract the text of a PDF, falling back to Tesseract OCR.

    The embedded text of every page is read first. If that yields nothing but
    whitespace, each page is rendered to an image and passed to
    ``pytesseract.image_to_string``.

    Args:
        pdf_document_path: Path of the PDF file to open with ``fitz``.

    Returns:
        The per-page text concatenated in page order, each page followed by a
        newline. An empty string is returned if any exception is raised while
        processing (the error is printed, not propagated).
    """
    print(f"\n--> Processing: {pdf_document_path}")

    try:
        # The context manager closes the document on every exit path; the
        # handle was previously left open for each processed file.
        with fitz.open(pdf_document_path) as doc:
            # First, try extracting embedded text
            text = "".join(page.get_text().strip() + "\n" for page in doc)

            if text.strip():  # If text is found
                print("✅ Embedded Text Found")
                return text

            print("⚠ No embedded text detected. Switching to OCR...")
            # OCR for each page
            # NOTE(review): pages are rendered with get_pixmap() defaults; a
            # higher resolution may improve OCR accuracy - confirm.
            ocr_pages = []
            for page in doc:
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                ocr_pages.append(pytesseract.image_to_string(img) + "\n")
            print("✅ OCR Extraction Complete")
            return "".join(ocr_pages)

    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"❌ Error processing {pdf_document_path}: {e}")
        return ""
    
def extract_text_from_pdf_easyOcr(pdf_document_path, reader=None):
    """Extract the text of a PDF, falling back to EasyOCR.

    The embedded text of every page is read first. If that yields nothing but
    whitespace, each page is rendered to a PNG, decoded into an array and
    passed to ``reader.readtext``; the recognised strings of a page are joined
    with single spaces.

    Args:
        pdf_document_path: Path of the PDF file to open with ``fitz``.
        reader: Object exposing ``readtext(image_array)``. Only needed when
            the OCR fallback is reached; embedded text is returned without it.

    Returns:
        The per-page text concatenated in page order, each page followed by a
        newline. An empty string is returned if any exception is raised while
        processing, including a missing ``reader`` when OCR is required (the
        error is printed, not propagated).
    """
    print(f"\n--> Processing: {pdf_document_path}")

    try:
        # The context manager closes the document on every exit path; the
        # handle was previously left open for each processed file.
        with fitz.open(pdf_document_path) as doc:
            # First, try extracting embedded text
            text = "".join(page.get_text().strip() + "\n" for page in doc)

            if text.strip():  # If embedded text found
                print("✅ Embedded Text Found")
                return text

            print("⚠ No embedded text detected. Switching to EasyOCR...")

            # Fail with an explicit message instead of the opaque
            # "'NoneType' object has no attribute 'readtext'".
            if reader is None:
                raise ValueError(
                    "an EasyOCR reader is required for OCR but reader=None was passed"
                )

            # OCR for each page using EasyOCR
            ocr_pages = []
            for page_num, page in enumerate(doc, start=1):
                pix = page.get_pixmap()
                img_bytes = pix.tobytes("png")  # Convert to PNG bytes
                img = Image.open(io.BytesIO(img_bytes))

                results = reader.readtext(np.array(img))
                # Index 1 of every result entry is the recognised string
                ocr_pages.append(" ".join(res[1] for res in results) + "\n")
                print(f"✅ OCR processed page {page_num}")

            print("✅ EasyOCR Extraction Complete")
            return "".join(ocr_pages)

    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"❌ Error processing {pdf_document_path}: {e}")
        return ""

In [5]:
# Destination directory for the extracted .txt files (created if missing)
output_folder = "documents_banking_txt"
os.makedirs(output_folder, exist_ok=True)

# Run the Tesseract-backed extractor on every PDF and persist its result.
# To try EasyOCR instead, call extract_text_from_pdf_easyOcr(pdf_document_path, reader).
for pdf_document_path in pdf_documents_paths:
    extracted_text_tesseract = extract_text_from_pdf(pdf_document_path)

    # Strip the directory and the extension, then build "<output_folder>/<name>.txt"
    pdf_base_name = os.path.basename(pdf_document_path)
    file_name = os.path.splitext(pdf_base_name)[0]
    output_path = os.path.join(output_folder, file_name + ".txt")

    # Written as UTF-8; an empty extraction result produces an empty file
    with open(output_path, mode="w", encoding="utf-8") as f:
        f.write(extracted_text_tesseract)


--> Processing: documents_banking\doc-01.pdf
⚠ No embedded text detected. Switching to OCR...
✅ OCR Extraction Complete

--> Processing: documents_banking\doc-02.pdf
✅ Embedded Text Found

--> Processing: documents_banking\doc-03.pdf
⚠ No embedded text detected. Switching to OCR...
✅ OCR Extraction Complete

--> Processing: documents_banking\doc-04.pdf
✅ Embedded Text Found

--> Processing: documents_banking\doc-05.pdf
⚠ No embedded text detected. Switching to OCR...
✅ OCR Extraction Complete

--> Processing: documents_banking\doc-06.pdf
✅ Embedded Text Found

--> Processing: documents_banking\doc-07.pdf
⚠ No embedded text detected. Switching to OCR...
✅ OCR Extraction Complete

--> Processing: documents_banking\doc-08.pdf
✅ Embedded Text Found

--> Processing: documents_banking\doc-09.pdf
⚠ No embedded text detected. Switching to OCR...
✅ OCR Extraction Complete

--> Processing: documents_banking\doc-10.pdf
✅ Embedded Text Found

--> Processing: documents_banking\doc-11.pdf
⚠ No embe