<a href="https://colab.research.google.com/github/Ayuathm/Job_matching_Analysis_AI_SSD/blob/main/OCR_Parallel_Text_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📄 OCR-Enabled Parallel Text Extraction in Colab
This notebook extracts text from PDFs and DOCX files using PyMuPDF, pytesseract (for scanned documents), and docx2txt. It uses parallel processing with batching, timeouts, and checkpointing to efficiently handle large datasets.

In [1]:
# 📦 Install required libraries
!pip install pymupdf pytesseract docx2txt pdf2image tqdm
# ✅ Install Tesseract OCR backend (optional but recommended)
!apt-get install -y poppler-utils tesseract-ocr

Collecting pymupdf
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: docx2txt, pytesseract, pymupdf, pdf2image
Successfully installed docx2txt-0.9 pdf2image-1.17.0 pymupdf-1.26.1 pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is alr

In [3]:
import os
import fitz  # PyMuPDF
import pytesseract
import docx2txt
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError
from tqdm import tqdm

In [5]:
# ✅ Step 2: Mount Google Drive
# The attachments folder and the output CSV used below both live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [6]:
# 📁 Input and output locations
# Folder that is walked recursively for .pdf / .docx attachments.
attachments_folder = "/content/drive/MyDrive/batch6"
# Final CSV with one row per processed file.
output_file = "/content/drive/MyDrive/extracted_batch6.csv"
# Intermediate results, rewritten after every batch. Kept on Drive rather than on
# the Colab VM's local disk (/content), because the local disk is discarded when
# the runtime is recycled -- the very failure a checkpoint is meant to survive.
checkpoint_file = "/content/drive/MyDrive/extracted_batch6.checkpoint.csv"

# 🧠 OCR-aware PDF extractor
def extract_text_from_pdf(file_path):
    """Return the text of a PDF, falling back to OCR when no text layer is found.

    The embedded text of every page is read with PyMuPDF; pages whose text is
    blank are dropped and the rest are joined with newlines. If nothing is left,
    the pages are rendered to images at 200 dpi and passed through Tesseract,
    and the stripped OCR result is returned.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text, or a string starting with "ERROR in PDF: " when any
        step raises. Failures are reported in-band instead of being raised.
    """
    try:
        with fitz.open(file_path) as doc:
            # Extract each page exactly once; filtering and joining reuse the
            # same strings instead of calling get_text() a second time.
            page_texts = [page.get_text() for page in doc]
        text = "\n".join(page_text for page_text in page_texts if page_text.strip())
        if text.strip():
            return text
        # 🧾 Fallback to OCR: no page carried usable embedded text.
        images = convert_from_path(file_path, dpi=200)
        ocr_text = "\n".join([pytesseract.image_to_string(img) for img in images])
        return ocr_text.strip()
    except Exception as e:
        return f"ERROR in PDF: {e}"

# 📄 DOCX extractor
def extract_text_from_docx(file_path):
    """Return the whitespace-trimmed text of a DOCX file.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The text produced by docx2txt with surrounding whitespace removed, or a
        string starting with "ERROR in DOCX: " if reading or trimming raises.
    """
    try:
        raw_text = docx2txt.process(file_path)
        trimmed = raw_text.strip()
    except Exception as e:
        return f"ERROR in DOCX: {e}"
    return trimmed

# 📦 Unified file processor
def process_file(file_path):
    """Extract text from one file, choosing the extractor by file extension.

    The extension check is case-insensitive: ".pdf" goes to
    extract_text_from_pdf and ".docx" to extract_text_from_docx.

    Args:
        file_path: Path of the file to process.

    Returns:
        A (file_path, text) tuple. The second item is the extractor's result,
        "Unsupported file type" for any other extension, or a string starting
        with "ERROR in file: " if anything raises along the way.
    """
    try:
        lowered = file_path.lower()
        if lowered.endswith(".pdf"):
            extracted = extract_text_from_pdf(file_path)
        elif lowered.endswith(".docx"):
            extracted = extract_text_from_docx(file_path)
        else:
            extracted = "Unsupported file type"
    except Exception as e:
        return file_path, f"ERROR in file: {e}"
    return file_path, extracted

# 🗂️ Collect files
# Walk the attachments folder recursively and keep every path whose name ends in
# .pdf or .docx (case-insensitive), in the order os.walk yields them.
supported_suffixes = (".pdf", ".docx")
file_list = [
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk(attachments_folder)
    for entry in entries
    if entry.lower().endswith(supported_suffixes)
]

print(f"📦 Total files found: {len(file_list)}")

# ⚡ Safe parallel execution with timeout + checkpointing
timeout_secs = 180  # 3 minutes max per file
batch_size = 500


def _process_file_with_deadline(file_path, limit_secs):
    """Run process_file inside a worker process with a wall-clock limit.

    The limit is enforced with SIGALRM (POSIX only, which is what Colab runs on).
    Python delivers the signal between bytecodes, so a long call inside a C
    extension is interrupted only once it returns to the interpreter.

    Args:
        file_path: Path handed to process_file.
        limit_secs: Whole seconds allowed before the file is abandoned.

    Returns:
        process_file's (file_path, text) tuple, or (file_path, "ERROR: Timeout")
        when the alarm fired while the file was being processed.
    """
    import signal  # local import: only this helper needs it

    fired = []

    def _on_alarm(signum, frame):
        fired.append(True)
        raise TimeoutError("Timeout")

    previous_handler = signal.signal(signal.SIGALRM, _on_alarm)
    signal.alarm(int(limit_secs))
    try:
        outcome = process_file(file_path)
    finally:
        # Always disarm and restore, so the alarm cannot leak into the next task
        # executed by this worker.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)
    # process_file turns exceptions into "ERROR ..." strings, so the timeout is
    # detected through the flag rather than through a propagated exception.
    if fired:
        return file_path, "ERROR: Timeout"
    return outcome


# ♻️ Resume: reuse rows from an earlier, interrupted run instead of redoing them.
results = []
if os.path.exists(checkpoint_file):
    try:
        saved = pd.read_csv(checkpoint_file, dtype=str, keep_default_na=False)
        restored = dict(zip(saved["filename"], saved["text"]))
        wanted = set(file_list)
        # Rows for files no longer in file_list are dropped.
        results = [(name, text) for name, text in restored.items() if name in wanted]
    except Exception as e:
        # A truncated or foreign checkpoint must not block the run.
        print(f"⚠️ Ignoring unreadable checkpoint {checkpoint_file}: {e}")
        results = []
    if results:
        print(f"♻️ Resuming from checkpoint: {len(results)} files already processed")

already_done = {name for name, _ in results}
pending = [f for f in file_list if f not in already_done]

for batch_idx in range(0, len(pending), batch_size):
    batch = pending[batch_idx:batch_idx + batch_size]
    print(f"🚀 Processing batch {batch_idx//batch_size + 1}")

    with ProcessPoolExecutor(max_workers=6) as executor:
        futures = {
            executor.submit(_process_file_with_deadline, f, timeout_secs): f
            for f in batch
        }
        for future in tqdm(as_completed(futures), total=len(futures)):
            file = futures[future]
            try:
                # as_completed only yields finished futures, so a timeout passed
                # to result() could never fire; the limit is enforced in the worker.
                result = future.result()
            except TimeoutError:
                result = (file, "ERROR: Timeout")
            except Exception as e:
                result = (file, f"ERROR: {e}")
            results.append(result)

    # 💾 Save intermediate checkpoint
    # Write to a temporary file first and swap it in, so an interruption during
    # the write cannot leave a half-written checkpoint behind.
    tmp_checkpoint = checkpoint_file + ".tmp"
    pd.DataFrame(results, columns=["filename", "text"]).to_csv(tmp_checkpoint, index=False)
    os.replace(tmp_checkpoint, checkpoint_file)
    print(f"💾 Checkpoint saved for batch {batch_idx//batch_size + 1}")

# ✅ Final export
df = pd.DataFrame(results, columns=["filename", "text"])
df.to_csv(output_file, index=False)
# The run is complete: drop the checkpoint so the next execution starts fresh
# instead of treating every file as already processed.
if os.path.exists(checkpoint_file):
    os.remove(checkpoint_file)
print(f"✅ Extraction complete. Final CSV saved to: {output_file}")

📦 Total files found: 1382
🚀 Processing batch 1


100%|██████████| 500/500 [2:37:34<00:00, 18.91s/it]


💾 Checkpoint saved for batch 1
🚀 Processing batch 2


100%|██████████| 500/500 [2:55:58<00:00, 21.12s/it]


💾 Checkpoint saved for batch 2
🚀 Processing batch 3


100%|██████████| 382/382 [1:54:20<00:00, 17.96s/it]


💾 Checkpoint saved for batch 3
✅ Extraction complete. Final CSV saved to: /content/drive/MyDrive/extracted_batch6.csv
