# In [2]:
import logging
import multiprocessing
import os
from pathlib import Path

import numpy as np
import pytesseract
from PIL import Image
from tqdm import tqdm

def extract_text_from_image(image_path):
    """Run Tesseract OCR on a single image file and return the recognised text.

    Args:
        image_path: Path of the image to read; passed straight to
            ``PIL.Image.open``.

    Returns:
        The string produced by ``pytesseract.image_to_string``, or ``""`` when
        opening the image or running OCR raises any exception.
    """
    try:
        # The context manager closes the underlying file as soon as OCR is
        # done, so long-lived pool workers do not accumulate open handles.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as exc:
        # Deliberately best-effort: one unreadable image must not abort a
        # whole batch, but the failure is logged so it is not invisible.
        logging.getLogger(__name__).warning(
            "OCR failed for %s: %r", image_path, exc
        )
        return ""

def process_images(image_paths):
    """OCR each path in ``image_paths`` one after another.

    Progress is displayed by wrapping the iterable in ``tqdm``.

    Args:
        image_paths: Iterable of image paths handed to
            ``extract_text_from_image`` one at a time.

    Returns:
        A list with one extracted-text string per input path, in input order.
    """
    return [extract_text_from_image(image_file) for image_file in tqdm(image_paths)]

# Batch processing with multiprocessing
def process_images_in_batches(image_dir, batch_size=10000, num_workers=8):
    """OCR every ``*.jpg`` file directly inside ``image_dir`` using a process pool.

    The paths are split into consecutive slices of ``batch_size`` and each
    slice is mapped over ``extract_text_from_image`` in parallel. A progress
    line is printed before each batch.

    Args:
        image_dir: Directory whose ``*.jpg`` files are processed
            (non-recursive glob).
        batch_size: Maximum number of images submitted to the pool per batch.
            Must be at least 1.
        num_workers: Number of worker processes in the pool. ``None`` lets
            ``multiprocessing.Pool`` choose its own default.

    Returns:
        A list of extracted-text strings, one per image, in the order the
        paths were yielded by ``Path.glob``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    all_image_paths = list(Path(image_dir).glob('*.jpg'))  # List all images
    if not all_image_paths:
        # Nothing to do: skip the cost of spawning worker processes.
        return []

    # Integer ceiling division; avoids a float round-trip for a simple count.
    num_batches = -(-len(all_image_paths) // batch_size)

    all_texts = []

    # One pool is shared by every batch so workers are spawned only once
    # instead of being created and torn down per batch.
    with multiprocessing.Pool(processes=num_workers) as pool:
        for batch_index in range(num_batches):
            print(f"Processing batch {batch_index + 1}/{num_batches}")
            start = batch_index * batch_size
            batch_paths = all_image_paths[start:start + batch_size]
            all_texts.extend(pool.map(extract_text_from_image, batch_paths))

    return all_texts

# Example usage. The __main__ guard is required for multiprocessing: worker
# processes started with the "spawn" method re-import this module, and without
# the guard each of them would try to launch the whole job again.
if __name__ == "__main__":
    image_texts = process_images_in_batches(image_dir='../images')
