In [None]:
import os
import warnings
from concurrent.futures import ThreadPoolExecutor

import cv2
import numpy as np
from scipy.stats import gaussian_kde, chi2
from scipy.spatial.distance import mahalanobis

def _estimate_area_threshold(segment_areas):
    """Pick an area cut-off from the distribution of connected-component areas.

    A Gaussian KDE is fitted to the areas and the area at which the second
    derivative of the density is largest is returned.

    Returns 0.0 (keep every component) when there are fewer than two
    components or all areas are identical, because a KDE cannot be fitted to
    such data.
    """
    areas = np.asarray(segment_areas, dtype=float)
    if areas.size < 2 or areas.min() == areas.max():
        return 0.0
    kde = gaussian_kde(areas)
    x_vals = np.linspace(areas.min(), areas.max(), 500)
    curvature = np.gradient(np.gradient(kde(x_vals), x_vals), x_vals)
    return float(x_vals[np.argmax(curvature)])


def _contour_features(contours):
    """Return an (n_contours, 3) array: aspect ratio, contour area, extent.

    Extent is the contour area divided by the area of its bounding box.
    """
    rows = []
    for contour in contours:
        # Compute each OpenCV measurement once per contour.
        _, _, width, height = cv2.boundingRect(contour)
        area = cv2.contourArea(contour)
        rows.append((width / max(height, 1), area, area / max(width * height, 1)))
    return np.array(rows, dtype=float)


def _mahalanobis_outliers(features, quantile):
    """Flag rows whose squared Mahalanobis distance exceeds the chi-square cut-off.

    Returns an all-False mask when there are not more samples than features,
    since the sample covariance is then rank-deficient and the distance is
    not meaningful.
    """
    n_samples, n_features = features.shape
    if n_samples <= n_features:
        return np.zeros(n_samples, dtype=bool)
    # pinv tolerates a singular covariance (e.g. perfectly collinear features),
    # where a plain inverse would raise LinAlgError.
    inv_cov = np.linalg.pinv(np.cov(features, rowvar=False))
    deltas = features - features.mean(axis=0)
    md_squared = np.einsum("ij,jk,ik->i", deltas, inv_cov, deltas)
    return md_squared > chi2.ppf(quantile, df=n_features)


def _write_image(output_path, image):
    """Write ``image`` to ``output_path``; raise OSError if OpenCV reports failure."""
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Could not write image: {output_path}")


def process_image(image_path, output_path, threshold=None, quantile=0.99936, alpha=1.5):
    """Remove small and outlier-shaped components from an image and sharpen it.

    Steps:
      1. Read the image in grayscale and binarize it with an inverted fixed
         threshold (pixels <= 128 become foreground).
      2. Drop connected components whose pixel area is below ``threshold``.
      3. Among the remaining external contours, erase those that are
         Mahalanobis outliers in (aspect ratio, area, extent) space, unless
         their area and aspect ratio both fall inside the 10th-90th
         percentile range of all contours.
      4. Apply an unsharp mask, re-binarize and write the result.

    Args:
        image_path: Path of the image to read.
        output_path: Path the processed image is written to.
        threshold: Minimum component area to keep. When None, it is estimated
            from the area distribution (see ``_estimate_area_threshold``).
        quantile: Chi-square quantile used as the squared-distance cut-off.
        alpha: Unsharp-mask strength; the output is
            ``(1 + alpha) * image - alpha * blurred``.

    Returns:
        A ``(image, output_path)`` tuple, where ``image`` is the uint8 array
        that was written.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not load ``image_path``.
        OSError: If ``cv2.imwrite`` reports that the output was not written.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise FileNotFoundError(f"Image not found: {image_path}")

    _, binary = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY_INV)
    _, labels, stats, _ = cv2.connectedComponentsWithStats(binary, connectivity=8)
    segment_areas = stats[1:, cv2.CC_STAT_AREA]  # Row 0 is the background label

    if threshold is None:
        threshold = _estimate_area_threshold(segment_areas)
    threshold = float(threshold)

    # +1 because segment_areas skips the background row, so index 0 is label 1.
    kept_labels = np.flatnonzero(segment_areas >= threshold) + 1
    filtered_image = np.isin(labels, kept_labels).astype(np.uint8) * 255

    contours, _ = cv2.findContours(filtered_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        _write_image(output_path, filtered_image)
        return filtered_image, output_path

    features = _contour_features(contours)
    aspects, areas = features[:, 0], features[:, 1]

    # "Typical letter" ranges: the central 80% of areas and aspect ratios.
    min_letter_area, max_letter_area = np.percentile(areas, [10, 90])
    min_letter_aspect, max_letter_aspect = np.percentile(aspects, [10, 90])
    letter_like = (
        (areas >= min_letter_area) & (areas <= max_letter_area)
        & (aspects >= min_letter_aspect) & (aspects <= max_letter_aspect)
    )

    # Statistical outliers are erased unless they still look like a letter.
    to_remove = _mahalanobis_outliers(features, quantile) & ~letter_like

    non_letter_mask = np.zeros_like(filtered_image)
    for contour, remove in zip(contours, to_remove):
        if remove:
            cv2.drawContours(non_letter_mask, [contour], -1, 255, thickness=cv2.FILLED)

    final_image = cv2.bitwise_and(filtered_image, filtered_image, mask=cv2.bitwise_not(non_letter_mask))

    # Unsharp mask, then re-binarize so the output stays strictly 0/255.
    blurred = cv2.GaussianBlur(final_image, (3, 3), 0)
    sharpened = cv2.addWeighted(final_image, 1 + alpha, blurred, -alpha, 0)
    _, binary_sharpened = cv2.threshold(sharpened, 128, 255, cv2.THRESH_BINARY)

    _write_image(output_path, binary_sharpened)
    return binary_sharpened, output_path

def process_multiple_images(input_dir, output_dir, threshold=None, quantile=0.99936, alpha=1.5, max_workers=None):
    """Run ``process_image`` on every image file in ``input_dir`` using a thread pool.

    Files are selected by extension (.png, .jpg, .jpeg, .bmp, case-insensitive)
    and each result is written to ``output_dir`` under the same file name.
    ``output_dir`` is created if it does not exist.

    Args:
        input_dir: Directory to scan for images (not recursive).
        output_dir: Directory the processed images are written to.
        threshold: Forwarded to ``process_image``.
        quantile: Forwarded to ``process_image``.
        alpha: Forwarded to ``process_image``.
        max_workers: Thread-pool size. Defaults to ``min(8, number of images)``.

    Returns:
        List of output paths for the images that were processed successfully,
        in sorted file-name order. Empty when no image files were found.

    Notes:
        A failure on one image does not stop the batch: it is reported with a
        ``RuntimeWarning`` and that image is left out of the returned list.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Leading dots matter: without them a name like "scanpng" would match.
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp')
    image_files = sorted(
        name for name in os.listdir(input_dir)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(input_dir, name))
    )
    if not image_files:
        # ThreadPoolExecutor rejects max_workers=0, so bail out before building it.
        return []

    if max_workers is None:
        max_workers = min(8, len(image_files))

    written_paths = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = [
            (name, executor.submit(process_image,
                                   os.path.join(input_dir, name),
                                   os.path.join(output_dir, name),
                                   threshold, quantile, alpha))
            for name in image_files
        ]
        # Results must be collected explicitly; otherwise worker exceptions
        # are never raised and failures go unnoticed.
        for name, future in pending:
            try:
                _, written_path = future.result()
            except Exception as exc:
                warnings.warn(f"Failed to process {name}: {exc}", RuntimeWarning, stacklevel=2)
            else:
                written_paths.append(written_path)
    return written_paths

# Example usage: process every image found in "data" and write results to "output".
if __name__ == "__main__":
    process_multiple_images(input_dir="data", output_dir="output")


In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Load the image in grayscale
img = cv2.imread("./output/147.jpg", cv2.IMREAD_GRAYSCALE)
if img is None:
    # FileNotFoundError is still an Exception, so existing handlers keep working.
    raise FileNotFoundError("Image not found. Please check the path.")

# Step 1: Ensure we have black text on white background
# If the mean pixel value is low (dark image), assume text is white on black and invert.
if np.mean(img) < 127:
    img = cv2.bitwise_not(img)

# Step 2: Enhance contrast using CLAHE (Contrast Limited Adaptive Histogram Equalization)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
img_clahe = clahe.apply(img)

# Step 3: Denoise with a median filter to remove small artifacts
img_denoised = cv2.medianBlur(img_clahe, 3)

# Step 4: Adaptive thresholding for a crisp binary image
img_thresh = cv2.adaptiveThreshold(
    img_denoised,
    255,
    cv2.ADAPTIVE_THRESH_MEAN_C,
    cv2.THRESH_BINARY,
    11, 2
)

# Step 5: Morphological closing to connect broken parts of letters
# OpenCV morphology grows/shrinks the WHITE pixels. The text is black here, so
# the operations are applied to the inverted image (white strokes) and the
# result is inverted back; applying them directly would thin and break strokes.
kernel = np.ones((3, 3), np.uint8)
strokes = cv2.bitwise_not(img_thresh)
strokes_closed = cv2.morphologyEx(strokes, cv2.MORPH_CLOSE, kernel)
img_closed = cv2.bitwise_not(strokes_closed)

# Optional: Dilation to thicken strokes slightly (adjust iterations if needed)
strokes_thick = cv2.dilate(strokes_closed, kernel, iterations=1)
img_final = cv2.bitwise_not(strokes_thick)

# Display input and processed images side by side
fig, (ax_input, ax_output) = plt.subplots(1, 2, figsize=(12, 6))
ax_input.imshow(img, cmap='gray')
ax_input.set_title("Input Image")
ax_input.axis("off")
ax_output.imshow(img_final, cmap='gray')
ax_output.set_title("Processed Image for OCR")
ax_output.axis("off")
plt.show()

# Save the processed image; imwrite signals failure through its return value.
if not cv2.imwrite("processed_image.png", img_final):
    raise OSError("Could not write processed_image.png")
