In [1]:
pip install opencv-python numpy pytesseract pillow





[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\Himan\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [2]:
import cv2
import numpy as np
import pytesseract
from PIL import Image
import os

class OCRProcessor:
    """Find text-like regions in an image with OpenCV and read them with Tesseract.

    The full pipeline is run by ``process_image``:

    1. ``preprocess_image``     - grayscale, denoise, adaptive threshold, dilate
    2. ``detect_text_regions``  - bounding boxes of external contours, filtered
       by aspect ratio and area, sorted by (y, x)
    3. ``extract_text``         - Tesseract on each box

    The class-level constants below are the tuning parameters of the pipeline;
    override them on a subclass or an instance to experiment.
    """

    # cv2.adaptiveThreshold parameters: neighbourhood size in pixels (must be
    # odd) and the constant subtracted from the weighted local mean.
    ADAPTIVE_BLOCK_SIZE = 11
    ADAPTIVE_C = 2

    # Shape of the dilation kernel. NOTE: with a 1x1 kernel the dilation leaves
    # the image unchanged; use a larger shape (e.g. (3, 15)) if neighbouring
    # characters should actually be merged into word/line blobs.
    DILATE_KERNEL_SHAPE = (1, 1)

    # Bounding boxes are kept only if width/height lies in this range and the
    # contour area exceeds MIN_CONTOUR_AREA (in pixels).
    MIN_ASPECT_RATIO = 0.1
    MAX_ASPECT_RATIO = 15
    MIN_CONTOUR_AREA = 100

    # Tesseract page segmentation modes: 3 = fully automatic page
    # segmentation, 7 = treat the image as a single text line.
    FULL_PAGE_CONFIG = '--psm 3'
    SINGLE_LINE_CONFIG = '--psm 7'

    def __init__(self, tesseract_path=None):
        """Create a processor.

        Args:
            tesseract_path: Optional path to the Tesseract executable. When
                given, it is stored in ``pytesseract.pytesseract.tesseract_cmd``
                (a module-level setting, so it affects every pytesseract call
                in the process). When omitted, pytesseract's current setting
                is left untouched.
        """
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def preprocess_image(self, image):
        """Binarise an image to make text easier to detect and recognise.

        Args:
            image: NumPy image array. Three-channel input is interpreted as
                BGR (what ``cv2.imread`` returns), four-channel input as BGRA;
                a 2-D or single-channel array is used as grayscale directly.

        Returns:
            A single-channel binary image (values 0 and 255) of the same
            height and width as the input.
        """
        # Pick the grayscale conversion by channel count, so that already
        # grayscale input does not make cvtColor raise.
        if image.ndim == 2:
            gray = image
        elif image.shape[2] == 1:
            gray = np.ascontiguousarray(image[:, :, 0])
        elif image.shape[2] == 4:
            gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Non-local-means denoising with OpenCV's default strength.
        denoised = cv2.fastNlMeansDenoising(gray)

        # Local (adaptive) threshold copes with uneven lighting better than a
        # single global threshold.
        thresh = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, self.ADAPTIVE_BLOCK_SIZE, self.ADAPTIVE_C
        )

        # Dilation is meant to connect text components; see the note on
        # DILATE_KERNEL_SHAPE about the default kernel being a no-op.
        kernel = np.ones(self.DILATE_KERNEL_SHAPE, np.uint8)
        return cv2.dilate(thresh, kernel, iterations=1)

    def detect_text_regions(self, preprocessed_image):
        """Detect candidate text regions from the contours of a binary image.

        NOTE(review): ``cv2.findContours`` traces non-zero (white) pixels.
        ``preprocess_image`` uses ``THRESH_BINARY``, so white is whatever is
        brighter than its neighbourhood. For dark text on a light background
        the contours therefore follow the background rather than the glyphs;
        invert the image first if that is the input you have.

        Args:
            preprocessed_image: 8-bit single-channel binary image.

        Returns:
            List of ``(x, y, w, h)`` bounding boxes that pass the aspect-ratio
            and area filters, sorted by ``y`` and then ``x``.
        """
        # findContours returns (contours, hierarchy) in OpenCV 4.x but
        # (image, contours, hierarchy) in 3.x; index -2 is the contour list
        # in both layouts.
        contours = cv2.findContours(
            preprocessed_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2]

        boxes = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            if h == 0:
                # Defensive: never divide by a zero height.
                continue

            aspect_ratio = w / float(h)
            area = cv2.contourArea(contour)

            if (self.MIN_ASPECT_RATIO <= aspect_ratio <= self.MAX_ASPECT_RATIO
                    and area > self.MIN_CONTOUR_AREA):
                boxes.append((x, y, w, h))

        # Order by exact top coordinate first, then left coordinate.
        boxes.sort(key=lambda box: (box[1], box[0]))
        return boxes

    def extract_text(self, image, regions=None):
        """Run Tesseract on the whole image or on each of the given regions.

        Args:
            image: Image to read. With ``regions`` it must be a NumPy array,
                because it is sliced.
            regions: Optional iterable of ``(x, y, w, h)`` boxes. ``None``
                means "OCR the entire image"; an empty iterable yields ``[]``.

        Returns:
            If ``regions`` is ``None``: the recognised text as a string.
            Otherwise: a list of ``{'text': str, 'position': (x, y, w, h)}``
            dicts, one per region in which non-blank text was recognised.
            ``position`` is the box clipped to the image bounds, which equals
            the input box whenever that box lies inside the image.
        """
        if regions is None:
            # Process entire image
            return pytesseract.image_to_string(image, config=self.FULL_PAGE_CONFIG)

        height, width = image.shape[:2]
        extracted_text = []
        for x, y, w, h in regions:
            # Clip the box to the image. Without this, a negative coordinate
            # would be read as a from-the-end index and select the wrong
            # pixels, and an out-of-range box would give an empty ROI that
            # Image.fromarray cannot handle.
            left, top = max(x, 0), max(y, 0)
            right, bottom = min(x + w, width), min(y + h, height)
            if right <= left or bottom <= top:
                continue

            roi = image[top:bottom, left:right]
            pil_image = Image.fromarray(roi)

            # Each region is read as a single line of text.
            text = pytesseract.image_to_string(
                pil_image, config=self.SINGLE_LINE_CONFIG
            ).strip()

            if text:
                extracted_text.append({
                    'text': text,
                    'position': (left, top, right - left, bottom - top)
                })

        return extracted_text

    def process_image(self, image_path):
        """Run the full pipeline on an image file.

        Args:
            image_path: Path of the image to read with ``cv2.imread``.

        Returns:
            A dict that always contains ``success``, ``text_regions`` and
            ``total_regions``. On success ``text_regions`` is the list
            returned by ``extract_text`` and ``total_regions`` its length
            (regions without recognised text are not counted). On failure
            ``success`` is ``False``, ``error`` holds the exception message
            and the region fields are empty, so callers can iterate the
            result without checking ``success`` first.

        No exception is propagated: every ``Exception`` is reported through
        the returned dict.
        """
        try:
            # cv2.imread signals an unreadable/missing file by returning None
            # instead of raising.
            image = cv2.imread(image_path)
            if image is None:
                raise ValueError("Could not read image file")

            preprocessed = self.preprocess_image(image)
            regions = self.detect_text_regions(preprocessed)

            # OCR is run on the binarised image, not on the original.
            results = self.extract_text(preprocessed, regions)

            return {
                'success': True,
                'text_regions': results,
                'total_regions': len(results)
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'text_regions': [],
                'total_regions': 0
            }

def main():
    """Run the OCR pipeline on one sample image and print the recognised text.

    Prints the number of regions in which text was found, followed by each
    region's text and ``(x, y, w, h)`` position, or an error line when the
    image could not be processed.
    """
    # Replace with your image path
    image_path = './markus-spiske-3JgQuMyFkgA-unsplash.jpg'

    # If Tesseract is not on PATH, pass its location to the constructor, e.g.
    # OCRProcessor(r'C:\Program Files\Tesseract-OCR\tesseract.exe')
    results = OCRProcessor().process_image(image_path)

    # Report the failure and stop early; the happy path follows unindented.
    if not results['success']:
        print(f"Error processing image: {results['error']}")
        return

    print(f"Found {results['total_regions']} text regions:")
    for region in results['text_regions']:
        print(f"Text: {region['text']}")
        print(f"Position: {region['position']}\n")

# Run the demo only when this code is executed as the top-level program
# (which includes running the cell in a notebook), not when it is imported.
if __name__ == "__main__":
    main()

Found 9 text regions:
Text: td er ag
Position: (789, 0, 47, 13)

Text: pa tg rer
Position: (1653, 0, 83, 14)

Text: od
Position: (2624, 27, 34, 11)

Text: tig
Position: (2715, 140, 21, 17)

Text: re
Position: (2738, 314, 23, 21)

Text: ar
Position: (2735, 2282, 44, 42)

Text: A
Position: (273, 2309, 20, 24)

Text: a.
Position: (348, 2313, 18, 14)

Text: be
Position: (346, 2314, 35, 19)

