<a href="https://colab.research.google.com/github/Abhishekjha111/AI-codes/blob/master/Untitled35.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2
import easyocr
import os

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en'])  # You can add more languages as needed

# Path to the folder containing your images
image_folder = "path_to_image_folder"

# Function to process and extract text from an image
def extract_text_from_image(image_path):
    """Run OCR on a single image file and return the recognized strings.

    The image is loaded with OpenCV, rotated 90 degrees clockwise and passed
    to the module-level EasyOCR ``reader``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A list of recognized text strings, one per detected text region, or
        an empty list when the file cannot be decoded as an image.
    """
    # cv2.imread reports failure by returning None instead of raising, so an
    # unreadable file has to be detected explicitly.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Could not read image {image_path}; skipping")
        return []

    # Perform any additional preprocessing (e.g., noise removal, resizing, etc.) as needed
    # Example: image = cv2.resize(image, (800, 600))

    # Rotate the image to improve text recognition. cv2.rotate swaps width and
    # height, so non-square images are not cropped the way a warpAffine into
    # a (w, h) canvas would crop them.
    rotated_image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)

    # Use EasyOCR to extract text; each entry is (bounding_box, text, confidence).
    result = reader.readtext(rotated_image)

    # Keep only the recognized text of each entry.
    return [entry[1] for entry in result]

# Loop through all images in the folder
for filename in sorted(os.listdir(image_folder)):  # sorted: os.listdir order is arbitrary
    # Compare the extension in lower case so files like "SCAN.JPG" are included.
    if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):  # Add more extensions if needed
        continue

    image_path = os.path.join(image_folder, filename)
    extracted_text = extract_text_from_image(image_path)

    # Do something with the extracted text (e.g., print, save to a file, or further processing)
    print(f"Text from {filename}:")
    for text in extracted_text:
        print(text)

Here is one way to combine all the suggestions into a single program:

```python
import cv2
import easyocr
import os
from multiprocessing import Pool
import csv

reader = easyocr.Reader(['en'])

def process_image(image_path):
  """OCR one image and return ``(image_path, text)``.

  Args:
    image_path: Path of the image file to process.

  Returns:
    A ``(image_path, text)`` tuple where ``text`` joins, with single spaces,
    every recognized string whose confidence exceeds 0.5. ``text`` is empty
    when the image cannot be read, so the tuple can always be written as a
    CSV row.
  """
  # cv2.imread does not raise on an unreadable file; it returns None.
  image = cv2.imread(image_path)
  if image is None:
    print(f"Error reading image {image_path}")
    return image_path, ""

  # Preprocessing
  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  resized = cv2.resize(gray, (800,600))

  # OCR. The default detail level yields (box, text, confidence) entries;
  # detail=0 would return bare strings with no confidence to filter on.
  result = reader.readtext(resized)

  text = " ".join(entry[1] for entry in result if entry[2] > 0.5)

  return image_path, text

def main():
  """OCR every image in the ``images`` folder and write ``results.csv``.

  The CSV has a header row ``Image,Text`` followed by one row per processed
  image, as returned by ``process_image``.
  """
  image_folder = 'images'
  # Hand only image files to the workers; os.listdir also returns
  # sub-directories and unrelated files.
  image_paths = [
    os.path.join(image_folder, f)
    for f in sorted(os.listdir(image_folder))
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
  ]

  # The context manager shuts the pool down even when a worker raises.
  with Pool(8) as pool:
    texts = pool.map(process_image, image_paths)

  # newline='' lets the csv module control line endings itself.
  with open('results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Image','Text'])
    # Skip entries for which process_image returned nothing.
    writer.writerows(row for row in texts if row is not None)

  # Additional analysis on extracted texts
  ...

if __name__ == '__main__':
  main()
```

The key changes:

- Uses multiprocessing pool for parallelization
- Adds OCR confidence filtering
- Writes extracted texts to CSV for persistence
- Combines image loading, preprocessing and OCR in one function

Let me know if any part needs more explanation!

Here are a few ways you could improve the image text extraction code:

1. Add error handling for invalid/unsupported image formats:

```python
try:
  image = cv2.imread(image_path)
except Exception as e:
  print(f"Error reading image {image_path}: {e}")
  continue
```

2. Parallelize the processing using multiprocessing:

```python
from multiprocessing import Pool

def process_image(image_path):
  # extract text logic
  # NOTE(review): placeholder body -- `extracted_text` has to be produced by
  # the extraction logic above before this return can work.
  return extracted_text

if __name__ == '__main__':

  # NOTE(review): `image_paths` is not defined in this snippet; it is
  # presumably the list of image files to process -- confirm at the call site.
  pool = Pool(processes=8)
  # map() blocks until every path has been processed and returns the results
  # in the same order as `image_paths`.
  texts = pool.map(process_image, image_paths)
  pool.close()
```

3. Use OCR confidence scores to filter out low-confidence results:

```python
# The default detail level returns (bounding_box, text, confidence) entries;
# with detail=0 only the text strings come back and there is no score to filter on.
result = reader.readtext(image)

# Keep the text of every entry whose confidence exceeds 0.5.
text = " ".join(entry[1] for entry in result if entry[2] > 0.5)
```

4. Persist extracted text to a CSV/database for later analysis

5. Add preprocessing like noise removal, deskew, resize for better OCR

6. Handle landscape/portrait orientations intelligently

7. Add progress tracking for long image folders

8. Cache/reuse OCR model for faster processing

9. Analyze/group extracted texts after processing all images

In [None]:
import cv2
import easyocr
import os
from multiprocessing import Pool
import csv
import time

reader = easyocr.Reader(['en'])

def process_image(image_path):
  """OCR one image and return ``(image_path, text)``.

  Args:
    image_path: Path of the image file to process.

  Returns:
    A ``(image_path, text)`` tuple where ``text`` joins, with single spaces,
    every recognized string whose confidence exceeds 0.5. ``text`` is empty
    when the image cannot be read.
  """
  # cv2.imread does not raise on an unreadable file; it returns None.
  image = cv2.imread(image_path)
  if image is None:
    print(f"Error reading image {image_path}")
    return image_path, ""

  # Preprocessing
  gray = preprocess(image)

  # OCR. The default detail level yields (box, text, confidence) entries;
  # detail=0 would return bare strings with no confidence to filter on.
  result = reader.readtext(gray)

  text = " ".join(entry[1] for entry in result if entry[2] > 0.5)

  return image_path, text

def preprocess(image):
  """Prepare a BGR image for OCR: grayscale, deskew, then resize to 800x600."""
  grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  straightened = deskew(grayscale)
  return cv2.resize(straightened, (800,600))

def deskew(image):
  """Return ``image`` with skew corrected.

  No skew estimation is implemented yet, so the input is returned unchanged.
  This keeps the preprocessing pipeline runnable instead of failing with a
  NameError on an undefined result variable.

  Args:
    image: The image to deskew.

  Returns:
    The same ``image`` object, unmodified.
  """
  # TODO: estimate the skew angle and rotate the image accordingly.
  return image

def main():
  """OCR all images in the ``images`` folder in parallel and report the elapsed time."""
  # perf_counter is a monotonic clock meant for measuring intervals;
  # time.time() can jump if the system clock is adjusted.
  start = time.perf_counter()

  image_folder = 'images'
  # NOTE(review): get_image_paths and write_results are not defined in the
  # visible part of this notebook -- confirm they exist before running.
  image_paths = get_image_paths(image_folder)

  # The context manager shuts the pool down even when a worker raises.
  with Pool(8) as pool:
    texts = pool.map(process_image, image_paths)

  write_results(texts)

  end = time.perf_counter()
  print(f'Completed in {end-start} seconds')

if __name__ == '__main__':
  main()

Certainly! Here's an updated version of the code that includes additional image processing steps such as noise removal, resizing, and adaptive thresholding to enhance OCR accuracy:

```python
import cv2
import easyocr
import os
import multiprocessing
import pandas as pd
import numpy as np

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en'])  # You can add more languages as needed

# Path to the folder containing your images
image_folder = "path_to_image_folder"

# Function to process and extract text from an image
def extract_text_from_image(image_path):
    """Preprocess one image and return its high-confidence OCR text.

    Pipeline: grayscale, 5x5 Gaussian blur, 2x cubic upscaling, adaptive mean
    thresholding, then OCR with the module-level EasyOCR ``reader``.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A list of recognized strings whose confidence exceeds 0.7. The list
        is empty when the image cannot be read or processing fails.
    """
    # cv2.imread returns None (it does not raise) for a missing or undecodable file.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Could not read image {image_path}; skipping")
        return []

    # NOTE(review): no orientation correction is done here. Probing
    # reader.model_list[0].module_list for a 'deskew' step relies on
    # attributes that do not appear in EasyOCR's documented Reader API; the
    # resulting error was silently swallowed and every image returned [].
    try:
        # Convert the image to grayscale
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Noise removal using Gaussian blur
        gray_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

        # Resize the image for better OCR accuracy
        resized_image = cv2.resize(gray_image, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

        # Apply adaptive thresholding to binarize the image
        binary_image = cv2.adaptiveThreshold(resized_image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)

        # Use EasyOCR to extract text; entries are (bounding_box, text, confidence)
        result = reader.readtext(binary_image)
    except Exception as e:
        # Best effort: one bad image must not abort the batch, but report why it failed.
        print(f"Failed to process {image_path}: {e}")
        return []

    # Filter results based on confidence scores (adjust the threshold as needed)
    return [entry[1] for entry in result if entry[2] > 0.7]

# Function to process images in parallel
def process_images(image_paths):
    """Extract text from every path in ``image_paths`` and return one flat list.

    The strings returned by ``extract_text_from_image`` for each image are
    concatenated in the order the paths are given.
    """
    return [
        line
        for path in image_paths
        for line in extract_text_from_image(path)
    ]

# Everything below starts worker processes, so it is guarded: on platforms
# that start workers by re-importing this module, unguarded pool creation
# would be executed again in every child.
if __name__ == "__main__":
    # Get a list of image file paths from the folder (extension match is case-insensitive)
    image_paths = [os.path.join(image_folder, filename)
                   for filename in sorted(os.listdir(image_folder))
                   if filename.lower().endswith(('.jpg', '.jpeg', '.png'))]

    # Split the image paths into chunks of num_cores paths for parallel processing
    num_cores = multiprocessing.cpu_count()
    image_chunks = [image_paths[i:i + num_cores] for i in range(0, len(image_paths), num_cores)]

    # Initialize a progress counter
    processed_count = 0
    total_images = len(image_paths)

    # Initialize a list to store extracted text
    all_extracted_text = []

    # Process images in parallel. imap yields results in submission order, so
    # each result can be paired with its chunk and progress advances by the
    # number of images actually finished.
    with multiprocessing.Pool(processes=num_cores) as pool:
        for chunk, extracted_text_chunk in zip(image_chunks, pool.imap(process_images, image_chunks)):
            all_extracted_text.extend(extracted_text_chunk)
            processed_count += len(chunk)
            print(f"Processed {processed_count}/{total_images} images")

    # Create a DataFrame to store the extracted text
    df = pd.DataFrame({'Extracted Text': all_extracted_text})

    # Save the DataFrame to a CSV file
    df.to_csv('extracted_text.csv', index=False)

    # Analyze/group extracted texts
    # For example, you can count and print the most frequent words
    if not df.empty:  # the .str accessor needs actual string values
        word_counts = df['Extracted Text'].str.split().explode().value_counts()
        print("Most frequent words:")
        print(word_counts.head(10))
```

This code now includes Gaussian blur for noise removal, resizing for better OCR accuracy, and adaptive thresholding for binarization. Adjust the parameters as needed for your specific images.

.

**Assumptions:**
1. We have a folder named "sample_images" containing various images with text.
2. The images may have varying levels of noise.
3. We will resize the images to a width of 800 pixels for better OCR accuracy.
4. We will apply adaptive thresholding to binarize the images.
5. We will set a confidence threshold of 0.7 to filter out low-confidence OCR results.
6. We will save the extracted text to a CSV file named "extracted_text.csv".
7. We will analyze the extracted text by counting the most frequent words.

```python
import cv2
import easyocr
import os
import multiprocessing
import pandas as pd
import numpy as np

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en'])

# Path to the folder containing sample images (hypothetical)
image_folder = "sample_images"

# Function to process and extract text from an image
def extract_text_from_image(image_path):
    """Preprocess one image and return its high-confidence OCR text.

    Pipeline: grayscale, 5x5 Gaussian blur, resize to a width of 800 pixels
    (aspect ratio preserved), adaptive mean thresholding, then OCR with the
    module-level EasyOCR ``reader``.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A list of recognized strings whose confidence exceeds 0.7. The list
        is empty when the image cannot be read or processing fails.
    """
    # cv2.imread returns None (it does not raise) for a missing or undecodable file.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Could not read image {image_path}; skipping")
        return []

    # NOTE(review): no orientation correction is done here. Probing
    # reader.model_list[0].module_list for a 'deskew' step relies on
    # attributes that do not appear in EasyOCR's documented Reader API; the
    # resulting error was silently swallowed and every image returned [].
    try:
        # Convert the image to grayscale
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Noise removal using Gaussian blur
        gray_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

        # Resize the image to a fixed width of 800 pixels; clamp the height
        # to at least 1 so very wide images do not produce a zero-sized target.
        height, width = gray_image.shape[:2]
        target_height = max(1, int(800 * height / width))
        resized_image = cv2.resize(gray_image, (800, target_height))

        # Apply adaptive thresholding to binarize the image
        binary_image = cv2.adaptiveThreshold(resized_image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)

        # Use EasyOCR to extract text; entries are (bounding_box, text, confidence)
        result = reader.readtext(binary_image)
    except Exception as e:
        # Best effort: one bad image must not abort the batch, but report why it failed.
        print(f"Failed to process {image_path}: {e}")
        return []

    # Filter results based on a confidence threshold of 0.7
    return [entry[1] for entry in result if entry[2] > 0.7]

# Function to process images in parallel
def process_images(image_paths):
    """Extract text from every path in ``image_paths`` and return one flat list.

    The strings returned by ``extract_text_from_image`` for each image are
    concatenated in the order the paths are given.
    """
    return [
        line
        for path in image_paths
        for line in extract_text_from_image(path)
    ]

# Everything below starts worker processes, so it is guarded: on platforms
# that start workers by re-importing this module, unguarded pool creation
# would be executed again in every child.
if __name__ == "__main__":
    # Get a list of image file paths from the hypothetical sample folder
    # (extension match is case-insensitive)
    image_paths = [os.path.join(image_folder, filename)
                   for filename in sorted(os.listdir(image_folder))
                   if filename.lower().endswith(('.jpg', '.jpeg', '.png'))]

    # Split the image paths into chunks of num_cores paths for parallel processing
    num_cores = multiprocessing.cpu_count()
    image_chunks = [image_paths[i:i + num_cores] for i in range(0, len(image_paths), num_cores)]

    # Initialize a progress counter
    processed_count = 0
    total_images = len(image_paths)

    # Initialize a list to store extracted text
    all_extracted_text = []

    # Process images in parallel. imap yields results in submission order, so
    # each result can be paired with its chunk and progress advances by the
    # number of images actually finished.
    with multiprocessing.Pool(processes=num_cores) as pool:
        for chunk, extracted_text_chunk in zip(image_chunks, pool.imap(process_images, image_chunks)):
            all_extracted_text.extend(extracted_text_chunk)
            processed_count += len(chunk)
            print(f"Processed {processed_count}/{total_images} images")

    # Create a DataFrame to store the extracted text
    df = pd.DataFrame({'Extracted Text': all_extracted_text})

    # Save the DataFrame to a hypothetical CSV file
    df.to_csv('extracted_text.csv', index=False)

    # Analyze/group extracted texts (hypothetical)
    # For example, count and print the most frequent words
    if not df.empty:  # the .str accessor needs actual string values
        word_counts = df['Extracted Text'].str.split().explode().value_counts()
        print("Most frequent words:")
        print(word_counts.head(10))
```

This hypothetical example assumes certain image preprocessing steps, a confidence threshold, and output file names for demonstration purposes. Actual results may require adjustments based on real data and use cases.

Certainly! To save the extracted text as a result in a CSV file, you can modify the code as follows to include text extraction and CSV file writing:

```python
import cv2
import easyocr
import os
import multiprocessing
import pandas as pd
import numpy as np

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en'])

# Path to the folder containing sample images (hypothetical)
image_folder = "sample_images"

# Function to process and extract text from an image
def extract_text_from_image(image_path):
    """Preprocess one image and return its high-confidence OCR text.

    Pipeline: grayscale, 5x5 Gaussian blur, resize to a width of 800 pixels
    (aspect ratio preserved), adaptive mean thresholding, then OCR with the
    module-level EasyOCR ``reader``.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A list of recognized strings whose confidence exceeds 0.7. The list
        is empty when the image cannot be read or processing fails.
    """
    # cv2.imread returns None (it does not raise) for a missing or undecodable file.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Could not read image {image_path}; skipping")
        return []

    # NOTE(review): no orientation correction is done here. Probing
    # reader.model_list[0].module_list for a 'deskew' step relies on
    # attributes that do not appear in EasyOCR's documented Reader API; the
    # resulting error was silently swallowed and every image returned [].
    try:
        # Convert the image to grayscale
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Noise removal using Gaussian blur
        gray_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

        # Resize the image to a fixed width of 800 pixels; clamp the height
        # to at least 1 so very wide images do not produce a zero-sized target.
        height, width = gray_image.shape[:2]
        target_height = max(1, int(800 * height / width))
        resized_image = cv2.resize(gray_image, (800, target_height))

        # Apply adaptive thresholding to binarize the image
        binary_image = cv2.adaptiveThreshold(resized_image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)

        # Use EasyOCR to extract text; entries are (bounding_box, text, confidence)
        result = reader.readtext(binary_image)
    except Exception as e:
        # Best effort: one bad image must not abort the batch, but report why it failed.
        print(f"Failed to process {image_path}: {e}")
        return []

    # Filter results based on a confidence threshold of 0.7
    return [entry[1] for entry in result if entry[2] > 0.7]

# Function to process images in parallel
def process_images(image_paths):
    """Extract text from every path in ``image_paths`` and return one flat list.

    The strings returned by ``extract_text_from_image`` for each image are
    concatenated in the order the paths are given.
    """
    return [
        line
        for path in image_paths
        for line in extract_text_from_image(path)
    ]

# Everything below starts worker processes, so it is guarded: on platforms
# that start workers by re-importing this module, unguarded pool creation
# would be executed again in every child.
if __name__ == "__main__":
    # Get a list of image file paths from the hypothetical sample folder
    # (extension match is case-insensitive)
    image_paths = [os.path.join(image_folder, filename)
                   for filename in sorted(os.listdir(image_folder))
                   if filename.lower().endswith(('.jpg', '.jpeg', '.png'))]

    # Split the image paths into chunks of num_cores paths for parallel processing
    num_cores = multiprocessing.cpu_count()
    image_chunks = [image_paths[i:i + num_cores] for i in range(0, len(image_paths), num_cores)]

    # Initialize a progress counter
    processed_count = 0
    total_images = len(image_paths)

    # Initialize a list to store extracted text
    all_extracted_text = []

    # Process images in parallel. imap yields results in submission order, so
    # each result can be paired with its chunk and progress advances by the
    # number of images actually finished.
    with multiprocessing.Pool(processes=num_cores) as pool:
        for chunk, extracted_text_chunk in zip(image_chunks, pool.imap(process_images, image_chunks)):
            all_extracted_text.extend(extracted_text_chunk)
            processed_count += len(chunk)
            print(f"Processed {processed_count}/{total_images} images")

    # Create a DataFrame to store the extracted text
    df = pd.DataFrame({'Extracted Text': all_extracted_text})

    # Save the DataFrame to a hypothetical CSV file
    output_csv = 'extracted_text.csv'
    df.to_csv(output_csv, index=False)

    # Analyze/group extracted texts (hypothetical)
    # For example, count and print the most frequent words
    if not df.empty:  # the .str accessor needs actual string values
        word_counts = df['Extracted Text'].str.split().explode().value_counts()
        print("Most frequent words:")
        print(word_counts.head(10))

    print(f"Extracted text saved to {output_csv}")
```

This code will save the extracted text as a CSV file named "extracted_text.csv" in the same directory where the script is run. The CSV file will contain a column labeled "Extracted Text" with the extracted text results.