In [None]:
# Install the third-party packages for this notebook into the running kernel's environment.
# NOTE(review): versions are unpinned, so a later re-run may pull different releases — consider pinning.
# NOTE(review): tensorflow, scikit-learn and scipy are not imported by the cells visible here — confirm they are needed.
%pip install tensorflow torch opencv-python pandas scikit-learn scipy

In [None]:
# Install EasyOCR, the OCR engine evaluated in the cells below.
# NOTE(review): version is unpinned — consider pinning for reproducible results.
%pip install easyocr

In [None]:
import easyocr

# Build the single EasyOCR Reader instance that the evaluation cell reuses.
reader = easyocr.Reader(
    ['en'],    # languages to recognise: English only
    gpu=True,  # request GPU execution
)

print("EasyOCR reader loaded successfully with English language model.")

In [None]:
import pandas as pd
import os

# Load the test-split annotations into `test_df`.
# On success `test_df` holds only rows whose ImgName starts with 'test/',
# re-indexed 0..n-1. On failure a message is printed and `test_df` is left
# undefined/unchanged (later cells check for its presence).

# Define the path to the test CSV file
test_csv_file_path = 'extracted_archive/testdata.csv'

# Check if the CSV file exists
if not os.path.exists(test_csv_file_path):
    print(f"Error: The file {test_csv_file_path} was not found.")
else:
    try:
        # header=0 discards the file's own header row (assumes the file has one);
        # names= then supplies consistent column names.
        # ImgName/GroundTruth are forced to str and the default NA-token list is
        # disabled so that word labels such as "NA", "null" or "2000" stay text.
        # Only truly empty fields are read as NaN.
        test_df = pd.read_csv(
            test_csv_file_path,
            sep=',',
            header=0,
            names=['ImgName', 'GroundTruth', 'smallLexi', 'mediumLexi'],
            dtype={'ImgName': str, 'GroundTruth': str},
            keep_default_na=False,
            na_values=[''],
        )

        # --- Filter DataFrame to include only 'test' images ---
        initial_rows_test = len(test_df)
        # na=False: rows with a missing ImgName are dropped instead of producing
        # a NaN in the boolean mask (which pandas refuses to index with).
        is_test_image = test_df['ImgName'].str.startswith('test/', na=False)
        # reset_index: keep labels equal to positions, so later positional
        # lookups (.iloc) and label lookups (.loc) refer to the same rows.
        test_df = test_df[is_test_image].reset_index(drop=True)
        filtered_rows_test = len(test_df)
        print(f"Filtered test data: Kept {filtered_rows_test} rows starting with 'test/' out of {initial_rows_test}.")
        # --- End filtering ---

        # Show the first few rows of the DataFrame
        print("Test CSV data loaded successfully:")
        display(test_df.head())

        # Print DataFrame information
        print("\nTest DataFrame Info:")
        test_df.info()

    except Exception as e:
        # Best-effort: report the problem and let the notebook continue.
        print(f"An error occurred while reading the test CSV file: {e}")

In [None]:
import os
import cv2
import pandas as pd
import numpy as np

# Preprocess every test image listed in test_df.
# Relies on names created by earlier cells: the helpers grayscale_image,
# resize_image and normalize_pixels, and the DataFrame test_df.
# Produces processed_test_images (NumPy array) and original_test_labels
# (pandas Series), which stay aligned one-to-one because both are appended
# together and unusable rows are skipped for both.

# Accumulators: one preprocessed image and one ground-truth label per usable row
processed_test_images = []
original_test_labels = []

# Root folder that the relative ImgName paths are resolved against
base_image_dir = 'extracted_archive/IIIT5K-Word_V3.0/IIIT5K'

# Resize target, expressed as (width, height)
target_size = (128, 32)

if 'test_df' not in locals() or test_df.empty:
    print("DataFrame 'test_df' not found or is empty. Please run the cell to load the test CSV first.")
else:
    for index, row in test_df.iterrows():
        image_path_relative = row['ImgName']
        text_label = row['GroundTruth']
        full_image_path = os.path.join(base_image_dir, image_path_relative)

        # A row whose file is absent is reported and skipped
        if not os.path.exists(full_image_path):
            print(f"Warning: Image file not found: {full_image_path}")
            continue

        # cv2.imread signals an unreadable file by returning None
        img = cv2.imread(full_image_path)
        if img is None:
            print(f"Warning: Could not load image file: {full_image_path}")
            continue

        # Pipeline: grayscale -> resize to target_size -> normalize pixel values
        gray_img = grayscale_image(img)
        resized_img = resize_image(gray_img, target_size)
        normalized_img = normalize_pixels(resized_img)

        # Image and label are stored together so the two collections stay aligned
        processed_test_images.append(normalized_img)
        original_test_labels.append(text_label)

    # Freeze the accumulators into their final container types
    processed_test_images = np.array(processed_test_images)
    original_test_labels = pd.Series(original_test_labels)

    # Report sizes so the result can be sanity-checked
    print("Preprocessing of test images complete.")
    print("Shape of processed_test_images array:", processed_test_images.shape)
    print("Length of original_test_labels list:", len(original_test_labels))

In [None]:
import nltk
from nltk.metrics.distance import edit_distance

# Download the NLTK resources requested by this notebook.
# nltk.download() reports a failed download by returning False rather than
# raising, so each return value is checked before claiming success.
# NOTE(review): the CER/WER functions below only call edit_distance; confirm
# whether these two resources are actually needed by any cell.
nltk_resources = ('averaged_perceptron_tagger', 'punkt')
try:
    failed_nltk_resources = [
        resource for resource in nltk_resources
        if not nltk.download(resource, quiet=True)
    ]
except Exception as e:
    print(f"Error downloading NLTK data: {e}")
else:
    if failed_nltk_resources:
        print(f"Error downloading NLTK data: could not download {failed_nltk_resources}")
    else:
        print("NLTK data ('averaged_perceptron_tagger', 'punkt') downloaded successfully.")


def calculate_cer(ground_truth, prediction):
    """
    Compute the Character Error Rate (CER) of a prediction.

    CER is the character-level edit distance between the two strings,
    divided by the number of characters in the ground truth.

    Args:
        ground_truth: The reference string.
        prediction: The hypothesis string to score.

    Returns:
        The CER as a float. An empty ground truth yields 0.0 (this avoids a
        division by zero), regardless of the prediction.
    """
    reference_length = len(ground_truth)

    # Nothing to normalise by: report zero error by convention
    if reference_length == 0:
        return 0.0

    # Character-level edit distance, normalised by the reference length
    return edit_distance(ground_truth, prediction) / reference_length

def calculate_wer(ground_truth, prediction):
    """
    Compute the Word Error Rate (WER) of a prediction.

    Both strings are tokenised with str.split() (whitespace-separated words).
    WER is the word-level edit distance divided by the number of words in
    the ground truth.

    Args:
        ground_truth: The reference string.
        prediction: The hypothesis string to score.

    Returns:
        The WER as a float. If the ground truth contains no words, 0.0 is
        returned regardless of the prediction, mirroring calculate_cer's
        handling of an empty reference.
    """
    reference_tokens = ground_truth.split()
    hypothesis_tokens = prediction.split()
    reference_count = len(reference_tokens)

    # No reference words means nothing to normalise by: report zero error
    if reference_count == 0:
        return 0.0

    # edit_distance accepts token lists, giving a word-level distance
    return edit_distance(reference_tokens, hypothesis_tokens) / reference_count

print("Character Error Rate (CER) and Word Error Rate (WER) calculation functions defined.")

In [None]:
import easyocr
import numpy as np
import pandas as pd
import random
import cv2
from google.colab.patches import cv2_imshow

# Evaluate EasyOCR on the preprocessed test images, then visualise predictions
# on a few random original images.
# Inputs expected from earlier cells:
#   reader                        - EasyOCR Reader instance
#   test_df                       - DataFrame loaded from testdata.csv
#   processed_test_images         - array of preprocessed test images
#   original_test_labels          - labels collected alongside processed_test_images
#   calculate_cer / calculate_wer - metric functions
# Outputs: all_easyocr_cer, all_easyocr_wer, average_easyocr_cer, average_easyocr_wer.
import os  # os.path.join is used in the visualization loop; imported here so this cell does not rely on another cell's import

# Check if necessary variables are available
if (
    'reader' in locals()
    and 'test_df' in locals() and not test_df.empty
    and 'processed_test_images' in locals() and processed_test_images.size > 0
    and 'original_test_labels' in locals()
):

    print("Starting text recognition and evaluation using EasyOCR...")

    total_samples = processed_test_images.shape[0]

    # Ground truth must come from original_test_labels, not from test_df:
    # the preprocessing cell skips rows whose image is missing or unreadable,
    # so image i is not necessarily row i of test_df.
    evaluation_labels = list(original_test_labels)
    if len(evaluation_labels) != total_samples:
        raise ValueError(
            f"Label/image count mismatch: {len(evaluation_labels)} labels vs "
            f"{total_samples} preprocessed images. Re-run the preprocessing cell."
        )

    # Per-sample metrics
    all_easyocr_cer = []
    all_easyocr_wer = []

    print(f"Processing {total_samples} test samples with EasyOCR...")

    # Iterate through the preprocessed test images
    for i in range(total_samples):
        image_for_easyocr = processed_test_images[i]

        # Convert back to uint8 for EasyOCR (assumes the preprocessed images
        # are floats in [0, 1] — TODO confirm against normalize_pixels).
        # Round and clip instead of truncating, so float error such as
        # 253.99998 does not become 253.
        image_for_easyocr_uint8 = np.clip(np.rint(image_for_easyocr * 255.0), 0, 255).astype(np.uint8)

        # detail=0 returns only the recognised strings, one per detected region
        easyocr_results = reader.readtext(image_for_easyocr_uint8, detail=0)

        # Join the text of all detected regions with a space
        recognized_text = " ".join(easyocr_results)

        ground_truth_label = evaluation_labels[i]

        # Case-insensitive evaluation; str() guards against labels that
        # pandas parsed as numbers
        ground_truth_lower = str(ground_truth_label).lower()
        recognized_text_lower = recognized_text.lower()

        # Calculate CER and WER
        cer = calculate_cer(ground_truth_lower, recognized_text_lower)
        wer = calculate_wer(ground_truth_lower, recognized_text_lower)

        all_easyocr_cer.append(cer)
        all_easyocr_wer.append(wer)

        if (i + 1) % 100 == 0:
            print(f"Processed {i + 1}/{total_samples} samples with EasyOCR.")

    # Calculate the average CER and WER
    average_easyocr_cer = np.mean(all_easyocr_cer)
    average_easyocr_wer = np.mean(all_easyocr_wer)

    # Print the results
    print("\n--- EasyOCR Evaluation Results ---")
    print(f"Average Character Error Rate (CER): {average_easyocr_cer:.4f}")
    print(f"Average Word Error Rate (WER): {average_easyocr_wer:.4f}")

    print("EasyOCR evaluation completed.")

    # --- Visualize Predictions on Random Samples using EasyOCR ---
    # test_df is already known to exist and be non-empty (checked above).
    # Sample index *labels*; they are looked up with .loc below.
    all_indices = list(test_df.index)

    # Number of samples to visualize
    num_samples_to_visualize = 10 # You can change this number

    # Select random indices for visualization
    if len(all_indices) >= num_samples_to_visualize:
        sample_indices = random.sample(all_indices, num_samples_to_visualize)
    else:
        # If not enough samples, visualize all available
        sample_indices = all_indices
        num_samples_to_visualize = len(all_indices) # Update the number to visualize

    print(f"\nVisualizing EasyOCR predictions for {num_samples_to_visualize} random samples from the test set.")
    print("-" * 30)

    # The original (un-preprocessed) images are loaded for display with bounding boxes
    base_image_dir = 'extracted_archive/IIIT5K-Word_V3.0/IIIT5K' # Base directory for original images

    for i in sample_indices:
        # i is an index label, so use .loc; .iloc would treat it as a
        # position and pick the wrong row (or fail) on a filtered frame.
        row = test_df.loc[i]
        image_path_relative = row['ImgName']
        ground_truth_label = row['GroundTruth']

        # Construct the full image path to load the original image
        full_image_path = os.path.join(base_image_dir, image_path_relative)

        # Load the original image (cv2.imread returns None on failure)
        original_image = cv2.imread(full_image_path)

        if original_image is not None:
            # detail=1 yields (bbox, text, confidence) per detected region
            easyocr_results_detail = reader.readtext(original_image, detail=1)

            # Collect recognised text and draw each region's outline on a copy
            recognized_texts_for_viz = []
            img_with_boxes = original_image.copy()

            for (bbox, text, prob) in easyocr_results_detail:
                recognized_texts_for_viz.append(text)
                # bbox is a list of 4 corner points; polylines needs int32
                points = np.array(bbox, dtype=np.int32)
                cv2.polylines(img_with_boxes, [points], isClosed=True, color=(0, 255, 0), thickness=2)

            # Combine recognized text for display
            combined_recognized_text_for_viz = " ".join(recognized_texts_for_viz)

            # --- Display Results ---
            print(f"Sample Index: {i}")
            print(f"Ground Truth: {ground_truth_label}")
            print(f"EasyOCR Prediction: {combined_recognized_text_for_viz}")

            # Resize to a fixed display width, preserving aspect ratio.
            # max(1, ...) keeps the height valid for very wide images.
            display_width = 300
            display_height = max(1, int(img_with_boxes.shape[0] * (display_width / img_with_boxes.shape[1])))
            display_img_with_boxes = cv2.resize(img_with_boxes, (display_width, display_height))
            cv2_imshow(display_img_with_boxes)

        else:
            print(f"Warning: Could not load image file for visualization: {full_image_path}")

        print("-" * 30)

    print("EasyOCR visualization complete.")

else:
    print("Necessary variables for EasyOCR evaluation not found. Please run previous cells to load data, preprocess, and load the EasyOCR reader.")