In [None]:
%pip install tensorflow torch opencv-python pandas scikit-learn scipy

In [None]:
import pandas as pd
import os


csv_file_path = 'extracted_archive/traindata.csv'

# Load the annotation CSV into `df` and keep only the rows that point at
# training images. Nothing is raised: every failure is reported with a print,
# and in that case `df` is left undefined for the cells below.
if not os.path.exists(csv_file_path):
    print(f"Error: The file {csv_file_path} was not found.")
else:
    try:
        # header=0 consumes the file's first row as a header, and `names`
        # replaces whatever labels that row carried with these four.
        df = pd.read_csv(csv_file_path, sep=',', header=0, names=['ImgName', 'GroundTruth', 'smallLexi', 'mediumLexi'])

        # --- Filter DataFrame to include only 'train' images ---
        initial_rows = len(df)
        # na=False: a row with a missing ImgName would otherwise yield NaN in
        # the mask, and boolean indexing with NaN raises instead of dropping it.
        df = df[df['ImgName'].str.startswith('train/', na=False)]
        filtered_rows = len(df)
        print(f"Filtered training data: Kept {filtered_rows} rows starting with 'train/' out of {initial_rows}.")
        # --- End filtering ---

        # Show the first rows as a quick sanity check of the parse.
        print("CSV data loaded successfully:")
        display(df.head())

        # Image paths are in df['ImgName'], text labels in df['GroundTruth'].

    except Exception as e:
        print(f"An error occurred while reading the CSV file: {e}")

In [None]:
import cv2

def resize_image(image, target_size):
    """
    Return a copy of ``image`` resampled to ``target_size`` with cv2.resize.

    Args:
        image: The input image (NumPy array).
        target_size: Output size as a (width, height) tuple, the order
            cv2.resize expects for its size argument.

    Returns:
        The resized image (NumPy array).
    """
    resized = cv2.resize(image, target_size)
    return resized

def normalize_pixels(image):
    """
    Rescale pixel values by 1/255 so that 8-bit intensities land in [0, 1].

    Args:
        image: The input image (NumPy array).

    Returns:
        A float32 NumPy array holding ``image / 255``.
    """
    # Cast first so the division happens in floating point, not integer space.
    as_float = image.astype('float32')
    return as_float / 255.0

def grayscale_image(image):
    """
    Collapse a BGR image to a single grayscale channel.

    Args:
        image: The input image (NumPy array).

    Returns:
        ``image`` itself (same object, no copy) when it is already 2-D,
        otherwise the result of a BGR-to-gray colour conversion.
    """
    already_single_channel = len(image.shape) == 2
    return image if already_single_channel else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

print("Preprocessing functions defined.")


In [None]:
import numpy as np
import cv2
import random

def augment_image_cv2(image):
    """
    Applies a random chain of OpenCV augmentations to an 8-bit image.

    Each step fires independently of the others: rotation, translation,
    scaling and shear with probability 0.5; Gaussian blur, Gaussian noise,
    brightness/contrast and grayscale conversion with probability 0.25.
    All geometric steps keep the original (width, height) and fill exposed
    borders by replicating edge pixels.

    Args:
        image: The input image (NumPy array). The noise step and the final
            cast treat values as being in the uint8 range [0, 255].

    Returns:
        The augmented image as a uint8 NumPy array. A 3-channel input comes
        back 2-D whenever the random grayscale step fires.
    """
    augmented_image = image.copy()
    h, w = augmented_image.shape[:2]

    # Random Rotation: up to +/-5 degrees about the image centre
    if random.random() < 0.5:
        angle = random.uniform(-5, 5)
        M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Translation: up to 5% of each dimension
    if random.random() < 0.5:
        tx = random.uniform(-0.05, 0.05) * w
        ty = random.uniform(-0.05, 0.05) * h
        M = np.float32([[1, 0, tx], [0, 1, ty]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Scaling: independent x/y factors, anchored at the top-left corner
    if random.random() < 0.5:
        sx = random.uniform(0.95, 1.05)
        sy = random.uniform(0.95, 1.05)
        M = np.float32([[sx, 0, 0], [0, sy, 0]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Shear (horizontal only)
    if random.random() < 0.5:
        shear_factor = random.uniform(-0.05, 0.05)
        M = np.float32([[1, shear_factor, 0], [0, 1, 0]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Gaussian Blur
    if random.random() < 0.25:
        sigma = random.uniform(0, 1.0)
        ksize = int(2 * int(sigma * 3 + 0.5) + 1)  # roughly +/-3 sigma, always odd
        ksize = ksize if ksize % 2 == 1 else ksize + 1
        if ksize > 1:  # a 1x1 kernel would be a no-op
            augmented_image = cv2.GaussianBlur(augmented_image, (ksize, ksize), sigma)

    # Random Gaussian Noise
    if random.random() < 0.25:
        mean = 0
        stddev = random.uniform(0, 0.05 * 255)
        noise = np.random.normal(mean, stddev, augmented_image.shape)
        # Add in floating point and clip afterwards. Casting the zero-mean
        # noise itself to uint8 turns every negative sample into a large
        # positive value, which then saturates the pixel to white.
        noisy = augmented_image.astype(np.float32) + noise
        augmented_image = np.clip(np.rint(noisy), 0, 255).astype(np.uint8)

    # Random Brightness and Contrast
    if random.random() < 0.25:
        alpha = random.uniform(0.9, 1.1)  # contrast gain (1.0 = unchanged)
        beta = random.uniform(-10, 10)  # brightness offset in intensity levels
        augmented_image = cv2.convertScaleAbs(augmented_image, alpha=alpha, beta=beta)

    # Random Grayscale (only if image is not already grayscale)
    if random.random() < 0.25 and len(augmented_image.shape) == 3:
        augmented_image = cv2.cvtColor(augmented_image, cv2.COLOR_BGR2GRAY)

    return augmented_image.astype(np.uint8)

print("Data augmentation function defined using OpenCV.")

In [None]:
import numpy as np
import cv2
import random

def augment_image_cv2(image):
    """
    Applies a random chain of OpenCV augmentations to an 8-bit image.

    NOTE(review): this cell repeats the definition from the previous cell
    and shadows it once run.

    Each step fires independently of the others: rotation, translation,
    scaling and shear with probability 0.5; Gaussian blur, Gaussian noise,
    brightness/contrast and grayscale conversion with probability 0.25.
    All geometric steps keep the original (width, height) and fill exposed
    borders by replicating edge pixels.

    Args:
        image: The input image (NumPy array). The noise step and the final
            cast treat values as being in the uint8 range [0, 255].

    Returns:
        The augmented image as a uint8 NumPy array. A 3-channel input comes
        back 2-D whenever the random grayscale step fires.
    """
    augmented_image = image.copy()
    h, w = augmented_image.shape[:2]

    # Random Rotation: up to +/-5 degrees about the image centre
    if random.random() < 0.5:
        angle = random.uniform(-5, 5)
        M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Translation: up to 5% of each dimension
    if random.random() < 0.5:
        tx = random.uniform(-0.05, 0.05) * w
        ty = random.uniform(-0.05, 0.05) * h
        M = np.float32([[1, 0, tx], [0, 1, ty]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Scaling: independent x/y factors, anchored at the top-left corner
    if random.random() < 0.5:
        sx = random.uniform(0.95, 1.05)
        sy = random.uniform(0.95, 1.05)
        M = np.float32([[sx, 0, 0], [0, sy, 0]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Shear (horizontal only)
    if random.random() < 0.5:
        shear_factor = random.uniform(-0.05, 0.05)
        M = np.float32([[1, shear_factor, 0], [0, 1, 0]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Gaussian Blur
    if random.random() < 0.25:
        sigma = random.uniform(0, 1.0)
        ksize = int(2 * int(sigma * 3 + 0.5) + 1)  # roughly +/-3 sigma, always odd
        ksize = ksize if ksize % 2 == 1 else ksize + 1
        if ksize > 1:  # a 1x1 kernel would be a no-op
            augmented_image = cv2.GaussianBlur(augmented_image, (ksize, ksize), sigma)

    # Random Gaussian Noise
    if random.random() < 0.25:
        mean = 0
        stddev = random.uniform(0, 0.05 * 255)
        noise = np.random.normal(mean, stddev, augmented_image.shape)
        # Add in floating point and clip afterwards. Casting the zero-mean
        # noise itself to uint8 turns every negative sample into a large
        # positive value, which then saturates the pixel to white.
        noisy = augmented_image.astype(np.float32) + noise
        augmented_image = np.clip(np.rint(noisy), 0, 255).astype(np.uint8)

    # Random Brightness and Contrast
    if random.random() < 0.25:
        alpha = random.uniform(0.9, 1.1)  # contrast gain (1.0 = unchanged)
        beta = random.uniform(-10, 10)  # brightness offset in intensity levels
        augmented_image = cv2.convertScaleAbs(augmented_image, alpha=alpha, beta=beta)

    # Random Grayscale (only if image is not already grayscale)
    if random.random() < 0.25 and len(augmented_image.shape) == 3:
        augmented_image = cv2.cvtColor(augmented_image, cv2.COLOR_BGR2GRAY)

    return augmented_image.astype(np.uint8)

print("Data augmentation function defined using OpenCV.")

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import random

# Float-image variant of augment_image_cv2; once this cell runs it replaces
# the uint8 definitions from the earlier cells.
def augment_image_cv2(image):
    """
    Applies a random chain of augmentations to a [0, 1]-normalised image.

    Each step fires independently of the others: rotation, translation,
    scaling and shear with probability 0.5; Gaussian blur, Gaussian noise,
    brightness/contrast and grayscale conversion with probability 0.25.
    All geometric steps keep the original (width, height) and fill exposed
    borders by replicating edge pixels.

    Args:
        image: The input image (NumPy array) with pixel values in [0, 1].

    Returns:
        The augmented image (float32 NumPy array). The noise and
        brightness/contrast steps clip their result to [0, 1].
    """
    # Work on a float32 copy so the caller's array is never modified.
    augmented_image = image.copy().astype(np.float32)
    h, w = augmented_image.shape[:2]

    # Random Rotation: up to +/-5 degrees about the image centre
    if random.random() < 0.5:
        angle = random.uniform(-5, 5)
        M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Translation: up to 5% of each dimension
    if random.random() < 0.5:
        tx = random.uniform(-0.05, 0.05) * w
        ty = random.uniform(-0.05, 0.05) * h
        M = np.float32([[1, 0, tx], [0, 1, ty]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Scaling: independent x/y factors, anchored at the top-left corner
    if random.random() < 0.5:
        sx = random.uniform(0.95, 1.05)
        sy = random.uniform(0.95, 1.05)
        M = np.float32([[sx, 0, 0], [0, sy, 0]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Shear (horizontal only)
    if random.random() < 0.5:
        shear_factor = random.uniform(-0.05, 0.05)
        M = np.float32([[1, shear_factor, 0], [0, 1, 0]])
        augmented_image = cv2.warpAffine(augmented_image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

    # Random Gaussian Blur
    if random.random() < 0.25:
        sigma = random.uniform(0, 1.0)
        ksize = int(2 * int(sigma * 3 + 0.5) + 1)  # roughly +/-3 sigma, always odd
        ksize = ksize if ksize % 2 == 1 else ksize + 1
        if ksize > 1:  # a 1x1 kernel would be a no-op
            augmented_image = cv2.GaussianBlur(augmented_image, (ksize, ksize), sigma)

    # Random Gaussian Noise
    if random.random() < 0.25:
        mean = 0
        stddev = random.uniform(0, 0.05)  # relative to the [0, 1] range
        noise = np.random.normal(mean, stddev, augmented_image.shape).astype(np.float32)
        augmented_image = cv2.add(augmented_image, noise)
        augmented_image = np.clip(augmented_image, 0.0, 1.0)

    # Random Brightness and Contrast
    if random.random() < 0.25:
        alpha = random.uniform(0.9, 1.1)  # contrast gain (1.0 = unchanged)
        beta = random.uniform(-0.1, 0.1)  # brightness offset in [0, 1] units
        # Stay in float: cv2.convertScaleAbs saturates to uint8, so on a
        # [0, 1] image with beta pre-multiplied by 255 the result was just the
        # (absolute) offset and the picture content was wiped out.
        adjusted = augmented_image * alpha + beta
        augmented_image = np.clip(adjusted, 0.0, 1.0).astype(np.float32)

    # Random Grayscale (only if image is not already grayscale)
    if random.random() < 0.25 and len(augmented_image.shape) == 3:
        augmented_image = cv2.cvtColor(augmented_image, cv2.COLOR_BGR2GRAY)

    return augmented_image

# Accumulators for the preprocessed images and their labels
processed_images = []
processed_labels = []

# Root folder that the ImgName column is relative to
base_image_dir = 'extracted_archive/IIIT5K-Word_V3.0/IIIT5K'

# Bail out early with a hint when the CSV cell has not produced a usable df
if 'df' not in locals() or df.empty:
    print("DataFrame 'df' not found or is empty. Please run the cell to load the CSV first.")
else:
    # Walk the annotation table one row at a time
    for index, row in df.iterrows():
        image_path_relative = row['ImgName']
        text_label = row['GroundTruth']

        full_image_path = os.path.join(base_image_dir, image_path_relative)

        # Rows whose file is missing are reported and skipped
        if not os.path.exists(full_image_path):
            print(f"Image file not found: {full_image_path}")
            continue

        # cv2.imread signals an unreadable file by returning None
        img = cv2.imread(full_image_path)
        if img is None:
            print(f"Could not load image file: {full_image_path}")
            continue

        # grayscale -> 128x32 (width, height) -> float32 scaled by 1/255
        gray_img = grayscale_image(img)
        resized_img = resize_image(gray_img, (128, 32))
        normalized_img = normalize_pixels(resized_img)

        # Random augmentation on the normalised image
        augmented_img = augment_image_cv2(normalized_img)

        # Image and label are appended together so the two lists stay aligned
        processed_images.append(augmented_img)
        processed_labels.append(text_label)

    # Stack the images into one array and wrap the labels in a Series
    processed_images = np.array(processed_images)
    processed_labels = pd.Series(processed_labels)

    # Report what was collected
    print("Shape of processed_images array:", processed_images.shape)
    print("Length of processed_labels list:", len(processed_labels))

In [None]:
import os
import cv2
import pandas as pd
import numpy as np

# Assume resize_image, normalize_pixels, grayscale_image, and augment_image_cv2 are already defined

def load_and_preprocess_image(image_path_relative, label, base_image_dir='extracted_archive/IIIT5K-Word_V3.0/IIIT5K'):
    """
    Read one image from disk and run it through preprocessing and augmentation.

    The pipeline is grayscale -> resize to (128, 32) as (width, height) ->
    normalise -> augment_image_cv2. Failures are reported with a print rather
    than raised.

    Args:
        image_path_relative: Path of the image file relative to base_image_dir.
        label: The text label for the image; passed through unchanged.
        base_image_dir: Directory that image_path_relative is joined onto.

    Returns:
        (augmented_image, label) on success, or (None, label) when the file
        does not exist or OpenCV cannot decode it.
    """
    full_image_path = os.path.join(base_image_dir, image_path_relative)

    # Guard 1: the file has to be present on disk
    if not os.path.exists(full_image_path):
        print(f"Image file not found: {full_image_path}")
        return None, label

    # Guard 2: cv2.imread returns None for files it cannot decode
    img = cv2.imread(full_image_path)
    if img is None:
        print(f"Could not load image file: {full_image_path}")
        return None, label

    # Preprocess, then augment the normalised (float32) image
    normalized_img = normalize_pixels(resize_image(grayscale_image(img), (128, 32)))
    return augment_image_cv2(normalized_img), label

# Smoke-test load_and_preprocess_image on the first few rows of df
if 'df' not in locals() or df.empty:
    print("DataFrame 'df' not found or is empty. Please run the cell to load the CSV first.")
else:
    subset_df = df.head(5)  # five rows are enough for a demonstration

    processed_data = []

    for index, row in subset_df.iterrows():
        image_path_relative = row['ImgName']
        text_label = row['GroundTruth']

        processed_image, original_label = load_and_preprocess_image(image_path_relative, text_label)

        # Rows whose image could not be loaded come back as None and are dropped
        if processed_image is None:
            continue
        processed_data.append({'processed_image': processed_image, 'text_label': original_label})

    print(f"Processed {len(processed_data)} images from the subset.")

    # Show one shape as a sanity check, if anything was loaded at all
    if len(processed_data) > 0:
        print("Shape of the first processed image:", processed_data[0]['processed_image'].shape)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers # Import regularizers

def build_cnn_backbone(input_shape):
    """
    Assemble the convolutional feature extractor that feeds the RNN head.

    The stack is Conv/MaxPool/Dropout stages with 64, 128, 256 and 512
    filters, followed by an extra 512-filter 3x3 convolution and a final
    2x2 'valid' convolution. The first two pools are (2, 2); the last two are
    (2, 1), so they only shrink the first spatial axis.

    Args:
        input_shape: The shape of the input images (height, width, channels).

    Returns:
        A keras.Sequential model whose output is a feature map (there are no
        Flatten or Dense layers).
    """
    stack = [
        # Stage 1: 64 filters
        layers.Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.2),
        # Stage 2: 128 filters
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.2),
        # Stage 3: two 256-filter convolutions, pooling along the first axis only
        layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
        layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 1)),
        layers.Dropout(0.25),
        # Stage 4: two 512-filter convolutions, pooling along the first axis only
        layers.Conv2D(512, (3, 3), activation='relu', padding='same'),
        layers.Conv2D(512, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 1)),
        layers.Dropout(0.25),
        # Extra 512-filter convolution with its own dropout
        layers.Conv2D(512, (3, 3), activation='relu', padding='same'),
        layers.Dropout(0.25),
        # Final 2x2 convolution without padding
        layers.Conv2D(512, (2, 2), activation='relu', padding='valid'),
        layers.Dropout(0.3),
    ]
    return keras.Sequential(stack)

def build_rnn_head(cnn_output_shape, num_classes):
    """
    Builds the RNN head that turns a CNN feature map into per-step character
    probabilities.

    The feature map is reshaped to a sequence, passed through three
    bidirectional LSTM layers (512 units each, recurrent_dropout 0.15), each
    followed by Dropout(0.25), and finished with a softmax Dense layer.

    Args:
        cnn_output_shape: The shape of the output feature map from the CNN
            (height, width, channels), without the batch dimension.
        num_classes: The number of possible characters (including blank).

    Returns:
        A Keras Model mapping (batch, height, width, channels) to
        (batch, width, num_classes).
    """
    input_tensor = layers.Input(shape=cnn_output_shape)

    # Time steps are the feature-map width; each step's feature vector has
    # height * channels entries.
    height, width, channels = cnn_output_shape
    reshape_layer = layers.Reshape(target_shape=(width, height * channels))(input_tensor)

    rnn_layer_1 = layers.Bidirectional(layers.LSTM(512, return_sequences=True, recurrent_dropout=0.15))(reshape_layer)
    rnn_layer_1 = layers.Dropout(0.25)(rnn_layer_1)

    rnn_layer_2 = layers.Bidirectional(layers.LSTM(512, return_sequences=True, recurrent_dropout=0.15))(rnn_layer_1)
    rnn_layer_2 = layers.Dropout(0.25)(rnn_layer_2)

    rnn_layer_3 = layers.Bidirectional(layers.LSTM(512, return_sequences=True, recurrent_dropout=0.15))(rnn_layer_2)
    rnn_layer_3 = layers.Dropout(0.25)(rnn_layer_3)

    # Per-time-step character distribution
    output_layer = layers.Dense(num_classes, activation='softmax')(rnn_layer_3)

    # keras.Model rather than bare Model: this cell imports only keras,
    # layers and regularizers, so the bare name was undefined here.
    rnn_model = keras.Model(inputs=input_tensor, outputs=output_layer)

    return rnn_model

print("CNN backbone and RNN head functions redefined with increased capacity and reduced Dropout.")

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import backend as K
import tensorflow as tf # Import tensorflow to print tensor shape

def build_rnn_head(reshaped_cnn_output_shape, num_classes):
    """
    Build the recurrent head that consumes an already-reshaped CNN sequence.

    Two bidirectional LSTM layers (256 units, recurrent_dropout 0.2), each
    followed by Dropout(0.3), feed a softmax Dense layer.

    Args:
        reshaped_cnn_output_shape: The shape of the reshaped CNN output
            (time_steps, features), without the batch dimension.
        num_classes: The number of possible characters (including blank).

    Returns:
        A Keras Model producing a num_classes-way softmax per time step.
    """
    input_tensor = layers.Input(shape=reshaped_cnn_output_shape)

    # Kept as a visible trace of the shape handed to the recurrent layers
    print("Input shape to RNN head:", input_tensor.shape)

    # Two identical BiLSTM + Dropout stages, chained one after the other
    sequence = input_tensor
    for _ in range(2):
        recurrent = layers.Bidirectional(layers.LSTM(256, return_sequences=True, recurrent_dropout=0.2))
        sequence = layers.Dropout(0.3)(recurrent(sequence))

    # Character prediction for every time step
    output_layer = layers.Dense(num_classes, activation='softmax')(sequence)

    return Model(inputs=input_tensor, outputs=output_layer)

# Assemble the end-to-end OCR model: image input -> CNN backbone -> reshape to
# a sequence -> RNN head. Uses build_cnn_backbone from the previous cell and
# the build_rnn_head defined just above in this cell.

# 1. Define the input shape for the overall model
# Assuming preprocessed images are grayscale (1 channel), height 32, width 128
input_shape = (32, 128, 1)

# Define the number of classes (characters + blank)
# NOTE(review): char_to_int is created by the data-preparation cell, which
# sits BELOW this one in the notebook. On a fresh top-to-bottom run the
# NameError branch is taken and the hard-coded 70 is used; confirm that value
# matches the vocabulary size, or run data preparation first.
try:
    num_classes = len(char_to_int) # Use the actual number of classes from the character set
    print(f"Using num_classes from data preparation: {num_classes}")
except NameError:
    print("Warning: 'char_to_int' not found. Using a default num_classes = 70. Please run data preparation cell first.")
    num_classes = 70 # Default value if char_to_int is not defined


# 2. Build the convolutional feature extractor
cnn_backbone = build_cnn_backbone(input_shape)

# 3. Define the combined model using the Functional API
input_layer = keras.Input(shape=input_shape, name='image_input')

# Pass the input through the CNN backbone
cnn_output = cnn_backbone(input_layer)

# Static shape of the CNN output; the first entry is the batch dimension
# and is discarded.
_, height, width, channels = cnn_output.shape

# Reshape the CNN output for the RNN
# The time steps will be the width of the feature map, and features will be height * channels
reshape_output = layers.Reshape(target_shape=(width, height * channels))(cnn_output)

# Shape of the reshaped tensor without the batch dimension: (time_steps, features)
reshaped_output_shape = reshape_output.shape[1:]

# The RNN head is built as its own Model with a matching input shape...
rnn_head = build_rnn_head(reshaped_output_shape, num_classes) # Pass the correct shape

# ...and then applied to the reshaped CNN features like a layer
final_output = rnn_head(reshape_output)


# Define the final combined model
combined_model = keras.Model(inputs=input_layer, outputs=final_output, name='ocr_model_regularized')

print("Combined CNN-RNN model recreated with corrected RNN input shape and regularization.")
combined_model.summary()

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import string
import pandas as pd # Assuming df is available from previous steps
from sklearn.model_selection import train_test_split # Import train_test_split

# Data preparation: build the character vocabulary, preprocess every training
# image, encode and pad the labels, split into train/validation sets and wrap
# both in batched tf.data Datasets.
# Relies on names from earlier cells: df, os, cv2, grayscale_image,
# resize_image, normalize_pixels and augment_image_cv2.
if 'df' in locals() and not df.empty:
    # 1. Character set (vocabulary) from the lower-cased training labels.
    # dropna()/astype(str): a missing GroundTruth would otherwise put a float
    # NaN into the list and make "".join raise TypeError.
    all_ground_truth = "".join(df['GroundTruth'].dropna().astype(str).str.lower().tolist())
    characters = sorted(set(all_ground_truth))

    # '-' doubles as the CTC blank, the padding value and the stand-in for
    # unknown characters. It is appended last unless the labels already use it.
    # NOTE(review): if '-' does occur in the labels it is not the last index
    # and real hyphens become indistinguishable from blank.
    blank_token = '-'
    if blank_token not in characters:
        characters.append(blank_token)

    # Character <-> integer lookup tables
    char_to_int = {char: i for i, char in enumerate(characters)}
    int_to_char = {i: char for char, i in char_to_int.items()}

    # Number of classes (including the blank token)
    num_classes = len(characters)
    print(f"Number of classes (including blank): {num_classes}")
    print(f"Character set: {''.join(characters)}")

    # 2. Encode text labels into sequences of integers and store original lengths
    def encode_text_labels_with_lengths(labels, char_to_int):
        """
        Encodes a list of text labels into sequences of integers and returns original lengths.

        Labels are lower-cased first; characters missing from char_to_int are
        mapped to the blank token's index. Non-string labels (e.g. NaN) are
        reported with a warning and encoded as an empty sequence of length 0.

        Args:
            labels: A list or pandas Series of text labels.
            char_to_int: A dictionary mapping characters to integers.

        Returns:
            A tuple containing:
                - encoded_labels: A list of integer sequences.
                - original_lengths: A list of original label lengths.
        """
        encoded_labels = []
        original_lengths = []
        for label in labels:
            if isinstance(label, str):
                encoded_sequence = [char_to_int.get(char, char_to_int[blank_token]) for char in label.lower()]
                encoded_labels.append(encoded_sequence)
                original_lengths.append(len(encoded_sequence))
            else:
                print(f"Warning: Skipping non-string label: {label}")
                encoded_labels.append([])
                original_lengths.append(0)

        return encoded_labels, original_lengths

    # --- Re-generating processed_images and processed_labels from filtered df ---
    base_image_dir = 'extracted_archive/IIIT5K-Word_V3.0/IIIT5K'
    target_size = (128, 32) # (width, height)

    temp_processed_images = []
    temp_processed_labels = []

    print("Re-processing images and labels from filtered training data...")
    for index, row in df.iterrows(): # Use the filtered df
        image_path_relative = row['ImgName']
        text_label = row['GroundTruth']
        full_image_path = os.path.join(base_image_dir, image_path_relative)

        if os.path.exists(full_image_path):
            img = cv2.imread(full_image_path)
            if img is not None:
                gray_img = grayscale_image(img)
                resized_img = resize_image(gray_img, target_size)
                normalized_img = normalize_pixels(resized_img) # float32

                # No augmentation at this point: it is applied after the
                # train/validation split, to the training images only, so the
                # validation set measures performance on unmodified images.
                temp_processed_images.append(np.expand_dims(normalized_img, axis=-1)) # Add channel dimension
                temp_processed_labels.append(text_label)
            else:
                print(f"Warning: Could not load image file during re-processing: {full_image_path}")
        else:
            print(f"Warning: Image file not found during re-processing: {full_image_path}")

    # processed_images now holds the clean (un-augmented) normalised images.
    processed_images = np.array(temp_processed_images)
    processed_labels = pd.Series(temp_processed_labels) # Keep as Series for consistency
    print("Re-processing complete.")
    print("Shape of re-processed_images array:", processed_images.shape)
    print("Length of re-processed_labels list:", len(processed_labels))
    # --- End Re-generating ---

    encoded_labels, original_label_lengths = encode_text_labels_with_lengths(processed_labels, char_to_int)

    # Longest label before padding (0 when there are no labels at all)
    max_label_length = max(original_label_lengths) if original_label_lengths else 0
    print(f"Maximum label length: {max_label_length}")

    # 3. Pad the encoded sequences to a fixed length with the blank index
    padded_encoded_labels = pad_sequences(encoded_labels, maxlen=max_label_length, padding='post', value=char_to_int[blank_token])

    # --- Split Data into Training and Validation Sets ---
    # Images, padded labels and original lengths are split together so they
    # stay aligned row by row.
    train_images, val_images, \
    train_labels_padded, val_labels_padded, \
    train_original_lengths, val_original_lengths = train_test_split(
        processed_images,
        padded_encoded_labels,
        original_label_lengths, # Split the list of original lengths
        test_size=0.2, # 20% for validation
        random_state=42 # for reproducibility
    )

    # Augment the training split only. The channel axis is dropped for the
    # OpenCV calls and restored afterwards. The augmentation is drawn once
    # here, so it is the same in every epoch.
    train_images = np.stack([
        np.expand_dims(augment_image_cv2(image[..., 0]), axis=-1)
        for image in train_images
    ])

    print(f"\nTraining set size: {len(train_images)}")
    print(f"Validation set size: {len(val_images)}")

    # 4. Create TensorFlow Datasets for Training and Validation
    train_images_tensor = tf.constant(train_images, dtype=tf.float32)
    train_labels_tensor = tf.constant(train_labels_padded, dtype=tf.int32)
    train_original_lengths_tensor = tf.constant(train_original_lengths, dtype=tf.int32) # Convert list to Tensor

    val_images_tensor = tf.constant(val_images, dtype=tf.float32)
    val_labels_tensor = tf.constant(val_labels_padded, dtype=tf.int32)
    val_original_lengths_tensor = tf.constant(val_original_lengths, dtype=tf.int32) # Convert list to Tensor

    # Each dataset element is (image, padded_label, original_label_length)
    batch_size = 32

    train_dataset = tf.data.Dataset.from_tensor_slices((train_images_tensor, train_labels_tensor, train_original_lengths_tensor))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

    val_dataset = tf.data.Dataset.from_tensor_slices((val_images_tensor, val_labels_tensor, val_original_lengths_tensor))
    val_dataset = val_dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE) # No need to shuffle validation data

    print("\nData prepared for training and validation, and batched.")

else:
    print("DataFrame 'df' not found or is empty. Please run the cell to load the CSV first and ensure it's filtered.")

In [None]:
import tensorflow as tf

# @tf.function # Use tf.function for better performance
def train_step(images, labels, original_lengths, model, optimizer, char_to_int):
    """
    Run a single optimisation step: forward pass, CTC loss, clipped gradient update.

    Args:
        images: Batch of preprocessed images fed straight into `model`.
        labels: Batch of padded, integer-encoded label sequences. Every entry
            equal to `char_to_int['-']` is treated as padding and removed
            before the loss is computed.
        original_lengths: Batch of unpadded label lengths. Kept in the
            signature for callers, but NOT read by this function: label
            extents are recovered from the padding value instead.
        model: Callable Keras model; invoked with `training=True`.
        optimizer: Optimizer whose `apply_gradients` updates the model weights.
        char_to_int: Character-to-index mapping; only the '-' entry is used.

    Returns:
        Scalar tensor holding the mean CTC loss over the batch.
    """
    with tf.GradientTape() as tape:
        # training=True so that train-time layer behaviour is active.
        class_probs = model(images, training=True)

        # Clip before the log so an exact 0 cannot turn into -inf.
        # NOTE(review): assumes the model already emits per-class probabilities
        # (e.g. a softmax head) -- confirm against the model definition.
        log_class_probs = tf.math.log(tf.clip_by_value(class_probs, 1e-8, 1.0))

        # Dense padded labels -> SparseTensor containing only non-padding entries.
        pad_value = char_to_int['-']
        kept_positions = tf.where(tf.not_equal(labels, pad_value))
        sparse_targets = tf.SparseTensor(
            kept_positions,
            tf.gather_nd(labels, kept_positions),
            tf.shape(labels, out_type=tf.int64),  # same dense shape as the padded batch
        )

        # Each sample uses every time step the model produced: axis 0 is the
        # batch and axis 1 is time, matching time_major=False below.
        steps_per_sample = tf.fill(
            [tf.shape(class_probs)[0]], tf.shape(class_probs)[1]
        )

        per_sample_loss = tf.compat.v1.nn.ctc_loss(
            labels=tf.cast(sparse_targets, tf.int32),
            inputs=log_class_probs,
            sequence_length=steps_per_sample,
            time_major=False,
        )

        # One loss value per sample -> a single scalar for the batch.
        mean_batch_loss = tf.reduce_mean(per_sample_loss)

    # Back-propagate, then rescale so the global gradient norm is at most 5.0.
    raw_grads = tape.gradient(mean_batch_loss, model.trainable_variables)
    safe_grads, _ = tf.clip_by_global_norm(raw_grads, clip_norm=5.0)

    optimizer.apply_gradients(zip(safe_grads, model.trainable_variables))

    return mean_batch_loss

# Status message emitted once when this cell is executed; it does not call train_step.
print("Debugging prints for gradients commented out in train_step function.")

In [None]:
import time
import tensorflow as tf
import numpy as np

# Create a new Adam optimizer every time this cell runs so training starts
# with fresh optimizer state.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# compile() is only used to attach the optimizer; loss=None because the CTC
# loss is computed by hand in train_step and in the validation pass below.
combined_model.compile(optimizer=optimizer, loss=None)

print("Model compiled successfully with Adam optimizer (learning rate 0.001) and dummy loss.")

# --- Manual Callback Implementation Setup ---
# Early-stopping state
best_val_loss = float('inf')
epochs_since_last_improvement = 0
best_weights = None  # Weights of the epoch with the lowest validation loss so far

# ReduceLROnPlateau state
lr_patience_counter = 0
current_lr = tf.keras.backend.get_value(optimizer.learning_rate)

# Static batch counts, used for progress display only. The per-epoch averages
# below divide by the number of batches actually iterated, so they stay
# correct even if the cardinality is reported as unknown or infinite.
num_train_batches = tf.data.experimental.cardinality(train_dataset).numpy()
num_val_batches = tf.data.experimental.cardinality(val_dataset).numpy()
print(f"Number of training batches per epoch: {num_train_batches}")
print(f"Number of validation batches per epoch: {num_val_batches}")


# Upper bound on the number of epochs; early stopping may end training sooner
epochs = 70

# --- Manual Callback Patience Adjustment ---
# Epochs without validation improvement tolerated before training is stopped
early_stopping_patience = 10

# Epochs without validation improvement tolerated before the LR is multiplied
# by reduce_lr_factor (never going below reduce_lr_min_lr)
reduce_lr_patience = 3
reduce_lr_factor = 0.5
reduce_lr_min_lr = 0.00001

# Padding marker used to strip padded label positions; it never changes, so
# look it up once instead of once per validation batch.
blank_index = char_to_int['-']

# --- Training Loop with Manual Callback Logic ---
print(f"\nStarting training with manual callback logic for {epochs} epochs...")
for epoch in range(epochs):
    epoch_train_loss = 0.0
    train_batches_seen = 0
    start_time = time.time()

    # Training steps
    print(f"Epoch {epoch+1}/{epochs}: Training...")
    for step, (images_batch, labels_batch, original_lengths_batch) in enumerate(train_dataset):
        # One optimisation step; returns the mean CTC loss of the batch
        batch_loss = train_step(images_batch, labels_batch, original_lengths_batch, combined_model, optimizer, char_to_int)

        epoch_train_loss += batch_loss
        train_batches_seen += 1

        # Periodic progress report
        if step % 50 == 0:
            print(f"Epoch {epoch+1}, Step {step}/{num_train_batches}: Training Loss = {batch_loss.numpy():.4f}")

    # Average over the batches actually processed. The zero-batch fallback is a
    # tensor (not a Python float) so the `.numpy()` calls below keep working.
    if train_batches_seen > 0:
        average_epoch_train_loss = epoch_train_loss / train_batches_seen
    else:
        average_epoch_train_loss = tf.constant(0.0)


    # --- Validation Step ---
    epoch_val_loss = 0.0
    val_batches_seen = 0
    print(f"Epoch {epoch+1}/{epochs}: Validating...")
    for step, (val_images_batch, val_labels_batch, val_original_lengths_batch) in enumerate(val_dataset):
        # Forward pass in inference mode; no gradient tape, so no weight update
        val_predictions = combined_model(val_images_batch, training=False)

        # Same loss recipe as train_step: clipped log of the model output
        val_predictions_log_softmax = tf.math.log(tf.clip_by_value(val_predictions, 1e-8, 1.0))

        # Padded dense labels -> SparseTensor holding only non-padding entries
        indices = tf.where(tf.not_equal(val_labels_batch, blank_index))
        values = tf.gather_nd(val_labels_batch, indices)
        dense_shape = tf.shape(val_labels_batch, out_type=tf.int64)
        sparse_val_labels_tensor = tf.SparseTensor(indices, values, dense_shape)

        # Every sample uses all time steps produced by the model (axis 1)
        val_input_length = tf.fill([tf.shape(val_predictions)[0]], tf.shape(val_predictions)[1])

        val_batch_loss = tf.compat.v1.nn.ctc_loss(
            labels=tf.cast(sparse_val_labels_tensor, tf.int32),
            inputs=val_predictions_log_softmax,
            sequence_length=val_input_length,
            time_major=False
        )

        # Accumulate the per-batch mean
        epoch_val_loss += tf.reduce_mean(val_batch_loss)
        val_batches_seen += 1

    # Same averaging rule (and tensor-typed fallback) as for the training loss
    if val_batches_seen > 0:
        average_epoch_val_loss = epoch_val_loss / val_batches_seen
    else:
        average_epoch_val_loss = tf.constant(0.0)


    end_time = time.time()
    epoch_duration = end_time - start_time

    print(f"Epoch {epoch+1} finished in {epoch_duration:.2f} seconds.")
    print(f"Average Training Loss: {average_epoch_train_loss.numpy():.4f}")
    print(f"Average Validation Loss: {average_epoch_val_loss.numpy():.4f}")


    # --- Manual Callback Checks using Validation Loss ---
    # Early Stopping Logic
    if average_epoch_val_loss < best_val_loss:
        best_val_loss = average_epoch_val_loss
        epochs_since_last_improvement = 0
        # Snapshot the weights of the best epoch seen so far
        best_weights = combined_model.get_weights()
        print(f"Validation loss improved. Best validation loss: {best_val_loss:.4f}")
    else:
        epochs_since_last_improvement += 1
        print(f"Validation loss did not improve. Epochs since last improvement: {epochs_since_last_improvement}/{early_stopping_patience}")
        if epochs_since_last_improvement >= early_stopping_patience:
            print(f"Early stopping triggered after {epoch+1} epochs.")
            break  # Best weights are restored right after the loop


    # ReduceLROnPlateau Logic
    if epochs_since_last_improvement > 0:  # Validation loss did not improve this epoch
        lr_patience_counter += 1
        if lr_patience_counter >= reduce_lr_patience:
            old_lr = tf.keras.backend.get_value(optimizer.learning_rate)
            new_lr = max(old_lr * reduce_lr_factor, reduce_lr_min_lr)
            # Update the optimizer's learning-rate variable in place
            optimizer.learning_rate.assign(new_lr)
            print(f"Reducing learning rate from {old_lr:.6f} to {new_lr:.6f}")
            lr_patience_counter = 0  # Start a new patience window
    else:
        lr_patience_counter = 0  # Improvement seen: restart the patience window


    print(f"Current Learning Rate: {tf.keras.backend.get_value(optimizer.learning_rate):.6f}")
    print("-" * 30)


# Restore the best snapshot whether training stopped early or used up all
# epochs; otherwise a run that reaches the final epoch would keep last-epoch
# weights that may be worse than the best ones recorded.
if best_weights is not None:
    combined_model.set_weights(best_weights)
    print("Restored best model weights.")

print("Training finished.")

In [None]:
import tensorflow as tf
import os

# Output location: <model_save_dir>/combined_ocr_model.keras
model_save_dir = 'saved_ocr_model'
model_save_path = os.path.join(model_save_dir, 'combined_ocr_model.keras')

# Ensure the target folder exists before anything is written into it
os.makedirs(model_save_dir, exist_ok=True)

# Guard first: there must be a Keras model bound to the name `combined_model`.
# This only checks that the object exists and has the right type; it cannot
# tell whether the model has actually been trained.
if not isinstance(globals().get('combined_model'), tf.keras.Model):
    print("Error: The 'combined_model' is not defined or is not a valid Keras Model. Please ensure the model has been built and trained.")
else:
    print(f"Saving the trained model to: {model_save_path}")
    try:
        # The .keras extension selects the native Keras format
        combined_model.save(model_save_path)
    except Exception as e:
        # Report the failure instead of aborting the notebook run
        print(f"An error occurred while saving the model: {e}")
    else:
        print("Model saved successfully.")