<a href="https://colab.research.google.com/github/ArooshKics/PdfOcrCode/blob/master/PdfOcr_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import cv2
import numpy as np
import csv
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Conv2D, MaxPooling2D, BatchNormalization, Activation, Reshape, Dense, LSTM
from sklearn.model_selection import train_test_split


# from google.colab import drive
# drive.mount('/content/drive')


# Mount Google Drive only when the mount point is not present yet
if os.path.exists('/content/drive'):
    print("Google Drive is already mounted.")
else:
    from google.colab import drive
    drive.mount('/content/drive')

Google Drive is already mounted.


In [None]:
# Dataset locations on the mounted Drive, in order:
# line images, their transcription files, and the token -> label CSV.
images_dir, texts_dir, lt_pth = (
    '/content/drive/MyDrive/Pdf_Ocr/Datasets/Dataset_500/images',
    '/content/drive/MyDrive/Pdf_Ocr/Datasets/Dataset_500/texts',
    '/content/drive/MyDrive/Pdf_Ocr/Datasets/Dataset_500/labels/lt_char.csv',
)

In [None]:
# Load Images and their corresponding text.

# Parallel lists: img_pths[i] is the image whose transcription is txt_pths[i]
img_pths = []
txt_pths = []

# os.listdir returns entries in arbitrary order; sorting keeps the pairing
# order reproducible across runs.
for img_name in sorted(os.listdir(images_dir)):
  # splitext copes with any extension length (.jpg, .jpeg, .png, ...),
  # unlike slicing off a fixed four characters.
  stem = os.path.splitext(img_name)[0]
  img_pth = os.path.join(images_dir, img_name)
  txt_pth = os.path.join(texts_dir, stem + ".txt")

  # Keep only images that have a matching transcription file
  if os.path.exists(img_pth) and os.path.exists(txt_pth):
    img_pths.append(img_pth)
    txt_pths.append(txt_pth)


In [None]:
# Sanity check: both lists are filled together, so the two counts must match
len(img_pths), len(txt_pths)

(460, 460)

In [None]:
import tensorflow as tf
import re

def preprocess_image(image_path):
    """Load an image file as a fixed-size grayscale array.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A 2-D array of shape (64, 1000), i.e. (height, width), holding the
        grayscale pixel values of the resized image.

    Raises:
        ValueError: If OpenCV cannot read or decode the file.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an opaque error inside cv2.resize.
        raise ValueError("Could not read image: {}".format(image_path))

    new_size = (1000, 64)  # cv2.resize expects (width, height)
    image = cv2.resize(image, new_size)

    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


# Preprocess Text file

# A ligature is closed right after any of these characters.
_NON_JOINERS = frozenset(['آ', 'ا', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'ں', 'و', 'ے', '\"', '،', '(', ')', '؟', '۔', '!', ':'])
# Characters removed from a ligature before positional tokens are generated.
_EXTRA_CHARS = frozenset(['\"', '،', '(', ')', '؟', '۔', '!', ':', 'ء'])
# Latin letters and digits are always emitted as '<char>_isolated'.
_ISOLATED_CHAR_RE = re.compile('[A-Za-z0-9۱۲۳۴۵۶۷۸۹۰]')
# Label dictionaries already parsed, keyed by CSV path.
_LABEL_DICT_CACHE = {}


def _split_ligatures(text):
    """Split text into ligatures: character runs ended by a non-joiner or by the end of a word."""
    ligatures = []
    # split() with no argument also breaks on newlines/tabs, so whitespace
    # never ends up inside a ligature and gets labelled as a character.
    for word in text.split():
        current = ''
        for char in word:
            current += char
            if char in _NON_JOINERS:
                ligatures.append(current)
                current = ''
        if current:
            ligatures.append(current)
    return ligatures


def _ligature_tokens(ligature):
    """Return the positional tokens ('x_isolated', 'x_initial', 'x_middle', 'x_final') of one ligature."""
    # Latin letters / digits come first, one isolated token per occurrence
    tokens = [char + '_isolated' for char in ligature if _ISOLATED_CHAR_RE.match(char)]

    # What is left after dropping those and the extra characters gets
    # position-dependent tokens.
    shaped = ''.join(
        char for char in ligature
        if char not in _EXTRA_CHARS and not _ISOLATED_CHAR_RE.match(char)
    )
    if len(shaped) == 1:
        tokens.append(shaped + '_isolated')
    elif shaped:
        tokens.append(shaped[0] + '_initial')
        tokens.extend(char + '_middle' for char in shaped[1:-1])
        tokens.append(shaped[-1] + '_final')
    return tokens


def _load_label_dict(csv_path):
    """Read the token -> integer label CSV once per path and reuse the parsed dictionary."""
    if csv_path not in _LABEL_DICT_CACHE:
        # utf-8-sig matches how the transcription files are read and strips a
        # BOM that would otherwise corrupt the first key; newline='' is what
        # the csv module expects for file objects.
        with open(csv_path, mode='r', encoding='utf-8-sig', newline='') as lt_file:
            _LABEL_DICT_CACHE[csv_path] = {
                row[0]: int(row[1]) for row in csv.reader(lt_file) if len(row) >= 2
            }
    return _LABEL_DICT_CACHE[csv_path]


def preprocess_text(txt_pth, label_dict=None):
    """Convert a transcription file into a list of integer labels.

    The text is split into ligatures, each ligature is turned into positional
    character tokens, and every token is looked up in the label dictionary.

    Args:
        txt_pth: Path of the UTF-8 text file to read.
        label_dict: Optional mapping from token to integer label. When
            omitted, the mapping is loaded from the CSV at module-level
            `lt_pth`.

    Returns:
        A list of integer labels; tokens missing from the dictionary map to 0.
        An empty list is returned (after printing the error) if reading or
        converting the text fails.
    """
    with open(txt_pth, mode='r', encoding='utf-8-sig') as f:
        try:
            text = f.read()

            if label_dict is None:
                label_dict = _load_label_dict(lt_pth)

            lig_list = []
            for ligature in _split_ligatures(text):
                lig_list.extend(_ligature_tokens(ligature))

            # Convert the tokens to labels
            return [label_dict.get(lig, 0) for lig in lig_list]

        except Exception as e:
            # Best effort: report the problem and return no labels for this file
            print("Exception occured")
            print(e)
            return []



In [None]:
# One preprocessed image and one label sequence per (image, text) pair
images, labels = [], []

for img_pth, txt_pth in zip(img_pths, txt_pths):
  images.append(preprocess_image(img_pth))
  labels.append(preprocess_text(txt_pth))


In [None]:
def pad_lists(labels, pad_value=-1):
    """Right-pad every label list to the length of the longest one.

    Args:
        labels: Sequence of label lists, possibly of different lengths.
        pad_value: Value appended to the shorter lists (default -1).

    Returns:
        A tuple ``(padded_lists, max_length)``: new lists of equal length and
        that common length. Empty input yields ``([], 0)``.
    """
    # default=0 keeps an empty input from raising inside max()
    max_length = max((len(lst) for lst in labels), default=0)
    padded_lists = [list(lst) + [pad_value] * (max_length - len(lst)) for lst in labels]
    return padded_lists, max_length

# Pad all label sequences to a common length so they can form one tensor
padded_labels, max_length = pad_lists(labels)

In [None]:
# Compare one sample's original label count with its padded length
len(labels[1]), len(padded_labels[1])

(46, 79)

In [None]:
# Convert image lists to TensorFlow tensors.
# Scale pixels to [0, 1] and add a trailing channel axis so the training data
# matches both the model's declared input shape (64, 1000, 1) and the
# preprocessing applied at prediction time (divide by 255, add channel axis).
images_array = np.stack(images).astype("float32") / 255.0
images_tensor = tf.convert_to_tensor(images_array[..., np.newaxis])

In [None]:
# Convert label lists to TensorFlow tensors
# (padded_labels is rectangular after pad_lists, so it converts to one 2-D tensor)
labels_tensor = tf.convert_to_tensor(padded_labels)

In [None]:
# Pair every image with its padded label sequence
dataset = tf.data.Dataset.from_tensor_slices((images_tensor, labels_tensor))

# Shuffle over the whole dataset, then group samples into mini-batches
BATCH_SIZE = 5
dataset = dataset.shuffle(buffer_size=len(images)).batch(BATCH_SIZE)

# Peek at the first two batches to confirm the tensor shapes
for batch_images, batch_labels in dataset.take(2):
    print("Batch of images shape:", batch_images.shape)
    print("Batch of labels shape:", batch_labels.shape)

Batch of images shape: (5, 64, 1000)
Batch of labels shape: (5, 79)
Batch of images shape: (5, 64, 1000)
Batch of labels shape: (5, 79)


In [None]:
# Length every label sequence was padded to
max_length

79

In [None]:
# Print the per-sample shapes inside the last batch fetched by the preview loop
for image, label in zip(batch_images, batch_labels):
  print(image.shape, label.shape)

(64, 1000) (79,)
(64, 1000) (79,)
(64, 1000) (79,)
(64, 1000) (79,)
(64, 1000) (79,)


In [None]:
# Inspect one padded label sequence (trailing -1 values are padding)
batch_labels[0]

<tf.Tensor: shape=(79,), dtype=int32, numpy=
array([  7,  14,  15,  90,  31,  32,  27,  33,   1,  15,   3,   7,  19,
        28,  34,  14, 132,  36,  51,  49,  27,  16,   3,  15,  16,  20,
        66,  31,   5,  10,  14,   7,   8,   9,  10,   1,  12,  20,   5,
         7,  72,  69,  42,  26,  27,  36,  14,  50,  21,  27,   7,  92,
        37,   1,  15,  33,  34,  14,  10,  14,  26,   5,  17,  18,  14,
        46,  64,  19,  21,  14,  33,  30,  -1,  -1,  -1,  -1,  -1,  -1,
        -1], dtype=int32)>

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, LSTM, Bidirectional

def build_crnn_model(num_classes=None, input_shape=(64, 1000, 1)):
    """Build the convolutional + bidirectional-LSTM classifier.

    Args:
        num_classes: Number of units in the softmax output layer. When
            omitted, the module-level variable `num_classes` is used, which
            keeps existing `build_crnn_model()` calls working.
        input_shape: Shape of one input image as (height, width, channels).
            The fixed `Reshape((64, -1))` below was written for the default
            shape; other shapes must flatten to a multiple of 64.

    Returns:
        An uncompiled Keras `Sequential` model.

    Raises:
        ValueError: If `num_classes` is neither passed nor defined at module
            level.
    """
    if num_classes is None:
        # Backward compatibility with callers relying on a global num_classes
        num_classes = globals().get('num_classes')
        if num_classes is None:
            raise ValueError(
                "num_classes must be passed to build_crnn_model() or defined at module level"
            )

    # One row per CNN stage: (filters, kernel_size, conv_strides, pool_size).
    # Each pooling layer uses strides equal to its pool size.
    conv_stages = [
        (32, (5, 5), (1, 1), (2, 2)),
        (64, (5, 5), (1, 2), (1, 2)),
        (128, (5, 5), (1, 2), (1, 2)),
        (128, (5, 5), (1, 2), (1, 2)),
        (256, (3, 3), (1, 2), (1, 2)),
        (256, (3, 3), (1, 2), (1, 2)),
        (512, (3, 3), (1, 1), (1, 1)),  # the (1, 1) pooling leaves the tensor unchanged
    ]

    model = Sequential()

    # CNN Layers
    for stage_index, (filters, kernel_size, conv_strides, pool_size) in enumerate(conv_stages):
        # Only the first layer declares the input shape
        first_layer_kwargs = {'input_shape': input_shape} if stage_index == 0 else {}
        model.add(Conv2D(filters=filters, kernel_size=kernel_size, strides=conv_strides,
                         padding='SAME', activation='relu', **first_layer_kwargs))
        model.add(MaxPooling2D(pool_size=pool_size, strides=pool_size, padding='SAME'))

    # Flatten Layer
    model.add(Flatten())

    # Reshape Layer: with the default input the CNN output is (32, 1, 512),
    # i.e. 16384 values, which becomes a sequence of shape (64, 256).
    # NOTE(review): the width axis is already reduced to 1 at this point, so
    # the 64 steps do not correspond to horizontal positions in the image —
    # confirm this is intended.
    model.add(Reshape((64, -1)))

    # Bidirectional LSTM Layers
    model.add(Bidirectional(LSTM(units=512, return_sequences=True)))
    # return_sequences=False: only the final state reaches the classifier
    model.add(Bidirectional(LSTM(units=512, return_sequences=False)))

    # Dropout Layer
    model.add(Dropout(0.2))

    # Output Layer: one softmax distribution over num_classes per image
    model.add(Dense(units=num_classes, activation='softmax'))

    return model

# Build the CRNN model (kept in a module-level variable that later cells reuse)
crnn_model = build_crnn_model()

# Print model summary: per-layer output shapes and parameter counts
crnn_model.summary()


Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_108 (Conv2D)         (None, 64, 1000, 32)      832       
                                                                 
 max_pooling2d_104 (MaxPool  (None, 32, 500, 32)       0         
 ing2D)                                                          
                                                                 
 conv2d_109 (Conv2D)         (None, 32, 250, 64)       51264     
                                                                 
 max_pooling2d_105 (MaxPool  (None, 32, 125, 64)       0         
 ing2D)                                                          
                                                                 
 conv2d_110 (Conv2D)         (None, 32, 63, 128)       204928    
                                                                 
 max_pooling2d_106 (MaxPool  (None, 32, 32, 128)     

In [None]:
from keras.optimizers import Adam

# Compile the model
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = Adam(learning_rate=0.001)  # You can adjust the learning rate as needed
crnn_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): `dataset` yields integer label sequences padded with -1, while
# the model ends in a single softmax and is compiled with categorical
# cross-entropy — confirm the loss and label format agree (a sequence loss
# such as CTC may be what is intended).

# Train the model
num_epochs = 10  # Adjust the number of epochs as needed
crnn_model.fit(dataset, epochs=num_epochs)

# Optionally, you can evaluate the model on a test set
# test_loss, test_acc = crnn_model.evaluate(test_images, test_labels)
# print('Test accuracy:', test_acc)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79a059598550>

In [None]:
def preprocess_image(image_path):
    """Load one image and shape it as a single-sample model input.

    NOTE: this definition replaces the `preprocess_image` defined earlier in
    the notebook, which returns an unscaled 2-D array. Re-running the
    dataset-building cell after this cell would therefore produce arrays with
    a different shape and value range.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A float array of shape (1, 64, 1000, 1) — batch, height, width,
        channel — with pixel values scaled to [0, 1].

    Raises:
        ValueError: If OpenCV cannot read or decode the file.
    """
    # Load the image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising
        raise ValueError("Could not read image: {}".format(image_path))

    # Resize to the model's input size; cv2.resize expects (width, height)
    image = cv2.resize(image, (1000, 64))
    # Normalize the pixel values to be between 0 and 1
    image = image / 255.0
    # Add a batch dimension
    image = np.expand_dims(image, axis=0)
    # Add a channel dimension
    image = np.expand_dims(image, axis=-1)
    return image

def predict_image_label(model, image_path):
    """Run `model` on one image file and return the index of its highest score.

    Args:
        model: Object exposing a `predict` method that accepts the array
            produced by `preprocess_image`.
        image_path: Path of the image file to classify.

    Returns:
        The argmax index over the flattened prediction array.
    """
    model_input = preprocess_image(image_path)
    scores = model.predict(model_input)
    # Without an axis argument, np.argmax works on the flattened array
    return np.argmax(scores)


def predict_single_image(image_path):
    """Predict the label of one image file with the module-level `crnn_model`."""
    return predict_image_label(crnn_model, image_path)

# Example: predict the label of one line image from the dataset
image_path = "/content/drive/MyDrive/Pdf_Ocr/Datasets/Dataset_500/images/Al Jihad Fil Islam (Volume 02) SwaneUmri Hazrat Uma100_Line19.jpg"

# The result is a single class index, not a sequence of characters
predicted_label = predict_single_image(image_path)
print("Predicted label:", predicted_label)

Predicted label: 7
