<a href="https://colab.research.google.com/github/ArooshKics/PdfOcrCode/blob/master/PdfOcr_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import cv2
import numpy as np
import csv
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Conv2D, MaxPooling2D, BatchNormalization, Activation, Reshape, Dense, LSTM
from sklearn.model_selection import train_test_split
import pandas as pd


# Check if Google Drive is mounted
if not os.path.exists('/content/drive'):
    from google.colab import drive
    drive.mount('/content/drive')
else:
    print("Google Drive is already mounted.")

Mounted at /content/drive


In [2]:
# Paths of images and text directories
images_dir = '/content/drive/MyDrive/My Documents/Pdf_Ocr/Datasets/Training_200_set/images'
texts_dir = '/content/drive/MyDrive/My Documents/Pdf_Ocr/Datasets/Training_200_set/texts'
lt_pth = '/content/drive/MyDrive/My Documents/Pdf_Ocr/Datasets/Training_200_set/labels/lt_char.csv'

In [3]:
# Load Images and their corresponding text.

img_pths= []
txt_pths = []

for img_name in os.listdir(images_dir):
  img_pth = os.path.join(images_dir, img_name)
  txt_pth = os.path.join(texts_dir, img_name[:-4]+".txt")

  if os.path.exists(img_pth) and os.path.exists(txt_pth):
    img_pths.append(img_pth)
    txt_pths.append(txt_pth)


In [4]:
len(img_pths), len(txt_pths)

(200, 200)

In [5]:
import tensorflow as tf
import re

def preprocess_image(image_path):
    # # Read image
    # image = tf.io.read_file(image_path)
    # # Decode image
    # image = tf.io.decode_image(image, channels=3)  # Assuming RGB images
    # # Resize image if needed
    # # image = tf.image.resize(image, [new_height, new_width])

    image = cv2.imread(image_path)
    new_size = (1000, 64) # width, height, channel
    image = cv2.resize(image, new_size)

    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    return image


# Preprocess Text file

def preprocess_text(txt_pth):
    english_chars = '[A-Za-z0-9۱۲۳۴۵۶۷۸۹۰]'

    with open(txt_pth, mode='r', encoding='utf-8-sig') as f:
        try:
            text = f.read()

            non_joiners = ['آ', 'ا', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'ں', 'و', 'ے', '\"', '،', '(', ')', '؟', '۔', '!', ':']
            ligatures = []
            ligatures_return = []

            words = text.split(' ')

            for word in words:
                ligature = ''
                for char in word:
                    if char not in non_joiners:
                        ligature += char
                    else:
                        ligature += char
                        ligatures.append(ligature)
                        ligatures_return.append(ligature)
                        ligature = ''
                if ligature!= '':
                    ligatures.append(ligature)
                    ligatures_return.append(ligature)

            extra_char = ['\"', '،', '(', ')', '؟', '۔', '!', ':', 'ء']

            lig_list = []
            for ligature in ligatures:
                for char in ligature:
                    result = re.findall(english_chars, char)
                    if result:
                        lig_list.append(char + '_isolated')
                        ligature = ligature.replace(char, '')
                    if char in extra_char:
                        char_index = ligature.index(char)
                        ligature = ligature.replace(char, '')
                if ligature:
                    if (len(ligature) == 1):
                        a = ligature + '_isolated'
                        lig_list.append(a)
                    else:
                        initial = ligature[0]
                        b = initial + '_initial'
                        lig_list.append(b)
                        middles = ligature[1:-1]
                        if middles:
                            for middle in middles:
                                c = middle + '_middle'
                                lig_list.append(c)
                        final = ligature[-1]
                        d = final + '_final'
                        lig_list.append(d)

            # Load the label dictionary from the CSV file
            with open(lt_pth, mode='r') as lt_file:
                reader = csv.reader(lt_file)
                label_dict = {row[0]: int(row[1]) for row in reader}

            # Convert the ligatures to labels
            labels = [label_dict.get(lig, 0) for lig in lig_list]

            return labels

        except Exception as e:
            print("Exception occured")
            print(e)
            return []



In [6]:
images = []
labels = []

for img_pth, txt_pth in zip(img_pths,txt_pths):
  image = preprocess_image(img_pth)
  images.append(image)

  label = preprocess_text(txt_pth)
  labels.append(label)

In [7]:
"""
  Label Padding is not required, as CTC can handle variable length labels.
  You need to find the vocabulary size, which will be the number of classes your model will need to predict.
"""

images = np.expand_dims(images, axis=-1)
images.shape

(200, 64, 1000, 1)

In [8]:
from keras.preprocessing.sequence import pad_sequences

# Normalize images to have values between 0 and 1
images = np.array(images) / 255.0
# Pad labels to the same length with 999, as it is not in the char vocabulary.
max_label_length = max(len(label) for label in labels)
padded_labels = pad_sequences(labels, maxlen=max_label_length, padding='post', value=999)

max_label_length

81

In [9]:
for label in padded_labels:
  if len(label) != 81:
    print(len(label))

In [10]:
# Convert image lists to TensorFlow tensors
image_tensors = tf.convert_to_tensor(images)

# use ragged tensors, as your list elements are of variable length.
labels = tf.convert_to_tensor(padded_labels)

print("Type of images is :", type(image_tensors))
print("Type of labels is : ",type(labels))

Type of images is : <class 'tensorflow.python.framework.ops.EagerTensor'>
Type of labels is :  <class 'tensorflow.python.framework.ops.EagerTensor'>


In [11]:
# number of classes should be the total number of the characters in our vocabulary
lt_pth = "/content/drive/MyDrive/My Documents/Pdf_Ocr/Datasets/Training_200_set/labels/lt_char.csv"
# Load the CSV file into a DataFrame
df = pd.read_csv(lt_pth)
num_classes = df.shape[0] + 1
num_classes # it is actually the total number of the characters we need to predict.

200

In [12]:
# Number of classes (including the blank label for CTC)
num_classes = num_classes + 1  # Update this based on the actual number of unique labels + 1 for the blank label

# Convert labels to a numpy array
padded_labels = np.array(padded_labels)

# Input lengths (all sequences have the same length in this case)
input_lengths = np.ones((len(image_tensors), 1)) * (image_tensors.shape[2] // 2)

# Label lengths (actual lengths of the labels)
label_lengths = np.array([len(label) for label in labels]).reshape(-1, 1)

In [13]:
# Create the dataset
dataset = tf.data.Dataset.from_tensor_slices((image_tensors, padded_labels))

# Shuffle the entire dataset
dataset = dataset.shuffle(buffer_size=len(image_tensors), seed=42)

# Define the split ratio
validation_split = 0.2
num_samples = len(image_tensors)
num_val_samples = int(validation_split * num_samples)
num_train_samples = num_samples - num_val_samples

# Split the dataset into training and validation sets
train_dataset = dataset.take(num_train_samples)
val_dataset = dataset.skip(num_train_samples)

# Batch the datasets
BATCH_SIZE = 5
train_dataset = train_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)

In [43]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, LSTM, Bidirectional, Reshape
from keras.optimizers import Adam
import keras.backend as K
from keras.utils import pad_sequences
from keras.layers import Input, Lambda
from keras.models import Model
import nltk


# Define the model
model = Sequential()

# CNN Layers
# CNN Layer 1
model.add(Conv2D(filters=32, kernel_size=(5, 5), strides=(1, 1), padding='SAME', activation='relu', input_shape=(64, 1000, 1)))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='SAME'))

# CNN Layer 2
model.add(Conv2D(filters=64, kernel_size=(5, 5), strides=(1, 2), padding='SAME', activation='relu'))
model.add(MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='SAME'))

# CNN Layer 3
model.add(Conv2D(filters=128, kernel_size=(5, 5), strides=(1, 2), padding='SAME', activation='relu'))
model.add(MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='SAME'))

# CNN Layer 4
model.add(Conv2D(filters=128, kernel_size=(5, 5), strides=(1, 2), padding='SAME', activation='relu'))
model.add(MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='SAME'))

# CNN Layer 5
model.add(Conv2D(filters=256, kernel_size=(3, 3), strides=(1, 2), padding='SAME', activation='relu'))
model.add(MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='SAME'))

# CNN Layer 6
model.add(Conv2D(filters=256, kernel_size=(3, 3), strides=(1, 2), padding='SAME', activation='relu'))
model.add(MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='SAME'))

# CNN Layer 7
model.add(Conv2D(filters=512, kernel_size=(3, 3), strides=(1, 1), padding='SAME', activation='relu'))
model.add(MaxPooling2D(pool_size=(1, 1), strides=(1, 1), padding='SAME'))

# Flatten Layer
model.add(Flatten())

# Calculate the output shape after the CNN layers
# Assuming input shape is (64, 1000, 1), the output shape after Flatten will be (16, 1024)
# Thus, we need to reshape it to (81, something) for LSTM layers

# Adjust Reshape Layer
model.add(Reshape((128, 128)))  # Adjust 'something' based on the output shape after Flatten and before Reshape

# Bidirectional LSTM Layers
# Bidirectional LSTM Layer 1
model.add(Bidirectional(LSTM(units=512, return_sequences=True)))

# Bidirectional LSTM Layer 2
model.add(Bidirectional(LSTM(units=512, return_sequences=True)))

# Dropout Layer
model.add(Dropout(0.2))

# Output Layer
model.add(Dense(units=num_classes, activation='softmax'))


In [54]:
# Loss Functions

def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    # print("Lengths are : ",type(input_length),type(label_length))
    # print("Lengths are : ",input_length,label_length)
    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss


In [None]:
# Optimizer
opt = Adam(learning_rate=0.001)
model.compile(loss = CTCLoss, optimizer= opt)

# Fit the model using the training and validation datasets
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset
)

In [None]:
model.save('/content/drive/MyDrive/My Documents/Pdf_Ocr/first_model.h5')

  saving_api.save_model(


In [56]:
import cv2
import numpy as np

# Load and preprocess your image
img_pth = "/content/drive/MyDrive/My Documents/Pdf_Ocr/Datasets/Training_200_set/images/Al Jihad Fil Islam (Volume 02) SwaneUmri Hazrat Uma100_Line7.jpg"
img = cv2.imread(img_pth)

# Resize the image to match your model's input shape (1000, 64, 1)
new_size = (1000, 64)  # width, height
img = cv2.resize(img, new_size)

# Convert to grayscale
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Normalize the image to have values between 0 and 1
img_normalized = img_gray / 255.0

# Expand dimensions to make it a batch of 1 (if necessary)
img_input = np.expand_dims(img_normalized, axis=0)

In [57]:
from keras.models import load_model

# Load your trained model
model_path = "/content/drive/MyDrive/My Documents/Pdf_Ocr/first_model.h5"
model = load_model(model_path, custom_objects={'CTCLoss': CTCLoss})




In [76]:
predictions = model.predict(img_input)



In [77]:
predictions

array([[[1.1519050e-02, 2.4325051e-04, 3.1199835e-03, ...,
         8.3262266e-06, 1.0694757e-05, 7.6095611e-02],
        [6.6412009e-02, 6.4807228e-04, 2.2023341e-04, ...,
         6.0783523e-06, 8.5689480e-06, 5.2516967e-01],
        [5.9029865e-03, 1.0208150e-03, 9.7988464e-05, ...,
         8.2924538e-07, 1.9935997e-06, 6.9850719e-01],
        ...,
        [1.5212782e-02, 5.9366360e-04, 2.8618608e-04, ...,
         2.2555716e-06, 2.4901246e-06, 7.1094358e-01],
        [2.5523128e-03, 1.0333974e-03, 1.4288729e-04, ...,
         1.0565723e-06, 1.9818876e-06, 6.8200594e-01],
        [1.9799829e-02, 4.2850129e-02, 3.0595325e-03, ...,
         1.3863455e-05, 1.6370152e-05, 8.2115822e-02]]], dtype=float32)

In [78]:
len(predictions[0][0])

201

In [79]:
predictions.shape

(1, 128, 201)

In [80]:
predictions_reshaped = predictions.reshape((128, 201))

In [83]:
def ctc_decode(predictions):
    # Assume predictions shape is (1, T, C), where T is time steps and C is classes
    pred_shape = predictions.shape
    pred_labels = K.ctc_decode(predictions, input_length=np.ones(pred_shape[0]) * pred_shape[1])[0][0]

    # Convert sparse tensor to string
    decoded_text = K.get_value(pred_labels[0])

    return decoded_text

# Assuming predictions is your model output for the single image
decoded_text = ctc_decode(predictions)
print(decoded_text)

[16 27 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1]


In [87]:
gt_pth = "/content/drive/MyDrive/My Documents/Pdf_Ocr/Datasets/Training_200_set/texts/Al Jihad Fil Islam (Volume 02) SwaneUmri Hazrat Uma100_Line7.txt"

with open(gt_pth, 'r') as file:
  content = file.readlines()
  print(content)


['’’سب لوگوں کو معلوم ہے کہ میں سب سے زیادہ مستحق ہوں جس ']


In [88]:
label = preprocess_text(gt_pth)
print(label)

[0, 0, 72, 85, 64, 1, 31, 1, 58, 26, 1, 7, 75, 49, 1, 50, 47, 27, 26, 45, 7, 8, 9, 21, 85, 21, 27, 83, 20, 14, 12, 46, 7, 72, 22, 69, 79, 47, 1, 58, 10, 38]


In [93]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein)
  Downloading rapidfuzz-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.25.1 rapidfuzz-3.9.1


In [94]:
from Levenshtein import distance

def character_error_rate(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Filter out blanks (-1) from both true and predicted sequences
    true_filtered = [char for char in y_true if char != -1]
    pred_filtered = [char for char in y_pred if char != -1]

    # Calculate Levenshtein distance between true and predicted sequences
    dist = distance(''.join(map(str, true_filtered)), ''.join(map(str, pred_filtered)))

    # Calculate Character Error Rate
    cer = dist / len(true_filtered) if len(true_filtered) > 0 else 0.0

    return cer

loss = character_error_rate(label, decoded_text)
print(loss)

1.619047619047619
