<a href="https://colab.research.google.com/github/ArooshKics/PdfOcrCode/blob/master/PdfOcr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import cv2
import numpy as np
import csv
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Conv2D, MaxPooling2D, BatchNormalization, Activation, Reshape, Dense, LSTM
from sklearn.model_selection import train_test_split


from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Pre-Processing

In [None]:
# Dataset locations on the mounted Google Drive:
#   images_dir     - directory holding the line images
#   text_files_dir - directory holding the text files (not read by the visible cells)
#   char_maps_pth  - CSV that is opened as the ground-truth file further down
images_dir = '/content/drive/MyDrive/Pdf_Ocr/Datasets/Dataset_500/images'
text_files_dir = '/content/drive/MyDrive/Pdf_Ocr/Datasets/Dataset_500/texts'
char_maps_pth = '/content/drive/MyDrive/Pdf_Ocr/Datasets/Dataset_500/labels/gt_char.csv'

In [None]:
# Ground-truth CSV: one row per line image (image name followed by integer labels).
gt_dir = char_maps_pth

# Filled in parallel while the CSV is read: gt[k] holds the integer labels of
# the image whose path is filenames[k].
gt = []
filenames = []

def _size_alright(directory, img_width=None, img_range=None):
    if img_range == None:
        return True

    image = cv2.imread(directory, cv2.IMREAD_GRAYSCALE)
    if image.shape[1] <= img_width + img_range and image.shape[1] >= img_width - img_range:
        return True

    return False

# Read the ground truth CSV. Every row is an image name followed by the
# integer labels of that image.
with open(gt_dir, 'r', encoding='utf8') as gt_file:
    text = csv.reader(gt_file, quoting=csv.QUOTE_NONE)
    for row in text:
        if not row:
            # Blank line in the CSV.
            continue

        path = None
        if images_dir is not None:
            # Only the extension is dropped, so names containing dots survive.
            # The images on disk are looked up with a .jpg extension.
            image_name = os.path.splitext(row[0])[0]
            path = os.path.join(images_dir, image_name + ".jpg")
            if not os.path.exists(path):
                continue

            image = cv2.imread(path)
            if image is None:
                # Skip undecodable files here so that gt and filenames only
                # ever describe images that can actually be loaded.
                print(f"Skipping unreadable image: {path}")
                continue

            if not _size_alright(path, image.shape[1], img_range=None):
                continue

        # QUOTE_NONE splits the label list on its commas, which leaves stray
        # brackets/quotes on individual cells and possibly empty cells.
        labels = []
        for cell in row[1:]:
            cell = cell.replace('[', '').replace(']', '').replace('"', '').strip()
            if cell:
                labels.append(int(cell))

        gt.append(labels)
        if path is not None:
            filenames.append(path)


In [None]:
# gt and filenames are appended to together, so the two counts should match.
print(len(gt), len(filenames))

460 460


In [None]:
# Peek at the first collected image path.
filenames[0]

'/content/drive/MyDrive/Pdf_Ocr/Datasets/Dataset_500/images/Al Jihad Fil Islam (Volume 02) SwaneUmri Hazrat Uma103_Line4.jpg'

## Loading Images

In [None]:
images = []  # decoded images; images[k] belongs to filenames[k] (and gt[k])

_kept = []  # positions in filenames whose image was decoded successfully
for _pos, img_pth in enumerate(filenames):
    img = cv2.imread(img_pth)
    if img is None:
        # cv2.imread returns None for files it cannot decode.
        print(f"Failed to load image: {img_pth}")
        continue
    images.append(img)
    _kept.append(_pos)

# Dropping an image without dropping its path and labels would silently shift
# every later image onto the wrong ground truth, so keep the lists in step.
if len(_kept) != len(filenames):
    if len(gt) == len(filenames):
        gt = [gt[k] for k in _kept]
    filenames = [filenames[k] for k in _kept]

In [None]:
# Shape of the first loaded image; the recorded output is (93, 1500, 3).
images[0].shape

(93, 1500, 3)

In [None]:
import math
def is_light_or_dark(rgb_color):
    """Classify a colour as ``'light'`` or ``'dark'``.

    ``rgb_color`` must unpack into exactly three components, taken as red,
    green and blue in that order. The score is the square root of the weighted
    sum of the squared components (weights 0.299, 0.587 and 0.114); a score
    above 150 yields ``'light'``, anything else ``'dark'``.
    """
    red, green, blue = rgb_color
    weighted_squares = 0.299 * (red * red) + 0.587 * (green * green) + 0.114 * (blue * blue)
    brightness = math.sqrt(weighted_squares)
    return 'light' if brightness > 150 else 'dark'

def get_gray_image(image):
    """Convert a BGR colour image to grayscale, inverting dark backgrounds.

    Args:
        image: Colour image in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2GRAY``).

    Returns:
        The grayscale image. When the average colour of ``image`` is
        classified as ``'dark'`` by ``is_light_or_dark``, the grayscale image
        is inverted with ``cv2.bitwise_not`` before being returned.
    """
    # Per-channel mean in the image's own B, G, R order. is_light_or_dark
    # unpacks its argument as (r, g, b), so the first three channels are
    # reversed before the call; otherwise red and blue swap their weights.
    mean_bgr = np.mean(image, axis=(0, 1))
    mean_rgb = mean_bgr[:3][::-1]

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    if is_light_or_dark(mean_rgb) == 'dark':
        gray_image = cv2.bitwise_not(gray_image)

    return gray_image

In [None]:
features = []  # grayscale images resized to the target height
seq_len = []   # width of each resized image (new_size[0])

num_buckets = 4
# Target (width, height). The width must be the real None object, not the
# string "None", so that later `image_size[0] is None` checks succeed; each
# image then keeps its aspect ratio at a fixed height of 64.
image_size = [None, 64]

ht = image_size[1]

for image in images:
    # Grayscale conversion (dark backgrounds are inverted by get_gray_image).
    image = get_gray_image(image)

    (h, w) = image.shape
    # Scale factor that brings the height to ht; the width follows it.
    f = ht / h
    new_size = (int(w * f), ht)  # cv2.resize takes (width, height)

    image = cv2.resize(image, new_size)

    features.append(image)
    seq_len.append(new_size[0])


In [None]:
# Smallest and largest resized width, and the number of resized images.
min(seq_len), max(seq_len), len(seq_len)

(711, 4000, 460)

## Bucket Images

In [None]:
def _bucket(images, seq_len, num_buckets):
    # Bucketing may require some experimentation for each dataset. If there are too many buckets and too few points
    # then some buckets may be empty because the total range is divided *equally* between all buckets. Empty buckets
    # may cause errors in later functions.
    # This function does not remove empty buckets deliberately, because otherwise the user may think that the data is
    # being divided into the number of buckets specified, which may then lead to reporting errors.

    min_seq_len = min(seq_len)
    max_seq_len = max(seq_len)
    bins = np.arange(min_seq_len, max_seq_len, (max_seq_len - min_seq_len) / num_buckets)
    bucket_indices = np.digitize(seq_len, bins)  # should be a list of values ranging from 1 to num_buckets inclusively

    # This does bucketing in O(N) where N is the number of images, i.e.
    # we only have to traverse the bucket_indices list of size N once.
    bucket_images = [[] for _ in range(num_buckets)]
    bucket_seq_len = [[] for _ in range(num_buckets)]

    for i, j in enumerate(bucket_indices):
        bucket_images[j - 1].append(images[i])
        bucket_seq_len[j - 1].append(np.array(seq_len[i]))

    for i in range(num_buckets):
        bucket_seq_len[i] = np.array(bucket_seq_len[i])

    return bucket_images, bucket_seq_len, bucket_indices

# NOTE: this rebinds features and seq_len from flat lists to per-bucket lists,
# so re-running this cell alone feeds already-bucketed data back into _bucket.
features, seq_len, bucket_indices = _bucket(features, seq_len, num_buckets)


In [None]:
# After bucketing, seq_len is a list with one array of lengths per bucket.
type(seq_len)

list

In [None]:
def _pad_image_horizontally(image, max_width, flip_image=True):
    """Fit ``image`` into a ``[image.shape[0], max_width]`` target and normalise it.

    The image is handed to ``_copy_to_target`` together with the target shape
    and the ``flip_image`` flag, and the result is passed through
    ``_normalize`` before being returned.

    NOTE(review): ``_copy_to_target`` and ``_normalize`` are not defined in the
    visible cells of this notebook; presumably the former pads the image up to
    ``max_width`` - confirm against their definitions.
    """
    target_shape = [image.shape[0], max_width]
    padded = _copy_to_target(image, target_shape, flip_image=flip_image)
    return _normalize(padded)

In [None]:
resized_features = []

# Pads every image of a bucket to the widest image of that bucket.
# This branch runs only when image_size[0] is the None object; a string such as
# "None" does not qualify and skips the whole loop.
if image_size[0] is None:
    for features_bucket, seq_len_bucket in zip(features, seq_len):
        new_features = []
        # An empty bucket gives an empty seq_len_bucket, and max() of an empty
        # list raises ValueError.
        max_seq_len = max([i for i in seq_len_bucket])
        for f in features_bucket:
            # Always pad horizontally: the height was already fixed when the
            # images were resized.
            # NOTE(review): flip_image is not defined in the visible cells - confirm
            # where it is set.
            n_f = _pad_image_horizontally(f, max_seq_len, flip_image=flip_image)
            new_features.append(n_f)

            # NOTE(review): VERBOSE is not defined in the visible cells either, and
            # cv2.imshow opens a GUI window - confirm it is usable in this runtime.
            if VERBOSE:
                cv2.imshow("padded image", n_f)
                cv2.waitKey(1000)
        # One array per bucket holding its padded images.
        resized_features.append(np.array(new_features))

# NOTE(review): this assignment sits outside the `if`, so when the branch above
# is skipped, features is replaced by an empty list.
features = resized_features


In [None]:
# NOTE(review): keeps only the first bucket's lengths and discards the rest, and
# is not idempotent (re-running the cell indexes into the result again).
# The original author marked this as a workaround whose cause is still unknown.
seq_len = seq_len[0] # have to see later what caused this, to have the element at index 0

In [None]:
# Convert to NumPy arrays; prepare_dataset reads .shape on the first dataset
# element and indexes buckets with index arrays.
features = np.array(features)
seq_len =  np.array(seq_len)

## Bucket Ground Truths

In [None]:
# gt is still the flat list of label sequences, one per image.
type(gt), len(gt)

(list, 460)

In [None]:
# Number of labels in every ground-truth sequence.
y_seq_len = [len(g) for g in gt]

In [None]:
# One length per ground-truth sequence.
len(y_seq_len)

460

In [None]:
# Wrap the labels and their lengths in one-element lists, i.e. treat all samples
# as a single bucket (the layout _bucket_gt builds when bucket_indices is None).
# NOTE(review): not idempotent - re-running this cell wraps y_seq_len again.
y = [gt]
y_seq_len = [np.array(y_seq_len)]

In [None]:
# One bucket number per image, as returned by _bucket.
len(bucket_indices)

460

In [None]:
def _bucket_gt(gt, num_buckets, bucket_indices):
    y_seq_len = [len(g) for g in gt]

#Load the data w.r.t the bucket indices
    if bucket_indices is None:
        y = [gt]
        y_seq_len = [np.array(y_seq_len)]

    else:
        y = [[] for _ in range(num_buckets)]
        seq_len = [[] for _ in range(num_buckets)]

        for i, j in enumerate(bucket_indices):
            y[j-1].append(np.array(gt[i]))
            seq_len[j-1].append(np.array(y_seq_len[i]))

        for i in range(num_buckets):
            y[i] = np.array(y[i])
            seq_len[i] = np.array(seq_len[i])

        y_seq_len = seq_len

    return np.array(y), np.array(y_seq_len)

In [None]:
# At this point the images and their sequence lengths (features, seq_len) and
# the labels and their sequence lengths (y, y_seq_len) are all available.
X = features
X_seq_len = seq_len

# Largest label id plus one (assumes label ids start at 0 - TODO confirm).
# Samples without any label are skipped so that max() is never given an empty
# sequence.
vocab_size = int(max(max(labels) for bucket in y for labels in bucket if len(labels) > 0) + 1)
# Length of the longest label sequence across all buckets.
y_max_len = np.max([np.max(s) for s in y_seq_len])

dataset = (X, X_seq_len, y, y_seq_len)

In [None]:
def _take_rows(bucket, idx):
    """Pick the entries of ``bucket`` at positions ``idx`` (ndarray or plain sequence)."""
    if isinstance(bucket, np.ndarray):
        return bucket[idx]
    # Plain Python sequences (e.g. lists of variable-length label sequences)
    # cannot be indexed with an index array.
    return [bucket[k] for k in idx]


def prepare_dataset(dataset, r=1.0, shuffle=True, all_indices=None): #dataset => (x,x_seq_len,y,y_seq_len)
    """Split every bucket of ``dataset`` into a training and a validation part.

    Each element of ``dataset`` is a sequence with one entry per bucket.
    Buckets are split separately, which keeps the validation set balanced
    across buckets. Buckets may be NumPy arrays or plain Python sequences.

    Args:
        dataset: Tuple ``(x, x_seq_len, y, y_seq_len)``.
        r: Fraction of every bucket that goes to the training set. With
            ``r == 1`` no validation set is produced.
        shuffle: Shuffle the sample order of every bucket before splitting
            (ignored when ``all_indices`` is given).
        all_indices: Per-bucket sample orders returned by an earlier call; pass
            them to reproduce that call's split.

    Returns:
        ``(train_dataset, val_dataset, all_indices)``. ``train_dataset`` has
        one entry per bucket, each a list with the selected part of every
        dataset element. ``val_dataset`` has the same layout but omits buckets
        that contributed no validation samples, and is ``[]`` when ``r == 1``.
        ``all_indices`` is the per-bucket sample order that was used.
    """
    print("Preparing dataset....")
    restoring_previous = all_indices is not None

    num_buckets = len(dataset[0])
    train_dataset = [[] for _ in range(num_buckets)]
    val_dataset = [[] for _ in range(num_buckets)]

    # Buckets whose samples all went to training (nothing left for validation).
    to_remove = []
    if not restoring_previous:
        all_indices = []

    for i in range(num_buckets):
        num_train = len(dataset[0][i])

        if restoring_previous:
            indices = all_indices[i]
        else:
            indices = np.arange(num_train)
            if shuffle:
                np.random.shuffle(indices)
            all_indices.append(indices)

        # The first split_at samples train, the remainder validates, so the two
        # parts never overlap.
        split_at = int(np.ceil(r * num_train))
        indices_train = indices[:split_at]
        indices_val = indices[split_at:]

        train_dataset[i] = [_take_rows(d[i], indices_train) for d in dataset]
        if split_at == num_train:
            to_remove.append(i)
        else:
            val_dataset[i] = [_take_rows(d[i], indices_val) for d in dataset]

    # Drop the buckets that have no validation samples.
    if r != 1:
        val_dataset = [vd for i, vd in enumerate(val_dataset) if i not in to_remove]
    else:
        val_dataset = []

    return train_dataset, val_dataset, all_indices


In [None]:
# Fraction of every bucket used for training; the remainder is validation.
SPLIT_RATIO = 0.80
# No earlier sample order to restore, so prepare_dataset builds a new one and
# hands it back through the third return value.
indices = None
train_dataset, val_dataset, indices = prepare_dataset(
    dataset, r=SPLIT_RATIO, shuffle=True, all_indices=indices
)

Preparing dataset....


In [None]:
# Inspect the training split.
# NOTE(review): the recorded output is an empty list, which is what
# prepare_dataset returns when the first dataset element has zero buckets.
type(train_dataset), train_dataset

(list, [])