In [167]:
alphabet = "ةأىآإ ابتثجحخدذرزسشصضطظعغفقكلمنهوي٠١٢٣٤٥٦٧٨٩0123456789ئءؤ"
# Create a mapping between characters and their indices
char2index = {char: idx for idx, char in enumerate(alphabet)}
index2char = {idx: char for idx, char in enumerate(alphabet)}
char2index[" "]

5

# Utility functions

In [2]:
def encode(text):
    # target_tensors =[]
   
    token_indices = [char2index[char] for char in text]
#     target_tensors.append(token_indices)

    return token_indices
    

In [3]:
import re
import unicodedata

def remove_arabic_diacritics(text):
    # Remove Arabic diacritics using Unicode normalization
    normalized_text = unicodedata.normalize('NFD', text)
    cleaned_text = ''.join([c for c in normalized_text if not unicodedata.combining(c)])
    
    return cleaned_text

# Sample text with Arabic diacritics
text_with_diacritics = "مَرْحَبًا بِكُم"

# Remove diacritics
cleaned_text = remove_arabic_diacritics(text_with_diacritics)

print(cleaned_text)  # Output: "مرحبا بكم"


مرحبا بكم


In [4]:
import re

def remove_english_letters(text):
    # Define a regular expression pattern to match English letters
    pattern = r'[a-zA-Z]'

    # Use re.sub() to remove English letters from the text
    cleaned_text = re.sub(pattern, '', text)

    return cleaned_text

# Sample text with mixed Arabic and English letters
mixed_text = "مَرْحَبًا بِHello كُم"

# Remove English letters
cleaned_text = remove_english_letters(mixed_text)

print(cleaned_text)  # Output: "مَرْحَبًا بِ كُم"


مَرْحَبًا بِ كُم


In [5]:


def pad_list(lst, desired_length , value):
    current_length = len(lst)

    if current_length < desired_length:
        # Calculate the number of zeros needed
        num_zeros = desired_length - current_length

        # Add zeros to the end of the list
        lst.extend([value] * num_zeros)

    return lst

# Define Dataset

In [144]:
import os
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader , random_split
import tqdm 

import torch
# from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, data_dir,  transform=None):
        self.data_dir = data_dir
        self.files = os.listdir(data_dir)
        self.samples = []
        self.alphapet = [
                'أ', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي',
                '٠', '١', '٢', '٣', '٤', '٥', '٦', '٧', '٨', '٩'
            ]
        self.max_length = 64
    
        # self.vectorizer = CountVectorizer(analyzer='char')
        for filename in self.files:
            if filename.endswith('.jpg'):
                base_name = os.path.splitext(filename)[0]
                txt_name = base_name + '.txt'
                if txt_name in self.files:
                    self.samples.append( (filename, txt_name) )# ,CustomDataset.read_text(self.data_dir + txt_name) ))

        

        self.transform = transform

    def __len__(self):

        return len(self.samples)

    def __getitem__(self, idx):
        img_name, txt_name  = self.samples[idx]
        img_path = os.path.join(self.data_dir, img_name)
        txt_path = os.path.join(self.data_dir, txt_name)
        
        image = Image.open(img_path)
        with open(txt_path, 'r') as txt_file:
            text = txt_file.read()
        
        if self.transform:
            image = self.transform(image)
        
        # print(txt_raw + "asdsss"  )
        # print(self.vectorizer.transform([txt_raw]))

        # raw_text = CustomDataset.read_text(self.data_dir  + txt_name)
        cleaned_text = remove_arabic_diacritics(text)
        cleaned_text = remove_english_letters(cleaned_text)
        encoded_text = encode(cleaned_text)
        # try:
        
        #     print(encoded_text.shape)
        # except Exception as ex:
        #     print(f"asdas {len(encoded_text)}")


        if len(encoded_text) > self.max_length :
            encoded_text = encoded_text[:self.max_length]   ## truncate
        elif len(encoded_text) < self.max_length:
            encoded_text = pad_list(encoded_text ,self.max_length ,char2index[" "])                      ## Padd
        
        encoded_text = torch.tensor(encoded_text)
        # print(len(encoded_text) , encoded_text)
        return {'image': image, 'text': text, 'embeddings' : encoded_text  }
    


    @staticmethod
    def read_text(txt_path):
        with open(txt_path, 'r') as txt_file:
                text = txt_file.read()

        return text


# Create Dataset and Dataloaders

In [145]:
# Set your data directory and other relevant parameters
data_directory = '../data/OCR_Text_Dataset/OCR_Text/'
batch_size = 16


# Define transformation for the images (you can modify this based on your needs)
image_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    # transforms.Grayscale() ,
        
    transforms.ToTensor(),
])

# Create the dataset and data loaders
custom_dataset = CustomDataset(data_dir=data_directory, transform=image_transform)

train_size = int(0.8 * len(custom_dataset))
test_size = len(custom_dataset) - train_size



train_dataset, test_dataset = random_split(custom_dataset, [train_size, test_size])

train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)





### Check a batch from dataloader

In [146]:
batch = next(iter(train_dataloader))
batch.keys() , batch['embeddings'].shape

(dict_keys(['image', 'text', 'embeddings']), torch.Size([16, 64]))

In [147]:
cleaned_text = remove_arabic_diacritics(batch["text"][0])
cleaned_text = remove_english_letters(cleaned_text)
encoded_text = encode(cleaned_text)
len(encoded_text)

51

# Build Model

In [None]:
import torch
import torchvision
import torch.nn as nn

class CRNN(nn.Module):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.batch_norm = nn.BatchNorm2d(256)
        self.conv4 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))
        self.gru = nn.GRU(256, 256, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.relu(x)
        x = self.batch_norm(x)
        x = self.conv4(x)
        x = self.relu(x)
        x = self.pool3(x)
        
        # Prepare data for the GRU
        x = x.squeeze(2)  # Remove the second dimension
        x = x.permute(0, 2, 1)  # Permute to (batch_size, seq_len, features)

        # GRU layer
        x, _ = self.gru(x)

        # FC layer
        x = self.fc(x)

        return x

# Initialize the CRNN model
num_classes = len(alphabet)  # Number of output classes (characters)
crnn = CRNN(num_classes)



In [191]:
import matplotlib.pyplot as plt
image_transform = transforms.Compose([
    # transforms.Resize((256, 256)),
    # transforms.Grayscale() ,
        
    transforms.ToTensor(),
])

crnn(image_transform(plt.imread("black_text_extracted.jpg")).unsqueeze(0))

RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 4 is not equal to len(dims) = 3

In [176]:
crnn(torchvision.io.read_image("in"))

RuntimeError: Input type (unsigned char) and bias type (float) should be the same

In [141]:
output.shape , hn.shape , cn.shape , batch["image"].shape

(torch.Size([16, 128]),
 torch.Size([2, 64]),
 torch.Size([2, 64]),
 torch.Size([16, 3, 256, 256]))

In [166]:
import h5py



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# Training loop
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(2):
    for data in train_dataloader:  # Iterate through your dataset
        images = data["image"]
        encoded_labels = data["embeddings"]
        optimizer.zero_grad()  # Zero the gradients
        outputs , (_,_) = model(images)
        
        # Calculate CTC loss
        input_lengths = torch.full((batch_size,), outputs.shape[0], dtype=torch.int32)
        target_lengths = torch.full((batch_size,), len(encoded_labels[0]), dtype=torch.int32)
        loss = loss(outputs, encoded_labels, input_lengths, target_lengths)
        

In [150]:
outputs.shape

torch.Size([16, 128])

# Character recognition

In [165]:
import cv2
import os

def img_to_chars(img_path, output_directory):
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Load the image
    image = cv2.imread(img_path)

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply thresholding to create a binary image
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Find contours in the binary image
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Define a function to extract and save individual letters
    def extract_letters(contour, img, index):
        x, y, w, h = cv2.boundingRect(contour)
        letter = img[y:y+h, x:x+w]

        # Resize the letter to a fixed size if needed
        letter = cv2.resize(letter, (50, 50))  # Adjust dimensions as needed

        # Convert the single-channel grayscale letter to a three-channel RGB image
        letter_rgb = cv2.cvtColor(letter, cv2.COLOR_GRAY2RGB)

        # Save the letter as an image
        letter_filename = os.path.join(output_directory, f'letter_{index}.png')
        cv2.imwrite(letter_filename, letter_rgb)

    # Process each contour (letter) and save it as a separate image
    for i, contour in enumerate(contours):
        extract_letters(contour, gray, i)

out = img_to_chars("inverted_image.jpg" , "outputs/")


In [163]:
import cv2
import numpy as np

def extract_white_letters(image_path):
    # Load the image
    image = cv2.imread(image_path)

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Threshold the image to create a binary mask of white text on a black background
    _, binary_mask = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

    # Find contours in the binary mask
    contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Initialize a list to store individual letter images
    letter_images = []

    # Iterate through the contours and extract individual letters
    for contour in contours:
        # Get the coordinates of the bounding box around the letter
        x, y, w, h = cv2.boundingRect(contour)

        # Extract the individual letter from the image using the bounding box
        letter = image[y:y+h, x:x+w]

        # Append the letter image to the list
        letter_images.append(letter)

    return letter_images

# Example usage:
input_image_path = 'black_text_extracted.jpg'
letters = extract_white_letters(input_image_path)

# Save each extracted letter as a separate image
for i, letter in enumerate(letters):
    cv2.imwrite(f'letter_{i}.jpg', letter)


In [164]:
import cv2
import numpy as np

# Load the image using OpenCV
image = cv2.imread('black_text_extracted.jpg')

# Convert the image to grayscale if it's in color (optional)
# gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Invert the colors (reverse black and white)
inverted_image = cv2.bitwise_not(image)

# # Display the inverted image
# cv2.imshow('Inverted Image', inverted_image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

# Save the inverted image if needed
cv2.imwrite('inverted_image.jpg', inverted_image)

True