In [None]:
#%pip install pytesseract

### Imports

In [1]:
import pytesseract
# Tell pytesseract which Tesseract executable to launch on this machine.
# NOTE(review): this is a raw string, so every `\\` stays as two backslash
# characters in the resulting path - confirm the doubled separators are intended.
pytesseract.pytesseract.tesseract_cmd = r'E:\\TesseractOCR\\tesseract.exe'
from pytesseract import Output 

import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
import cv2
import os

### Dataset
Custom `Dataset` class used to load the train and test images

In [2]:
class CustomOCRDataset (Dataset):
    """Dataset that serves RGB PIL images loaded from a list of image paths.

    Args:
        image_paths: Indexable collection of paths; each entry is opened with
            ``Image.open`` when the item is requested.
        transform: Optional callable applied to every loaded RGB image before
            it is returned. With ``None`` (the default) the image is returned
            unchanged.
    """

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths

        # Optional per-image callable, applied in __getitem__
        self.transform = transform

    def __getitem__(self, index):
        """Load the image at ``index``, convert it to RGB and apply ``transform`` if set."""
        img_path = self.image_paths[index]

        # Load image as PIL and ensure RGB format. The context manager releases
        # the file handle; convert() hands back a separate image object, so the
        # result stays usable after the file is closed.
        with Image.open(img_path) as source:
            image = source.convert('RGB')

        # Add resizing and other preprocessing as needed below
        # -
        # -
        # -

        # Previously the stored transform was never used; apply it when given.
        if self.transform is not None:
            image = self.transform(image)

        return image

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.image_paths)



In [3]:
# Dataset locations used by this notebook
# SignverOD dataset (physical signatures)
SignverOD_path = "E:\\PROJECTS\\Personal Projects\\AI-Powered-Contract-Auditing-System\\Datasets\\SignverOD\\images"
image_paths = [SignverOD_path]

# Dataset construction is currently disabled.
# NOTE(review): the call below names `image_path`, but the list defined above is
# `image_paths`; its only entry is a folder, while CustomOCRDataset hands every
# entry to Image.open - confirm before enabling.
#dataset = CustomOCRDataset(image_path, transform=None)

### Image Preprocessing

In [None]:
def imgToTxt(img):
    """Extract text from an image.

    Passes ``img`` to ``pytesseract.image_to_string`` and returns its result.
    """
    return pytesseract.image_to_string(img)

def getGrayScale(img):
    """Convert a BGR image to grayscale in order to make text clearer.

    Args:
        img: Image array exposing ``ndim`` (e.g. the arrays produced by
            ``cv2.imread``).

    Returns:
        The grayscale image. A two-dimensional input is treated as already
        grayscale and returned unchanged, since the BGR-to-gray conversion
        needs a colour image.
    """
    # readImagesFromFolder accepts 2-D images as valid, so tolerate them here.
    if img.ndim == 2:
        return img

    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

def removeNoise(img, ksize=5):
    """Suppress noise in an image by applying a median blur.

    Args:
        img: Image passed straight to ``cv2.medianBlur``.
        ksize: Aperture size handed to ``cv2.medianBlur``. Defaults to 5, the
            value that used to be hard-coded. OpenCV documents that it must be
            odd and greater than 1.

    Returns:
        The blurred image returned by ``cv2.medianBlur``.
    """
    return cv2.medianBlur(img, ksize)

def thresholding(img):
    """Binarise an image so it is pure black and white, easing text extraction.

    Calls ``cv2.threshold`` with the THRESH_BINARY and THRESH_OTSU flags
    combined, a threshold argument of 0 and a maximum value of 255, and returns
    the image part of the result.
    """
    otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary_img = cv2.threshold(img, 0, 255, otsu_binary)
    return binary_img

def resizeImg(img, size=(128, 128)):
    """Resize an image, to 128 x 128 unless another size is given.

    Args:
        img: Image passed straight to ``cv2.resize``.
        size: Target size handed to ``cv2.resize`` as its ``dsize`` argument,
            which OpenCV interprets as ``(width, height)``. Defaults to
            ``(128, 128)``, the value that used to be hard-coded.

    Returns:
        The resized image returned by ``cv2.resize``.
    """
    return cv2.resize(img, size)


#### Testing

In [5]:
def readImagesFromFolder(path=None):
    """Read all image files located directly inside a folder.

    Args:
        path: Folder to scan. ``None`` scans the current working directory.

    Returns:
        list: One entry per file that has a .png, .jpg or .jpeg extension
        (case-insensitive) and that ``cv2.imread`` could decode, in sorted
        filename order. The list is empty when the folder does not exist or
        is not a directory; a message is printed in those cases.
    """
    images = []  # Stores all read images
    allowed_extensions = ('.png', '.jpg', '.jpeg')  # Allowed image extensions

    # os.listdir(None) lists the current directory, but os.path.join(None, ...)
    # raises, so resolve the default to an explicit folder first.
    folder = os.curdir if path is None else path

    try:
        filenames = sorted(os.listdir(folder))
    except FileNotFoundError:
        print(f"File not found in folder {folder}")
        return images
    except NotADirectoryError:
        print(f"Invalid directory {folder}")
        return images

    # Go through the specified folder and store all image files
    for filename in filenames:
        # Use a separate variable for the file path: overwriting the folder
        # path made every later file get joined onto the previous file's path.
        file_path = os.path.join(folder, filename)

        if not os.path.isfile(file_path):
            continue

        try:
            _, extension = os.path.splitext(filename)
            # Allow only the valid extensions for image files
            if extension.lower() not in allowed_extensions:
                continue

            img = cv2.imread(file_path)

            # cv2.imread yields None when it cannot decode the file
            if img is not None and img.ndim in (2, 3):
                images.append(img)

        except Exception as e:
            print(f"Error reading file: {filename} - {e}")

    return images

# Folder scanned for the test images
path = "E:\\PROJECTS\\Personal Projects\\AI-Powered-Contract-Auditing-System\\Datasets"
image_files = readImagesFromFolder(path)
print(len(image_files))


# OCR every image after grayscale -> threshold -> median blur.
# The loop variable stays named `img` because the next cell reads it afterwards.
for img in image_files:
    print("----------------------------------------------------------------------------")
    img = removeNoise(thresholding(getGrayScale(img)))
    print(imgToTxt(img))



1
----------------------------------------------------------------------------
Mr. Scott Yasuda Page 2

CONFIDENTIAL

As you are no doubt aware, MARLBORO is the world's leading
cigarette and one of the most famous of all American trademarks.
Moreover, while cigarettes are our major product, substantial
business is done in several areas and, as you are also, no doubt,
aware, MARLBORO sponsors several race cars and racing teams
around the world, including the world champion MARLBORO-McLaren

racing tean.

MARLBORO, the MARLBORO Roof Design, and the combination
thereof are all trademarks owned by Philip Morris. In addition,
we also own United States Registration No. 1400689 for MARLBORO
WORLD CHAMPIONSHIP TEAM (& racing car), (& Red Roof Design),
which covers “entertainment services, namely conducting car
racing events." Similar rights are in existence in almost every
country world-wide. Accordingly, the usage of our trademarks and
those confusingly similar therewith infringe our trademar

#### Text Localisation and Detection

In [6]:
# Run word-level OCR on the image left in `img` by the previous cell and draw a
# labelled box around every confidently recognised word.
# `img` is single-channel once it has been through getGrayScale/thresholding, and
# the BGR->RGB conversion needs a colour image, so pick the matching conversion.
if img.ndim == 2:
    rgb = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
else:
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
results = pytesseract.image_to_data(rgb, output_type=Output.DICT)

print(results.keys())

MIN_CONFIDENCE = 60  # You can adjust the confidence threshold

n_boxes = len(results['text'])
for i in range(n_boxes):
    # Depending on the pytesseract/Tesseract versions, 'conf' entries can be ints
    # or decimal strings such as '96.48'; int() alone raises on the latter.
    confidence = int(float(results['conf'][i]))
    if confidence > MIN_CONFIDENCE:
        (x, y, w, h) = (results['left'][i], results['top'][i], results['width'][i], results['height'][i])
        text = results['text'][i]
        # NOTE(review): the boxes are drawn on `img` itself; if it is single-channel
        # the (0, 255, 0) colour presumably will not render as green - confirm.
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)  # Draw a green rectangle
        cv2.putText(img, text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

#cv2.imshow("Text Detection", img)
#cv2.waitKey(0)
#cv2.destroyAllWindows()

dict_keys(['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text'])
