# Perform OCR on SmartDoc documents
Performs OCR on SmartDoc document images using Tesseract (via `pytesseract`).

In [None]:
import os
from tqdm import tqdm
from glob import glob
import pytesseract
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Tell pytesseract which Tesseract executable to launch.
# NOTE(review): the '*' path segment looks like a redacted placeholder —
# replace it with the real install directory before running.
pytesseract.pytesseract.tesseract_cmd  = r'C:\*\Tesseract-OCR\tesseract.exe'

In [None]:
def add_gaussian_noise(img: Image.Image, mean=0, std=50) -> Image.Image:
    """Return a copy of *img* with additive Gaussian noise on every value.

    Args:
        img: Source PIL image. It is read into a float32 array, so the
            input image itself is not modified.
        mean: Mean of the normal distribution the noise is drawn from.
        std: Standard deviation of that distribution.

    Returns:
        A new PIL image built from the noisy array, stored as uint8.
    """
    pixels = np.asarray(img, dtype=np.float32)
    noise = np.random.normal(mean, std, pixels.shape)
    # Clip before the uint8 cast so out-of-range values saturate at 0/255
    # instead of wrapping around.
    noisy = np.clip(pixels + noise, 0, 255).astype(np.uint8)
    return Image.fromarray(noisy)


def drop_resolution(img, scale_factor=8):
    """Simulate a low-resolution capture while keeping the original size.

    The image is shrunk by ``scale_factor`` with box resampling and then
    scaled back up to its original size with nearest-neighbour resampling.

    Args:
        img: PIL image to degrade.
        scale_factor: Positive divisor applied to both width and height.

    Returns:
        A new image with the same size as *img*.

    Raises:
        ValueError: If ``scale_factor`` is not positive.
    """
    if scale_factor <= 0:
        raise ValueError(f"scale_factor must be positive, got {scale_factor!r}")

    # Floor division reaches 0 once scale_factor exceeds a dimension, and an
    # empty target size cannot be resized to, so keep at least one pixel
    # per axis.
    small_size = (
        max(1, img.width // scale_factor),
        max(1, img.height // scale_factor),
    )
    shrunk = img.resize(small_size, Image.Resampling.BOX)  # Downscale
    return shrunk.resize(img.size, Image.Resampling.NEAREST)  # Upscale

def crop(img: Image.Image, size=300) -> Image.Image:
    """Cut a ``size`` x ``size`` square out of the centre of *img*.

    Args:
        img: PIL image to crop.
        size: Edge length of the square, in pixels.

    Returns:
        The region returned by ``img.crop``. The box is not clamped to the
        image bounds, so it extends past the edges when ``size`` is larger
        than the image.
    """
    width, height = img.size
    # Centre the square; floor division puts any odd leftover pixel on the
    # right/bottom margin.
    left = (width - size) // 2
    top = (height - size) // 2
    return img.crop((left, top, left + size, top + size))

def get_text_SmartDoc(fn, scale_factor=None, plot=False, ocr=True):
    """Load a SmartDoc image, optionally degrade and preview it, and run OCR.

    Args:
        fn: Path of the image file to open.
        scale_factor: When not ``None``, the image is passed through
            ``drop_resolution`` with this factor before plotting and OCR.
        plot: When true, show a centre crop next to the full image.
        ocr: When true, run Tesseract on the image.

    Returns:
        The string from ``pytesseract.image_to_string`` when ``ocr`` is
        true, otherwise ``None``.
    """
    # Open through a context manager so the file handle is released right
    # away; rotate() hands back a rotated copy that we keep working with.
    with Image.open(fn) as source:
        # The images in this dataset are all sideways.
        img = source.rotate(-90, expand=True)

    if scale_factor is not None:
        img = drop_resolution(img, scale_factor=scale_factor)

    if plot:
        _, axes = plt.subplots(nrows=1, ncols=2)
        # Left panel: centre crop for detail; right panel: the whole page.
        for axis, picture in zip(axes, (crop(img), img)):
            axis.imshow(picture)
            axis.axis('off')
        plt.show()

    if not ocr:
        return None
    return pytesseract.image_to_string(img)


def save_SmartDoc_ocr(fn, save_path, scale_falctor=None, plot=False):
    """OCR one SmartDoc image and write the text to ``<save_path>/<stem>.txt``.

    Args:
        fn: Path of the input image.
        save_path: Directory for the output file; created if it is missing.
        scale_falctor: Forwarded to ``get_text_SmartDoc`` as ``scale_factor``.
            The misspelled name is kept so existing keyword callers keep
            working.
        plot: Forwarded to ``get_text_SmartDoc``.
    """
    text = get_text_SmartDoc(fn, scale_factor=scale_falctor, plot=plot)

    # splitext drops only the final extension, whatever its case
    # (e.g. '.JPG'), and leaves any '.jpg' inside the name untouched.
    stem = os.path.splitext(os.path.basename(fn))[0]
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # OCR output can contain characters outside the platform's default
    # encoding, so write UTF-8 explicitly.
    file_path = os.path.join(save_path, f"{stem}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text)


# Output directory for the .txt files and input directory holding the images.
# NOTE(review): '*' looks like a placeholder — set real paths before running.
save_path = r'*'
input_path = r'*'

# Collect the .jpg files that sit directly inside input_path.
input_fns = glob(os.path.join(input_path, '*.jpg'))

# OCR each image in turn, with a tqdm progress bar over the file list.
for input_fn in tqdm(input_fns):
    save_SmartDoc_ocr(input_fn, save_path=save_path)