In [22]:
# @title Import

# !pip install trdg
# !pip install easyocr

# Imports
import copy
import torch
import random
import pathlib

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchvision import transforms
from torchvision.datasets import ImageFolder

from tqdm.auto import tqdm
from IPython.display import HTML, display

from trdg.generators import GeneratorFromStrings
from PIL import Image
import os
import csv
import string

import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split

import easyocr

import editdistance

In [3]:
# @title Set random seed
# @markdown Executing `set_seed(seed=seed)` you are setting the seed

# For DL it's critical to set the random seed so that students can have a
# baseline to compare their results to expected results.
# Read more here: https://pytorch.org/docs/stable/notes/randomness.html

# Call the `set_seed` function in the exercises to ensure reproducibility.
import random
import torch

def set_seed(seed=None, seed_torch=True):
  """
  Seed Python's `random`, NumPy and (optionally) PyTorch random generators.
  NumPy and random modules must be imported.

  Args:
    seed : Integer
      A non-negative integer (below 2**32) that defines the random state.
      If `None`, a seed is drawn at random. Default is `None`.
    seed_torch : Boolean
      If `True` sets the random seed for pytorch tensors, so pytorch module
      must be imported. Default is `True`.

  Returns:
    Nothing.
  """
  if seed is None:
    # Draw with an explicit uint64 dtype so the 2**32 upper bound is
    # representable regardless of the platform's default NumPy integer, then
    # convert to a built-in int: `random.seed` only accepts built-in types
    # (a NumPy integer scalar raises TypeError on Python >= 3.11).
    seed = int(np.random.randint(0, 2 ** 32, dtype=np.uint64))
  random.seed(seed)
  np.random.seed(seed)
  if seed_torch:
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    # cuDNN flags, see the PyTorch randomness notes linked above this cell's
    # function definitions.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  print(f'Random seed {seed} has been set.')


# In case that `DataLoader` is used
def seed_worker(worker_id):
  """
  Re-seed NumPy and Python's `random` from the current torch seed.

  Intended to be passed to a `DataLoader` as `worker_init_fn`, see
  https://pytorch.org/docs/stable/data.html#data-loading-randomness

  Args:
    worker_id: integer
      ID of the subprocess being seeded (not used in the computation).

  Returns:
    Nothing
  """
  # Fold the torch seed into 32 bits, then apply it to both libraries.
  derived_seed = torch.initial_seed() % (2**32)
  for reseed in (np.random.seed, random.seed):
    reseed(derived_seed)

In [4]:
# @title Set device (GPU or CPU). Execute `set_device()`
# especially if torch modules used.

# Inform the user if the notebook uses GPU or CPU.

def set_device():
  """
  Choose the compute device (CUDA if available, CPU otherwise) and print
  which one was selected.

  Args:
    None

  Returns:
    device: string
      "cuda" when a CUDA device is available, "cpu" otherwise
  """
  if torch.cuda.is_available():
    print("GPU is enabled in this notebook.")
    return "cuda"

  # No GPU: tell the user how to enable one before falling back to the CPU.
  print("WARNING: For this notebook to perform best, "
      "if possible, in the menu under `Runtime` -> "
      "`Change runtime type.`  select `GPU` ")
  return "cpu"

In [5]:
# Seed the random generators and select the compute device for the notebook.
SEED = 2021
set_seed(seed=SEED)
DEVICE = set_device()  # "cuda" or "cpu", as returned by set_device()

Random seed 2021 has been set.


In [6]:
# @title Generate the dataset (Run once or import it)

# Directory that receives the generated images and their label file.
output_dir = "ocr_dataset"
os.makedirs(output_dir, exist_ok=True)  # no error if it already exists
# Label CSV path inside output_dir; written by save_handwritten_text_images.
csv_file = os.path.join(output_dir, "labels.csv")

# Function to generate random words
def generate_random_word(length=10):
    """Return a string of `length` characters, each picked with `random.choice`
    from the lowercase ASCII letters plus the space character."""
    alphabet = string.ascii_lowercase + ' '
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

# Function to save images with labels
def save_handwritten_text_images(output_dir, csv_file, num_samples=1000):
    """
    Render random strings to fixed-size grayscale PNG files and write a CSV
    that maps each file name to its text label.

    Args:
        output_dir: directory the PNG files are written to (created if missing).
        csv_file: path of the CSV to write, with header `image_name,label`.
        num_samples: number of images to generate. Default is 1000.

    Returns:
        Nothing. Images are saved as `1.png` ... `<num_samples>.png`.
    """
    # Make the function usable on its own, without relying on the caller
    # having created the directory first.
    os.makedirs(output_dir, exist_ok=True)

    # Generate random words
    random_words = [generate_random_word(10) for _ in range(num_samples)]

    # Create the text-image generator
    generator = GeneratorFromStrings(
        random_words,
        blur=0,  # No blur
        random_blur=False,
        distorsion_type=0,  # No distortion
        size=32,  # trdg `size` setting (originally noted as font size) - TODO confirm
        language="en"  # Language set to English
    )

    labels = []
    # Every image is resized to this fixed size; the aspect ratio of the
    # rendered text is not preserved.
    fixed_width = 256
    fixed_height = 56

    for count, (img, lbl) in enumerate(tqdm(generator, total=num_samples, desc="Creating Datas")):
        # Stop once num_samples images were written, in case the generator
        # keeps yielding beyond the supplied strings.
        if count >= num_samples:
            break
        img = img.convert("L")  # Convert image to grayscale
        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the
        # same filter under its current name.
        img = img.resize((fixed_width, fixed_height), Image.LANCZOS)
        # File names are 1-based: 1.png, 2.png, ...
        img_name = f"{count+1}.png"
        img.save(os.path.join(output_dir, img_name))
        labels.append((img_name, lbl))

    # Save labels to CSV
    with open(csv_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["image_name", "label"])
        writer.writerows(labels)

# Generate and save images
# Number of image/label pairs to generate (user-tunable setting).
num_samples = 10000 ## USER 20 000
# Writes the PNG files into output_dir and the labels into csv_file.
save_handwritten_text_images(output_dir, csv_file, num_samples=num_samples)

Creating Datas:   0%|          | 0/10000 [00:00<?, ?it/s]

In [8]:
# @title Archive the dataset into a zip file and download it

# Colab-only cell: requires the google.colab package and IPython magics.
from google.colab import drive
drive.mount('/content/drive')

# Change to /content and recursively zip the ocr_dataset directory.
%cd /content
!zip -r ocr_dataset.zip ocr_dataset
from google.colab import files
# Hand the archive to the Colab download helper.
files.download('ocr_dataset.zip')

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  adding: ocr_dataset/6300.png (stored 0%)
  adding: ocr_dataset/7455.png (stored 0%)
  adding: ocr_dataset/5750.png (stored 0%)
  adding: ocr_dataset/5898.png (stored 0%)
  adding: ocr_dataset/2060.png (stored 0%)
  adding: ocr_dataset/5013.png (stored 0%)
  adding: ocr_dataset/8480.png (stored 0%)
  adding: ocr_dataset/6407.png (stored 0%)
  adding: ocr_dataset/6169.png (stored 0%)
  adding: ocr_dataset/9633.png (stored 0%)
  adding: ocr_dataset/5397.png (stored 0%)
  adding: ocr_dataset/4194.png (stored 0%)
  adding: ocr_dataset/5111.png (stored 0%)
  adding: ocr_dataset/6590.png (stored 0%)
  adding: ocr_dataset/4324.png (stored 0%)
  adding: ocr_dataset/8833.png (stored 0%)
  adding: ocr_dataset/651.png (stored 0%)
  adding: ocr_dataset/927.png (stored 0%)
  adding: ocr_dataset/3584.png (stored 0%)
  adding: ocr_dataset/9885.png (stored 0%)
  adding: ocr_dataset/9307.png (stored 0%)
  addin

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
# @title Data Loader
class CustomImageDataset(Dataset):
    """
    Dataset of (image, label) pairs described by a CSV file.

    The first CSV column holds the image file name (relative to `img_dir`)
    and the second column holds the text label.

    Args:
        csv_file: path to the label CSV (first row is a header).
        img_dir: directory containing the image files.
        transform: optional callable applied to each PIL image.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        # Read every column as text and disable NaN inference, so labels such
        # as "nan" or "null" (or purely numeric ones) stay plain strings.
        self.img_labels = pd.read_csv(csv_file, dtype=str, keep_default_na=False)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the label table."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return `(image, label)` for row `idx`; `image` is transformed if a transform was given."""
        img_name = self.img_labels.iloc[idx, 0]
        label = self.img_labels.iloc[idx, 1]
        img_path = os.path.join(self.img_dir, img_name)

        # Copy the pixels into memory inside the context manager so the file
        # handle is released even when no transform forces the image to load.
        with Image.open(img_path) as handle:
            image = handle.copy()

        if self.transform:
            image = self.transform(image)

        return image, label

# Optional corruption applied to each image during preprocessing.
# Value of `tsf` -> corruption:
# -1 : none
#  0 : random rotation
#  1 : random affine transform
#  2 : Gaussian blur
# ...

## USER
tsf = -1 # index into corr_list selecting the corruption; -1 applies no corruption
p = 0.1 # corruption intensity; p*20 gives the rotation/affine degrees and the blur kernel size
##

def float_to_odd_number(float_value):
    """
    Map a float to a positive odd integer (used as a Gaussian-blur kernel size).

    The value is rounded to the nearest integer; an even result is moved one
    step towards `float_value` (up if the rounded value is below it, down
    otherwise). The result is never smaller than 1, so inputs at or below
    zero yield 1 instead of a non-positive number.

    Args:
        float_value: the number to convert.

    Returns:
        An odd integer >= 1.
    """
    n = int(round(float_value))
    if n % 2 == 0:
        n = n + 1 if n < float_value else n - 1
    # Clamp: the smallest usable odd size is 1.
    return max(n, 1)

# Candidate corruptions, indexed by `tsf`; each one is scaled by `p`.
corr_list = [
    transforms.RandomRotation(degrees=p*20),
    transforms.RandomAffine(degrees=p*20),
    transforms.GaussianBlur(float_to_odd_number(p*20)),
]

# Preprocessing pipeline: the selected corruption (none when tsf == -1)
# followed by the conversion to a tensor.
transform = transforms.Compose(
    ([] if tsf == -1 else [corr_list[tsf]]) + [transforms.ToTensor()]
)

# Create an instance of the custom dataset
# Locate the generated dataset (pathlib keeps the paths compatible with all OS's).
csv_file = pathlib.Path('.')/'ocr_dataset/labels.csv'
img_dir = pathlib.Path('.')/'ocr_dataset'

dataset = CustomImageDataset(csv_file=csv_file, img_dir=img_dir, transform=transform)

# Split dataset into training (20%), validation (70%) and test (remainder, ~10%) sets.
dataset_size = len(dataset)
train_size = int(0.20 * dataset_size)
val_size = int(0.70 * dataset_size)
# The test set takes whatever is left, so the three sizes always add up to
# dataset_size (random_split raises when they do not, which the truncating
# int() calls could cause). It is kept small because evaluation is slow.
test_size = dataset_size - train_size - val_size

# Seeded generator so the same samples land in each subset on every run,
# independently of how much global randomness was consumed before this cell.
split_generator = torch.Generator().manual_seed(SEED)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

# Create DataLoaders for each dataset
batch_size = 32  # Adjust batch size as needed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [36]:
# @title Initialize easyocr (Only once)
# This needs to run only once to load the model into memory
# EasyOCR reader created with the ['en'] language list; later cells reuse `reader`.
reader = easyocr.Reader(['en'])




In [48]:
# @title Helper function

# Function to calculate Character Error Rate (CER)
# CER is calculated as the number of character-level errors (insertions, deletions, substitutions)
# divided by the total number of characters in the ground truth text.
def calculate_cer(gt, pred):
    """
    Character Error Rate: edit distance between `gt` and `pred` divided by
    the number of characters in `gt`.

    Args:
        gt: ground-truth text.
        pred: predicted text.

    Returns:
        The error rate as a float. For an empty `gt` the denominator is taken
        as 1 (instead of raising ZeroDivisionError), so the result is 0.0 when
        `pred` is empty too and the raw edit distance otherwise.
    """
    return editdistance.eval(gt, pred) / max(len(gt), 1)

# Function to calculate Word Error Rate (WER)
def calculate_wer(gt, pred):
    """
    Word Error Rate: edit distance between the whitespace-separated words of
    `gt` and `pred`, divided by the number of words in `gt`.

    Args:
        gt: ground-truth text.
        pred: predicted text.

    Returns:
        The error rate as a float. When `gt` contains no words (empty or
        all-whitespace) the denominator is taken as 1 instead of raising
        ZeroDivisionError.
    """
    gt_words = gt.split()
    pred_words = pred.split()
    return editdistance.eval(gt_words, pred_words) / max(len(gt_words), 1)

# Function to evaluate the EasyOCR model
def evaluate_easyocr(reader, dataloader):
    """
    Run an EasyOCR reader over a dataloader and compute aggregate metrics.

    Args:
        reader: object exposing `readtext(image, detail=0)` that returns a
            list of recognized strings.
        dataloader: iterable of `(images, labels)` batches; each image is a
            PIL image or a tensor, each label a string.

    Returns:
        (accuracy, cer, wer):
            accuracy - fraction of samples whose prediction equals the label.
            cer - total character edit distance / total label characters.
            wer - total word edit distance / total label words.
        Each ratio uses a denominator of at least 1, so an empty dataloader
        (or labels without characters/words) yields 0.0 instead of raising
        ZeroDivisionError.
    """
    total_chars = 0
    total_words = 0
    total_char_errors = 0
    total_word_errors = 0
    correct = 0
    total = 0

    for images, labels in tqdm(dataloader, desc="Evaluating"):
        for img, label in zip(images, labels):
            img = _to_uint8_image(img)

            result = reader.readtext(img, detail=0)  # detail=0: recognized text only
            prediction = ' '.join(result).strip()

            # Exact-match accuracy.
            # NOTE(review): the prediction is stripped but the label is
            # compared verbatim, so a label with leading/trailing spaces can
            # never match exactly - confirm this is intended.
            if prediction == label:
                correct += 1
            total += 1

            # Accumulate raw edit counts (not per-sample rates) so the final
            # rates are weighted by label length and stay exact integers
            # until the last division.
            total_char_errors += editdistance.eval(label, prediction)
            total_chars += len(label)

            label_words = label.split()
            total_word_errors += editdistance.eval(label_words, prediction.split())
            total_words += len(label_words)

    accuracy = correct / max(total, 1)
    cer = total_char_errors / max(total_chars, 1)
    wer = total_word_errors / max(total_words, 1)
    return accuracy, cer, wer


def _to_uint8_image(img):
    """Convert a PIL image or a (C, H, W) tensor to a NumPy array for EasyOCR.

    Tensors are assumed to hold values in [0, 1] (as produced by `ToTensor`)
    - confirm for other pipelines. Any other input is returned unchanged.
    """
    if isinstance(img, Image.Image):
        return np.array(img)
    if isinstance(img, torch.Tensor):
        # detach/cpu make this work for tensors that are not plain CPU tensors.
        scaled = img.detach().cpu().numpy().transpose(1, 2, 0) * 255  # (C, H, W) -> (H, W, C)
        # Round before casting: a bare uint8 cast truncates, so a value such
        # as 254.99998 would become 254.
        return np.clip(np.rint(scaled), 0, 255).astype('uint8')
    return img

def test(model, test_loader, loader='Test', criterion=F.nll_loss,
         device='cpu'):
  """
  Tests the current model

  Args:
    model: nn.module
      Neural network instance
    device: string
      GPU/CUDA if available, CPU otherwise
    test_loader: torch.loader
      Test dataset
    criterion: function
      Specifies loss function [default: nll_loss]

  Returns:
    test_loss: float
      Test loss
  """
  model.eval()
  test_loss = 0
  correct = 0
  with torch.no_grad():
    for data, target in test_loader:
      data, target = data.to(device), target.to(device)
      output = model(data)
      test_loss += criterion(output, target, reduction='sum').item()  # sum up batch loss
      pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability
      correct += pred.eq(target.view_as(pred)).sum().item()

  test_loss /= len(test_loader.dataset)
  return 100. * correct / len(test_loader.dataset)

In [49]:
# @title Evaluation of a given model

# Evaluate the model easyocr
# Run EasyOCR on the test split and report the three aggregate metrics.
test_accuracy, test_cer, test_wer = evaluate_easyocr(reader, test_loader)
print(f'Test Accuracy: {test_accuracy:.4f}') # fraction of exact matches (0-1); ideally 1.0
print(f'Test CER: {test_cer:.4f}') # character error rate as a fraction; ideally 0
print(f'Test WER: {test_wer:.4f}') # word error rate as a fraction; ideally 0

Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

Test Accuracy: 0.6230
Test CER: 0.1302
Test WER: 0.3039
