## Evaluate Synthetic OCR Dataset Model

Description: The training session on Google Colab timed out before the model could be evaluated, so the evaluation is performed in this notebook instead.

#### Install Necessary & Missing Libraries

In [1]:
%pip install torch transformers evaluate tqdm jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting jiwer
  Downloading jiwer-3.0.1-py3-none-any.whl (21 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199

#### Import Necessary Libraries

In [2]:
import os, sys

import pandas as pd

from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import VisionEncoderDecoderModel, TrOCRProcessor

import evaluate

import PIL.Image
from PIL import Image

#### Access to HuggingFace Hub

In [3]:
# Interactive login to the Hugging Face Hub: the command prompts for an access
# token. The results are pushed to the Hub at the end of this notebook.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential he

#### Mount Google Drive

In [4]:
# Colab-specific helper module; this cell only works inside a Google Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under
# /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


#### Ingest Annotations File

In [5]:
# Root folder of the project on the mounted Google Drive.
parent_dir = "/content/drive/MyDrive/Colab Notebooks/20,000 Synthetic Sample OCR/"

# Read the test annotations and normalise the column names to the ones the
# dataset class reads ('file_name' and 'text').
test_df = (
    pd.read_csv(os.path.join(parent_dir, "test.csv"))
    .rename(columns={"image name": "file_name", "label": "text"})
)

# rename() silently ignores source columns that do not exist, so check here that
# the expected columns are present instead of failing later inside the DataLoader.
missing_columns = {"file_name", "text"} - set(test_df.columns)
assert not missing_columns, f"test.csv is missing expected columns: {sorted(missing_columns)}"

test_df

Unnamed: 0,file_name,text
0,img_0.jpg,O8SOB/S7DLVC0IL38N
1,img_1.jpg,0TRVD
2,img_2.jpg,LMM15C/4MO/
3,img_3.jpg,RV88SCBB0TI7ASNT2HH
4,img_4.jpg,N/5V0VPSM/DH
...,...,...
4995,img_4995.jpg,3P35SDRD
4996,img_4996.jpg,ON7287PO
4997,img_4997.jpg,456DRTT/RD9A8PNDL5V
4998,img_4998.jpg,RD5M93B4BCS2MB99


#### Define Data Class

In [6]:
class Synthetic_Dataset_OCR_Dataset(Dataset):
    """Map-style dataset pairing OCR images with their tokenized target text.

    Args:
        root_dir: Directory that contains the image files. A trailing path
            separator is optional.
        df: DataFrame with a 'file_name' column (image file names relative to
            ``root_dir``) and a 'text' column (ground-truth strings).
        processor: Callable that turns an RGB image into an object exposing
            ``pixel_values`` and that has a ``tokenizer`` attribute.
        max_target_length: Length every label sequence is padded to.

    Each item is a dict with:
        'pixel_values': the processed image with its leading batch dimension removed.
        'labels': a tensor of token ids in which PAD positions are replaced by -100.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and encode the sample at position ``idx``."""
        # Positional lookup: idx is a position in [0, len), which only coincides
        # with the index labels when the frame has a default RangeIndex.
        file_name = self.df['file_name'].iloc[idx]
        text = self.df['text'].iloc[idx]

        # some file names end with jp instead of jpg; restore the missing letter
        if file_name.endswith('jp'):
            file_name = file_name + 'g'

        # prepare image (i.e. resize + normalize); join the path so root_dir works
        # with or without a trailing separator, and close the file once converted
        image_path = os.path.join(self.root_dir, file_name)
        with PIL.Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # add labels (input_ids) by encoding the text
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        # squeeze(0) removes only the leading batch dimension added by the processor
        encoding = {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}
        return encoding

#### Instantiate Model

In [7]:
# Fine-tuned TrOCR checkpoint on the Hugging Face Hub that is being evaluated.
MODEL_NAME = "DunnBC22/trocr-base-printed-synthetic_dataset_ocr"

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Download the checkpoint and move its weights to the selected device.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
model.to(DEVICE)

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=76

#### Define Processor & Ingest Dataset

In [8]:
# The processor (image preprocessing + tokenizer) is loaded from the base
# checkpoint "microsoft/trocr-base-printed".
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")

# Test images live in "<parent_dir>/files/20k test/"; the trailing slash is kept
# because the dataset class builds image paths from this directory string.
test_ds = Synthetic_Dataset_OCR_Dataset(
    os.path.join(parent_dir, "files", "20k test/"),
    df=test_df,
    processor=processor,
)

Downloading (…)rocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

#### Define Test DataLoader

In [9]:
# Serve the test set in batches of 8 samples.
test_dataloader = DataLoader(dataset=test_ds, batch_size=8)

# Pull the first batch so its contents can be inspected in the next cells.
batch = next(iter(test_dataloader))

#### Display Shapes of 'pixel_values' & 'labels' for an Example Batch

In [10]:
# One line per entry of the batch dict: its key followed by the tensor shape.
for name, tensor in batch.items():
    print(name, tensor.shape)

pixel_values torch.Size([8, 3, 384, 384])
labels torch.Size([8, 128])


#### Display Some Label Values

In [11]:
# -100 marks padded label positions; swap it back to the PAD id so the tokenizer
# can decode. masked_fill returns a new tensor, so batch["labels"] itself keeps
# its -100 markers instead of being overwritten in place.
labels = batch["labels"].masked_fill(
    batch["labels"] == -100, processor.tokenizer.pad_token_id
)
label_str = processor.batch_decode(labels, skip_special_tokens=True)
label_str

['O8SOB/S7DLVC0IL38N',
 '0TRVD',
 'LMM15C/4MO/',
 'RV88SCBB0TI7ASNT2HH',
 'N/5V0VPSM/DH',
 'S97RDRMR6L2HP6M',
 'M7BTO59N7HO8BIB69',
 '1389BLV7L38T2D']

#### Evaluate Model

In [13]:
# Character error rate metric from the `evaluate` library.
cer_metric = evaluate.load("cer")

for batch in tqdm(test_dataloader):
    # References: restore the PAD id where the dataset wrote -100, then decode
    # the label ids back into strings.
    labels = batch["labels"]
    labels[labels == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    # Predictions: generate token ids from the images on the model's device and
    # decode them into strings.
    pixel_values = batch["pixel_values"].to(DEVICE)
    outputs = model.generate(pixel_values)
    pred_str = processor.batch_decode(outputs, skip_special_tokens=True)

    # Accumulate this batch; the score is computed once after the loop.
    cer_metric.add_batch(predictions=pred_str, references=label_str)

# Final CER over everything that was added above.
cer_score = cer_metric.compute()

  0%|          | 0/625 [00:00<?, ?it/s]

#### Print Results

In [14]:
# Report the CER accumulated over all test batches in the evaluation loop.
print("Character error rate on test set:", cer_score)

Character error rate on test set: 0.002896524170994806


#### Save Evaluation Results

In [23]:
# Metric value and run metadata, kept as separate dicts and merged into the
# keyword arguments of evaluate.save below.
cer_score_as_dict = dict(cer=cer_score)
hyperparams = dict(model="trocr-base-printed")

# Write a result file for experiment "v1" under ./results/; the returned path is
# displayed as the cell output.
evaluate.save("./results/", experiment="v1", **cer_score_as_dict, **hyperparams)

PosixPath('results/result-2023_03_30-16_24_34.json')

#### Push Evaluation Results to HuggingFace Hub (My Profile!!!)

In [19]:
# Publish the CER to the model repository on the Hugging Face Hub.
evaluate.push_to_hub(
  # which model the result belongs to
  model_id=MODEL_NAME,
  # what was evaluated
  task_type="image-to-text",
  task_name="Text Generation",
  # on which data
  dataset_type="synthetic_dataset_ocr",
  dataset_name="synthetic_dataset_ocr",
  dataset_split="test",
  # the reported metric
  metric_type="cer",
  metric_name="CER",
  metric_value=cer_score
)



Downloading (…)solve/main/README.md:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

'https://huggingface.co/DunnBC22/trocr-base-printed-synthetic_dataset_ocr/blob/main/README.md'

#### Notes & Other Takeaways From This Project
****
- This notebook evaluates the TrOCR model fine-tuned on the 20k synthetic OCR dataset; it exists because the Google Colab training session stopped before the model could be evaluated.
- This project turned out really well: the Character Error Rate (CER) on the test set was 0.0029 (roughly 0.29%)!
****