## TrOCR

In [1]:
import os
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

In [2]:
def trocr_inference(model, processor, image_path):
    """Run TrOCR on a single image file and return the recognised text.

    Args:
        model: Object exposing ``generate(pixel_values)``; in this notebook a
            ``VisionEncoderDecoderModel``.
        processor: Callable that turns an image into ``pixel_values`` and
            offers ``batch_decode``; in this notebook a ``TrOCRProcessor``.
        image_path: Path of the image file to transcribe.

    Returns:
        The decoded text of the first (and only) generated sequence, with
        special tokens removed.
    """
    # Open inside a context manager so the file handle is released right away
    # (this function is called once per validation image), and normalise to
    # RGB: grayscale / palette / RGBA inputs would otherwise reach the
    # processor with an unexpected number of channels.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    # A single image goes in, so the decoded batch has exactly one entry.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

## Test: evaluate the fine-tuned checkpoints on the validation set

In [3]:
# Root folder of the transfer dataset (relative to the working directory).
dataset_path = 'dataset/transfer_dataset/'

# Validation split: the 'val' sub-folder of the dataset root.
val_dataset_path = os.path.join(
    dataset_path,
    'val',
)

In [4]:
from Levenshtein import distance
import tensorflow as tf

2024-05-06 21:54:07.858739: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 21:54:07.858797: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 21:54:07.859327: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-06 21:54:07.862827: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Collect the validation images: every entry directly inside
# ``val_dataset_path`` whose name ends with '.jpg', as a full path.
# ``os.listdir`` returns entries in arbitrary order, so the listing is sorted
# to make the evaluation order reproducible from run to run.
val_df_list = sorted(os.listdir(val_dataset_path))
val_df_jpg_list = [
    os.path.join(val_dataset_path, file_name)
    for file_name in val_df_list
    if file_name.endswith('.jpg')
]

In [6]:
import os
# Identifier of the base TrOCR checkpoint (passed to ``from_pretrained``).
base_model_name = "microsoft/trocr-small-stage1"

# Names of the fine-tuned checkpoints to evaluate. Only visible
# sub-directories are kept: stray files or hidden folders (for example
# '.ipynb_checkpoints') are not loadable checkpoints and would make
# ``from_pretrained`` fail later. Sorted so models are evaluated in a
# stable order.
_finetuned_models_dir = "models/trocr/trocr-small-stage1-finetuned/"
list_of_models = sorted(
    entry
    for entry in os.listdir(_finetuned_models_dir)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(_finetuned_models_dir, entry))
)

In [8]:
# Evaluate every fine-tuned checkpoint on the validation images.
# For each model, the Levenshtein distance between the predicted text and the
# ground-truth text (stored next to each image as '<name>.txt') is averaged
# over all images; ``scores`` maps model name -> average distance.
if not val_df_jpg_list:
    # Fail early with a clear message instead of a ZeroDivisionError after
    # the first model has already been loaded.
    raise ValueError('No validation images found: val_df_jpg_list is empty')

scores = dict()

# The processor is built from the base checkpoint only, so it is identical for
# every fine-tuned model; load it once instead of once per iteration.
processor = TrOCRProcessor.from_pretrained(base_model_name)

for model_name in list_of_models:
    save_model_name = f"models/trocr/trocr-small-stage1-finetuned/{model_name}"
    model = VisionEncoderDecoderModel.from_pretrained(save_model_name)

    trocr_distances = []
    for image_path in val_df_jpg_list:
        # Swap only the file extension; str.replace would also rewrite any
        # '.jpg' that happens to appear earlier in the path.
        text_path = os.path.splitext(image_path)[0] + '.txt'
        with open(text_path) as f:
            real_text = f.read()
        # '|' is treated as a word separator in both texts before comparing.
        real_text = real_text.replace('|', ' ').strip()

        trocr_text = trocr_inference(model, processor, image_path)
        trocr_text = trocr_text.replace('|', ' ').strip()

        trocr_distances.append(distance(trocr_text, real_text))

    trocr_avg_distance = sum(trocr_distances) / len(trocr_distances)
    print(f'Model: {model_name}')
    print(f'TROCR average distance: {trocr_avg_distance}')
    scores[model_name] = trocr_avg_distance




ValueError: could not determine the shape of object type 'torch.storage.UntypedStorage'

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the average Levenshtein distance per fine-tuned checkpoint.
# ``scores`` maps model name -> average distance.
#
# The dict views are materialised as flat lists: wrapping them in ``[...]``
# yields a one-element list that holds the view object itself, which ``bar``
# cannot plot as categories / heights.

# Names of models
modules = list(scores.keys())
# Average distance of each model
values = list(scores.values())

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(modules, values)
ax.set_xlabel('Model')
ax.set_ylabel('Average Levenshtein distance')
ax.set_title('TrOCR fine-tuned checkpoints on the validation set')
plt.show()