**Mount Google Drive to load the test data, and import the image libraries**

In [None]:
# Imports PIL module and mount Google drive
import urllib.request
from PIL import Image
import os
import matplotlib.pyplot as plt

from google.colab import drive
# Mount Google Drive (forcing a remount) and make the test-data folder the
# working directory, so test images can be opened by bare filename.
TEST_DATA_DIR = '/content/drive/My Drive/Colab Notebooks/Engine_TestData'
drive.mount('/content/drive', force_remount=True)
os.chdir(TEST_DATA_DIR)
# Optional sanity check: plt.imshow(Image.open('P001_test.jpg'))

**Install the required packages (Transformers, Datasets, SentencePiece, TensorBoard)**

In [None]:
#!pip install -q "transformers==4.45.2"
!pip install -q datasets sentencepiece tensorboard
!pip install transformers==4.45.2 sentence-transformers==3.1.1

**Load the Donut base pre-trained model (not fine-tuned on the custom dataset)**

In [None]:
import re
import transformers
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch
import random
import numpy as np
from transformers.image_transforms import to_pil_image

# Hide logs: detach the default handler from the transformers logger
transformers.logging.disable_default_handler()

# Load the base Donut processor and model from the Hugging Face Hub
processor_base = DonutProcessor.from_pretrained("philschmid/donut-base-sroie")
model_base = VisionEncoderDecoderModel.from_pretrained("philschmid/donut-base-sroie")

# Open the test image, preview it, and convert it to the model's pixel tensor
os.chdir('/content/drive/My Drive/Colab Notebooks/Engine_TestData')
image = Image.open('E038.jpg')
plt.imshow(image)
pixel_values = processor_base(image, return_tensors="pt").pixel_values
print(pixel_values.shape)

# Move the model to the GPU when one is available (done once)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_base.to(device)

# Tokenize the decoder start prompt; special tokens are not added automatically
# because the prompt itself is the start token.
task_prompt = "<s>"
decoder_input_ids = processor_base.tokenizer(
    task_prompt, add_special_tokens=False, return_tensors="pt"
)["input_ids"]

# Greedy decoding (num_beams=1). `early_stopping` is intentionally not passed:
# it only applies to beam search and merely triggers a warning here.
outputs = model_base.generate(
    pixel_values.to(device),
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=model_base.decoder.config.max_position_embeddings,
    pad_token_id=processor_base.tokenizer.pad_token_id,
    eos_token_id=processor_base.tokenizer.eos_token_id,
    use_cache=True,
    num_beams=1,
    # never emit the unknown token
    bad_words_ids=[[processor_base.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
    output_scores=True,
)

In [None]:

# Decode the generated token ids, strip the EOS/PAD markers, then print the raw
# sequence followed by its JSON conversion.
tokenizer_base = processor_base.tokenizer
sequence = processor_base.batch_decode(outputs.sequences)[0]
for marker in (tokenizer_base.eos_token, tokenizer_base.pad_token):
    sequence = sequence.replace(marker, "")
print(sequence)
print(processor_base.token2json(sequence))



**Load the fine-tuned Donut model and parse the test image**

In [None]:
import re
import transformers
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch
import random
import numpy as np
from transformers.image_transforms import to_pil_image


# Load the fine-tuned Donut processor and model from the Hugging Face Hub
processor = DonutProcessor.from_pretrained("1992anubhav/donut-base-engine_logbook")
model = VisionEncoderDecoderModel.from_pretrained("1992anubhav/donut-base-engine_logbook")

# Open the test image, preview it, and convert it to the model's pixel tensor
os.chdir('/content/drive/My Drive/Colab Notebooks/Engine_TestData')
image = Image.open('E038.jpg')
plt.imshow(image)
pixel_values = processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)

# Move the model to the GPU when one is available (done once)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Tokenize the decoder start prompt; special tokens are not added automatically
# because the prompt itself is the start token.
task_prompt = "<s>"
decoder_input_ids = processor.tokenizer(
    task_prompt, add_special_tokens=False, return_tensors="pt"
)["input_ids"]

# Greedy decoding (num_beams=1). `early_stopping` is intentionally not passed:
# it only applies to beam search and merely triggers a warning here.
outputs = model.generate(
    pixel_values.to(device),
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=model.decoder.config.max_position_embeddings,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    num_beams=1,
    # never emit the unknown token
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
    output_scores=True,
)

# Process output: decode, strip EOS/PAD markers so they cannot leak into the
# parsed result, then convert the tagged sequence to JSON.
prediction = processor.batch_decode(outputs.sequences)[0]
prediction = prediction.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
prediction = processor.token2json(prediction)
print(prediction)


**Visual Document Question Answering on the Processed Data**

In [None]:
import re
import transformers
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch
import random
import numpy as np
from transformers.image_transforms import to_pil_image
# Load the DocVQA-fine-tuned Donut processor and model from the Hugging Face Hub
processor_vqa = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model_vqa = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

# Open the document image and convert it to the model's pixel tensor
os.chdir('/content/drive/My Drive/Colab Notebooks/Engine_TestData')
image = Image.open('E014.JPG')

pixel_values = processor_vqa(image, return_tensors="pt").pixel_values
print(pixel_values.shape)

# Move the model to the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model_vqa.to(device)

questions = ["what is the aircraft registration number?",
            "What is date of installation on aircraft?",
            "Who is the engine manufacturer?",
            "What is the model number?",
            "What is the blade design no?",
            "What is the hub serial no?",
            ]

# Prompt template: the question is substituted for {user_input} and the model
# continues generating after the opening <s_answer> tag.
task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"

# The image tensor is the same for every question, so transfer it to the
# device once instead of on every loop iteration.
pixel_values_device = pixel_values.to(device)

for question in questions:
    prompt = task_prompt.replace("{user_input}", question)
    decoder_input_ids = processor_vqa.tokenizer(
        prompt, add_special_tokens=False, return_tensors="pt"
    )["input_ids"]

    # Greedy decoding (num_beams=1). `early_stopping` is intentionally not
    # passed: it only applies to beam search and merely triggers a warning here.
    outputs = model_vqa.generate(
        pixel_values_device,
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model_vqa.decoder.config.max_position_embeddings,
        pad_token_id=processor_vqa.tokenizer.pad_token_id,
        eos_token_id=processor_vqa.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        # never emit the unknown token
        bad_words_ids=[[processor_vqa.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Decode the generated sequence and print its JSON conversion
    seq = processor_vqa.batch_decode(outputs.sequences)[0]
    print(processor_vqa.token2json(seq))

# Show the queried document once, after all answers have been printed
plt.imshow(image)