In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
device = torch.device("cpu")

In [None]:

from transformers import AutoModelForCausalLM, AutoProcessor

# Load Dataset

In [None]:
!pwd

/home/datainsight


In [None]:
!cd /mnt/e/Abdul_Muqtadir/Thesis/Dataset/subset


In [None]:
from datasets import load_dataset, load_from_disk

# From local disk (adjusted for WSL)
dataset = load_from_disk("/mnt/e/Abdul_Muqtadir/Thesis/Dataset/Processed_Dataset2")

# From Hugging Face (Complete Dataset)
#dataset = load_dataset("AbdulMuqtadir/DocVQA_Processed_Dataset")

# From Hugging Face example Dataset
#dataset = load_dataset("AbdulMuqtadir/Pix2Struct_DocVQA_Testing")

dataset


Loading dataset from disk:   0%|          | 0/43 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answers', 'image_raw', 'ground_truth'],
        num_rows: 30000
    })
    distill: Dataset({
        features: ['question', 'answers', 'image_raw', 'ground_truth'],
        num_rows: 9463
    })
    test: Dataset({
        features: ['question', 'answers', 'image_raw', 'ground_truth'],
        num_rows: 2500
    })
    valid: Dataset({
        features: ['question', 'answers', 'image_raw', 'ground_truth'],
        num_rows: 2849
    })
})

In [None]:
import matplotlib.pyplot as plt
import numpy as np

Example = train_dataset[889]

# Convert the image to a NumPy array and display it
img_array = np.array(Example['image_raw'])

plt.imshow(img_array, cmap='gray')  # Use 'gray' if the image is grayscale (mode='L')
plt.axis('off')  # Hide axes for a cleaner view
plt.show()


Example


NameError: name 'train_dataset' is not defined

In [None]:
# Extract the image from the dataset
Example = train_dataset[0]

# The image is already loaded as a PIL image, so just display it
Example['image_raw'].show()


In [None]:
train_dataset = dataset['train']
train_dataset[0]

{'question': 'What is the Voucher Number ?',
 'answers': ['8'],
 'image_raw': <PIL.PngImagePlugin.PngImageFile image mode=L size=1490x653>,
 'ground_truth': '{"gt_parses": [{"question" : "What is the Voucher Number ?", "answer" : "8"}]}'}

In [None]:
from PIL import Image
Example = train_dataset[0]
train_dataset[0]['image_raw']
Example

{'question': 'What is the Voucher Number ?',
 'answers': ['8'],
 'image_raw': <PIL.PngImagePlugin.PngImageFile image mode=L size=1490x653>,
 'ground_truth': '{"gt_parses": [{"question" : "What is the Voucher Number ?", "answer" : "8"}]}'}

# Load Processor and Model

In [None]:
# Load model directly
from transformers import AutoProcessor, AutoModelForCausalLM
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base-ft", trust_remote_code=True)
#model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base-ft", trust_remote_code=True, revision='refs/pr/6').to(device)

In [None]:
model = AutoModelForCausalLM.from_pretrained("/mnt/e/Abdul_Muqtadir/Thesis/Weights/Florance2_DocVQA", trust_remote_code=True).to(device)


In [None]:
model

Florence2ForConditionalGeneration(
  (vision_tower): DaViT(
    (convs): ModuleList(
      (0): ConvEmbed(
        (proj): Conv2d(3, 128, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
        (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (1): ConvEmbed(
        (proj): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (2): ConvEmbed(
        (proj): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      )
      (3): ConvEmbed(
        (proj): Conv2d(512, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (blocks): ModuleList(
      (0): MySequential(
        (0): MySequential(
          (spatial_block): SpatialBlock(
            (conv1): PreNorm(
              (fn): Depth

In [None]:
from torch.utils.data import Dataset

class DocVQADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        question = "<DocVQA>" + example['question']
        first_answer = example['answers'][0]
        image = example['image_raw']
        if image.mode != "RGB":
            image = image.convert("RGB")
        return question, first_answer, image


In [None]:
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (AdamW, AutoProcessor, get_scheduler)

def collate_fn(batch):
    questions, answers, images = zip(*batch)
    inputs = processor(text=list(questions), images=list(images), return_tensors="pt", padding=True).to(device)
    return inputs, answers

# Create datasets
train_dataset = DocVQADataset(dataset['train'])
val_dataset = DocVQADataset(dataset['valid'])

# Create DataLoader
#batch_size = 6
num_workers = 0

train_loader = DataLoader(train_dataset, batch_size=4, collate_fn=collate_fn, num_workers=num_workers, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, collate_fn=collate_fn, num_workers=num_workers)

# Training

In [None]:
import os

def train_model(train_loader, val_loader, model, processor, epochs=12, lr=1e-6):


    #output_dir = ""
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        i = -1
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
            i += 1
            inputs, answers = batch

            input_ids = inputs["input_ids"]
            pixel_values = inputs["pixel_values"]
            labels = processor.tokenizer(text=answers, return_tensors="pt", padding=True, return_token_type_ids=False).input_ids.to(device)

            outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Average Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{epochs}"):
                inputs, answers = batch

                input_ids = inputs["input_ids"]
                pixel_values = inputs["pixel_values"]
                labels = processor.tokenizer(text=answers, return_tensors="pt", padding=True, return_token_type_ids=False).input_ids.to(device)

                outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
                loss = outputs.loss

                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Average Validation Loss: {avg_val_loss}")


        # Save model checkpoint
        output_dir = f"/mnt/e/Abdul_Muqtadir/Thesis/Florance2/save_directory/epoch_{epoch+1}"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)


# trainer


In [None]:
for param in model.vision_tower.parameters():
  param.is_trainable = False

In [None]:
train_model(train_loader, val_loader, model, processor, epochs=15)

Training Epoch 1/15: 100%|████████████████████████████████████████████████████████| 7500/7500 [1:19:25<00:00,  1.57it/s]


Average Training Loss: 0.2688769526972746


Validation Epoch 1/15: 100%|██████████████████████████████████████████████████████████| 713/713 [05:16<00:00,  2.25it/s]


Average Validation Loss: 0.2634341748833635


Training Epoch 2/15: 100%|████████████████████████████████████████████████████████| 7500/7500 [1:03:04<00:00,  1.98it/s]


Average Training Loss: 0.24751807379666715


Validation Epoch 2/15: 100%|██████████████████████████████████████████████████████████| 713/713 [03:32<00:00,  3.36it/s]


Average Validation Loss: 0.2524424009488799


Training Epoch 3/15: 100%|████████████████████████████████████████████████████████| 7500/7500 [1:03:02<00:00,  1.98it/s]


Average Training Loss: 0.22779219613745808


Validation Epoch 3/15: 100%|██████████████████████████████████████████████████████████| 713/713 [03:30<00:00,  3.39it/s]


Average Validation Loss: 0.24104753457111677


Training Epoch 4/15: 100%|████████████████████████████████████████████████████████| 7500/7500 [1:03:34<00:00,  1.97it/s]


Average Training Loss: 0.20851779748008897


Validation Epoch 4/15: 100%|██████████████████████████████████████████████████████████| 713/713 [03:33<00:00,  3.34it/s]


Average Validation Loss: 0.23647588518804769


Training Epoch 5/15: 100%|████████████████████████████████████████████████████████| 7500/7500 [1:03:27<00:00,  1.97it/s]


Average Training Loss: 0.1928861303462026


Validation Epoch 5/15: 100%|██████████████████████████████████████████████████████████| 713/713 [03:31<00:00,  3.37it/s]


Average Validation Loss: 0.23083286379123705


Training Epoch 6/15:  29%|████████████████▋                                         | 2158/7500 [18:05<44:46,  1.99it/s]


KeyboardInterrupt: 

# Inference

In [None]:
# Function to run the model on an example
def Inference(task_prompt, text_input, image):
    prompt = task_prompt + text_input

    # Ensure the image is in RGB mode
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))

    print(text_input)

    return parsed_answer


In [None]:
Test_dataset = dataset['test']
Example = Test_dataset[0]
Example

{'question': 'What is the no of copies of the report distributed in the regional desk?',
 'answers': ['2 copies', '2'],
 'image_raw': <PIL.PngImagePlugin.PngImageFile image mode=L size=1684x2189>,
 'ground_truth': '{"gt_parses": [{"question" : "What is the no of copies of the report distributed in the regional desk?", "answer" : "2 copies"}, {"question" : "What is the no of copies of the report distributed in the regional desk?", "answer" : "2"}]}'}

In [None]:

Inference('DocVQA', Example['question'], Example['image_raw'])

What is the no of copies of the report distributed in the regional desk?


{'DocVQA': '2 copies'}

# Evaluation

In [None]:
from donut import JSONParseEvaluator

def compare_json(target, prediction):
# def compare_json(file_name, target, prediction):
  # print("Image : ",file_name)
  evaluator = JSONParseEvaluator()
  score = evaluator.cal_acc(prediction, target)
  #print("Score",score)
  return score

In [None]:
import re
import json
import time
from distance import levenshtein

Test_TED = 0
total_norm_leven_sim = 0
total_inference_time = 0

for i in range(len(Test_dataset)):
    start_time = time.time()
    print("sample number:", i)
    Sample = Test_dataset[i]
    GT = json.loads(Sample['ground_truth'])
    question = GT['gt_parses'][0]['question']
    target = GT['gt_parses'][0]['answer']
    image = Sample['image_raw']
    type(image)

    Prediction = Inference('DocVQA', question, image)
    Prediction = Prediction['DocVQA']
    #print("Prediction:", Prediction)
    #print("Target:", target)

    Prediction = Prediction.upper().replace(" ", "").rstrip(".")
    target = target.upper().replace(" ", "").rstrip(".")

    Sample_TED_Score = compare_json(target, Prediction)

    print({
    "question": question,
    "Target": target,
    "Prediction": Prediction,
    "Sample TED": Sample_TED_Score
    })


    Test_TED += Sample_TED_Score


    # Calculate Levenshtein similarity
    gt_text = json.dumps(target)
    pred_text = json.dumps(Prediction)
    len_gt = max(len(gt_text), 1)  # Ensure denominator is not zero
    norm_leven_sim = (len_gt - levenshtein(gt_text, pred_text)) / len_gt
    total_norm_leven_sim += norm_leven_sim

    end_time = time.time()
    inference_time = end_time - start_time
    total_inference_time += inference_time

TED_Score_TestDataset = Test_TED / len(Test_dataset)
average_norm_leven_sim = total_norm_leven_sim / len(Test_dataset)

print("Average TED on Test Dataset:", TED_Score_TestDataset)
print("Average Normalized Levenshtein Similarity:", average_norm_leven_sim)
print("Total Inference Time:", total_inference_time, "seconds")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
how much is the financial total for 1970-71?
{'question': 'how much is the financial total for 1970-71?', 'Target': '3.938', 'Prediction': '3.938', 'Sample TED': 1.0}
sample number: 835
Which experiment result is mentioned in Table 4?
{'question': 'Which experiment result is mentioned in Table 4?', 'Target': 'SEQUENCEWEEDINGRESULTS', 'Prediction': 'SEQUENCEWEEDINGRESULTS', 'Sample TED': 1.0}
sample number: 836
What receipt is this?
{'question': 'What receipt is this?', 'Target': 'RECEIPTFORCERTIFIEDMAIL', 'Prediction': 'RECEIPTFORCERTIFIEDMAIL', 'Sample TED': 1.0}
sample number: 837
Who is sender?
{'question': 'Who is sender?', 'Target': 'THOMASA.CRAIG', 'Prediction': 'THOMASA.CRAIG', 'Sample TED': 1.0}
sample number: 838
what is the promotion name ?
{'question': 'what is the promotion name ?', 'Target': 'VANTAGENOVEMBERPRODUCT', 'Prediction': 'VANTAGENOVEMBERPRODUCT', 'Sample TED': 1.0}
sample number: 839
what is the net

# Evaluation on Categories

In [None]:
from datasets import load_from_disk
category_dataset = load_from_disk(r'/mnt/e/Abdul_Muqtadir/Thesis/Dataset/Test_categories')
category_dataset

DatasetDict({
    text: Dataset({
        features: ['question', 'answers', 'image_raw', 'ground_truth'],
        num_rows: 58
    })
    table: Dataset({
        features: ['question', 'answers', 'image_raw', 'ground_truth'],
        num_rows: 58
    })
    figure: Dataset({
        features: ['question', 'answers', 'image_raw', 'ground_truth'],
        num_rows: 58
    })
    checklists_ratings: Dataset({
        features: ['image', 'text'],
        num_rows: 58
    })
})

In [None]:
import re
import json
import time
from distance import levenshtein

Test_TED = 0
total_norm_leven_sim = 0
total_inference_time = 0
total_samples = 0  # To track total samples across splits

# Iterate over each split
for split in category_dataset:
    print(f"Processing split: {split}")
    print("------------------------------------------------------------")
    print("------------------------------------------------------------")
    print("------------------------------------------------------------")
    data = category_dataset[split]

    split_TED = 0  # For tracking TED within a split
    split_norm_leven_sim = 0  # For tracking normalized Levenshtein within a split
    split_inference_time = 0  # To track time for each split

    for i in range(len(data)):
        print(f"Sample {i+1}/{len(data)} in split {split}")

        # Timing inference
        start_time = time.time()

        Sample = data[i]
        GT = json.loads(Sample['ground_truth'])
        question = GT['gt_parses'][0]['question']
        target = GT['gt_parses'][0]['answer']
        image = Sample['image_raw']

        Prediction = Inference('DocVQA', question, image)
        Prediction = Prediction['DocVQA']

        # Clean up Prediction and target
        Prediction = Prediction.upper().replace(" ", "").rstrip(".")
        target = target.upper().replace(" ", "").rstrip(".")

        # Calculate TED score
        Sample_TED_Score = compare_json(target, Prediction)
        split_TED += Sample_TED_Score

        print({
            "PREDICTION": Prediction,
            "TARGET": target,
            "SAMPLE TED": Sample_TED_Score
        })

        # Calculate Levenshtein similarity
        gt_text = json.dumps(target)
        pred_text = json.dumps(Prediction)
        len_gt = max(len(gt_text), 1)  # Ensure denominator is not zero
        norm_leven_sim = (len_gt - levenshtein(gt_text, pred_text)) / len_gt
        split_norm_leven_sim += norm_leven_sim

        # End timing inference
        end_time = time.time()
        inference_time = end_time - start_time
        split_inference_time += inference_time

    # Average TED and Levenshtein similarity for the split
    TED_Score_TestDataset = split_TED / len(data)
    average_norm_leven_sim = split_norm_leven_sim / len(data)

    print(f"Average TED on Split {split}: {TED_Score_TestDataset}")
    print(f"Average Normalized Levenshtein Similarity for Split {split}: {average_norm_leven_sim}")
    print(f"Total Inference Time for Split {split}: {split_inference_time} seconds")

    # Aggregate results across all splits
    Test_TED += split_TED
    total_norm_leven_sim += split_norm_leven_sim
    total_inference_time += split_inference_time
    total_samples += len(data)

# Calculate overall metrics across all splits
if total_samples > 0:
    TED_Score_TestDataset = Test_TED / total_samples
    average_norm_leven_sim = total_norm_leven_sim / total_samples

    print("Overall Average TED on Test Dataset:", TED_Score_TestDataset)
    print("Overall Average Normalized Levenshtein Similarity:", average_norm_leven_sim)
    print("Overall Total Inference Time:", total_inference_time, "seconds")


Processing split: text
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
Sample 1/58 in split text
What is the Grand Total for Gross Block(At Cost) As of 31.3.2011?
{'PREDICTION': '49910.91', 'TARGET': '49910.91', 'SAMPLE TED': 1.0}
Sample 2/58 in split text
what is the percent reported a decrease in hip pain ?
{'PREDICTION': '98.1%', 'TARGET': '98.1%', 'SAMPLE TED': 1.0}
Sample 3/58 in split text
How many total number of cases per shipment are mentioned in the form?
{'PREDICTION': '8', 'TARGET': '8', 'SAMPLE TED': 1.0}
Sample 4/58 in split text
What is the chain corporate name?
{'PREDICTION': 'RACETRE', 'TARGET': 'RACETRAC', 'SAMPLE TED': 0.75}
Sample 5/58 in split text
what is the schedule at the time of 12:15 - 2:00 in session 2 ?
{'PREDICTION': 'LUNCH', 'TARGET': 'LUNCH', 'SAMPLE TED': 1.0}
Sample 6/58 in split text
what is the standard form no.?
{'PR

KeyError: 'ground_truth'