In [None]:
%pip install huggingface_hub --quiet

In [None]:
%pip install -U datasets --quiet

In [None]:
%pip install -U peft --quiet

In [None]:
%pip install -U transformers  accelerate bitsandbytes --quiet

In [None]:
%pip install evaluate==0.4.3 --quiet

In [None]:
%pip install -U rouge_score --quiet

In [None]:
from datasets import load_dataset
import json
from PIL import Image
import torch

from transformers import PaliGemmaForConditionalGeneration
from transformers import PaliGemmaProcessor
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
from transformers import Trainer

In [None]:
# Authenticate this notebook session against the Hugging Face Hub.
# NOTE(review): presumably required because the model and/or dataset repos
# need an authenticated user -- confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Fetch the receipt information-extraction dataset from the Hub.
dataset = load_dataset(path="amohseni/receipt_VLM_information_extraction")
# Bare expression so the notebook renders the loaded object.
dataset

In [None]:
# Checkpoint name shared by the processor and the model.
MODEL_ID = "google/paligemma-3b-mix-448"
# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
processor = PaliGemmaProcessor.from_pretrained(pretrained_model_name_or_path=MODEL_ID)

In [None]:
# Fine-tuning mode switches.
USE_LORA = True        # attach LoRA adapters to the model
USE_QLORA = False      # additionally load the base weights 4-bit quantized
FREEZE_VISION = False  # only consulted on the full fine-tuning path below

if USE_LORA or USE_QLORA:
    lora_config = LoraConfig(
        r=8,
        target_modules=[
            "q_proj",
            "o_proj",
            "k_proj",
            "v_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        task_type="CAUSAL_LM",
    )

    # A quantization config is only built for QLoRA; plain LoRA passes None.
    bnb_config = None
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            # The keyword is `bnb_4bit_compute_dtype`; the previous spelling
            # (`..._compute_type`) is not a recognised option, so the bf16
            # compute dtype was never applied.
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    # device_map="auto" already places the weights on the available device(s),
    # so no explicit .to(DEVICE) follows: it is redundant and can fail for
    # quantized or dispatched models.
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
else:
    # Full fine-tuning: no adapters, weights loaded with the default dtype.
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        MODEL_ID, device_map="auto")

    if FREEZE_VISION:
        # Stop gradients for the image encoder and the projector.
        for frozen_module in (model.vision_tower, model.multi_modal_projector):
            for param in frozen_module.parameters():
                param.requires_grad = False

# Dtype the collate function casts the processor outputs to.
TORCH_DTYPE = model.dtype

In [None]:
def collate_fn(batch):
    """Turn a list of dataset examples into one batch of processor outputs.

    Each example is indexed with the keys 'image', 'prefix' and 'suffix'.
    Images are converted to RGB, every prefix is prepended with the "<image>"
    marker, and the suffixes are handed to the processor via its `suffix`
    argument (presumably so it builds the training labels -- confirm).

    Returns the processor output after casting it to TORCH_DTYPE and moving
    it to DEVICE.
    """
    rgb_images, prompts, answers = [], [], []
    for example in batch:
        rgb_images.append(example['image'].convert("RGB"))
        prompts.append("<image>" + example['prefix'])
        answers.append(example['suffix'])

    encoded = processor(
        images=rgb_images,
        text=prompts,
        suffix=answers,
        padding="longest",
        return_tensors="pt",
    )
    encoded = encoded.to(TORCH_DTYPE)
    return encoded.to(DEVICE)

In [None]:
from transformers import TrainingArguments

# Training length is governed by `max_steps`: a positive value overrides
# `num_train_epochs`, so the former `num_train_epochs=3` had no effect and has
# been dropped to avoid suggesting otherwise.
args = TrainingArguments(
    max_steps=1000,
    # Effective batch size per device = 4 * 8 = 32 examples per optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    # Keep every dataset column: collate_fn reads 'image', 'prefix', 'suffix'.
    remove_unused_columns=False,
    warmup_steps=2,
    learning_rate=2e-5,
    weight_decay=1e-6,
    adam_beta2=0.999,
    logging_steps=1,
    # "adamw_hf" is the deprecated optimizer name; "adamw_torch" is the
    # supported AdamW choice on current transformers releases.
    optim="paged_adamw_8bit" if USE_QLORA else "adamw_torch",
    # One checkpoint, written at step 1000 (the final step).
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    output_dir="paligemma2_json_extraction",
    bf16=True,
    report_to=["tensorboard"],
    # collate_fn already returns tensors on DEVICE, so pinning is disabled.
    dataloader_pin_memory=False,
)

In [None]:
# `Trainer` is already imported in the notebook's import cell, so it is not
# re-imported here.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=collate_fn,
    train_dataset=dataset['train'],
)



In [None]:
# Run the fine-tuning loop with the Trainer built above. As a bare expression,
# the value returned by train() is shown as the cell output.
trainer.train()

## Load the Model

In [None]:
from peft import PeftModel, PeftConfig

# Rebuild the inference stack: processor, bf16 base model, then the adapter
# weights saved in the step-1000 checkpoint directory.
MODEL_ID = "google/paligemma-3b-mix-448"
processor = PaliGemmaProcessor.from_pretrained(MODEL_ID)
peft_model_base = PaliGemmaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
peft_model = PeftModel.from_pretrained(
    peft_model_base,
    './paligemma2_json_extraction/checkpoint-1000/',
    is_trainable=False,  # inference only
    torch_dtype=torch.bfloat16,
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TORCH_DTYPE = peft_model.dtype

test_split = dataset['test']
# Read the two text columns directly. Slicing the split with [0:] materialises
# every column -- decoded images included -- just to extract one text field,
# and the original did that twice.
targets = list(test_split['suffix'])
instructions = list(test_split['prefix'])

predictions = []
# Iterate the split once so each example (and its image) is decoded one time.
for i, example in enumerate(test_split):
    prompt = "<image>" + example['prefix']
    image_file = example['image'].convert("RGB")
    model_inputs = processor(
        text=prompt, images=image_file, return_tensors="pt"
    ).to(TORCH_DTYPE).to(device)
    # Number of prompt tokens; generate() returns prompt + continuation.
    input_len = model_inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        # Greedy decoding (do_sample=False) capped at 512 new tokens.
        generation = peft_model.generate(**model_inputs, max_new_tokens=512, do_sample=False)

    # Keep only the newly generated tokens before decoding to text.
    decoded = processor.decode(generation[0][input_len:], skip_special_tokens=True)
    predictions.append(decoded)

    # Light progress report instead of one output line per example.
    if (i + 1) % 10 == 0 or (i + 1) == len(targets):
        print(f"{i + 1}/{len(targets)} examples processed")

In [None]:
import pandas as pd

# Pair each reference string with the model output for the same test example.
zipped_performance = [pair for pair in zip(targets, predictions)]
df = pd.DataFrame(
    data=zipped_performance,
    columns=['human baseline', 'model extraction'],
)

In [None]:
# Persist the reference/prediction pairs; the row index is not written.
df.to_csv(path_or_buf='trained_model_extraction_performance.csv', index=False)

In [None]:
import pandas as pd

# Reload the saved reference/prediction pairs from disk (presumably so the
# scoring cells can run without repeating inference -- confirm).
df = pd.read_csv(filepath_or_buffer='./trained_model_extraction_performance.csv')
human_base_line_extraction, finetuned_model_extraction = (
    df['human baseline'],
    df['model extraction'],
)

In [None]:
import evaluate
import numpy as np

rouge = evaluate.load('rouge')

# Convert both columns to plain lists of strings for the metric. An empty
# prediction is read back from the CSV as NaN, so it is mapped to "" first.
# Both columns come from the same DataFrame, so they already have equal length;
# the previous slice used an undefined name (`original_model_extraction`) and
# raised NameError.
rouge_predictions = finetuned_model_extraction.fillna("").astype(str).tolist()
rouge_references = human_base_line_extraction.fillna("").astype(str).tolist()

finetuned_model_results = rouge.compute(
    predictions=rouge_predictions,
    references=rouge_references,
    use_aggregator=True,
    use_stemmer=True,
)
print('FINE TUNED MODEL:')
print(finetuned_model_results)

In [None]:
# One-row table holding the ROUGE results computed for the fine-tuned model.
performance = pd.DataFrame([finetuned_model_results])
performance.to_csv('./Rouge_trained_model_extraction.csv', index=False)
# Last expression of the cell so the table is actually rendered; in the middle
# of the cell its result was silently discarded.
performance.head()