In [1]:
from datasets import load_dataset
ds = load_dataset("merve/vqav2-small",split="validation[:1%]")
ds, ds[0]

(Dataset({
     features: ['multiple_choice_answer', 'question', 'image'],
     num_rows: 214
 }),
 {'multiple_choice_answer': 'carnival ride',
  'question': 'Where are the kids riding?',
  'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x424>})

In [1]:
import os
os.environ['HF_HOME'] = '/home/sa5u24/VQA'
hf_home = os.path.expanduser(
    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
print(hf_home)


/home/sa5u24/VQA


In [2]:
# note the image is not provided in the prompt its included as part of the "processor"

# prompt= """Create a Short Product description based on the provided ##PRODUCT NAME## and ##CATEGORY## and image.
# Only return description. The description should be SEO optimized and for a better mobile search experience.

# ##PRODUCT NAME##: {product_name}
# ##CATEGORY##: {category}"""

prompt= """Answer the question based on the provided ##Question## and image. ##Question##: {question}."""

from datasets import load_dataset

# Convert dataset to OAI messages
def format_data(sample):
    return {"messages": [
                {
                    "role": "question",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt.format(question=sample["question"]),
                        },{
                            "type": "image",
                            "image": sample["image"],
                        }
                    ],
                },
                {
                    "role": "answer",
                    "content": [{"type": "text", "text": sample["multiple_choice_answer"]}],
                },
            ],
        }

# Load dataset from the hub
ds_train = load_dataset("merve/vqav2-small",split="validation[:1%]")
ds_val = load_dataset("merve/vqav2-small",split="validation[99%:]")
# Convert dataset to OAI messages
# need to use list comprehension to keep Pil.Image type, .mape convert image to bytes
dataset_train = [format_data(sample) for sample in ds_train]
dataset_val = [format_data(sample) for sample in ds_val]


In [3]:
dataset_val[0]["messages"], len(dataset_train), len(dataset_val)

([{'role': 'question',
   'content': [{'type': 'text',
     'text': 'Answer the question based on the provided ##Question## and image. ##Question##: What type of weather is there?.'},
    {'type': 'image',
     'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=480x640>}]},
  {'role': 'answer', 'content': [{'type': 'text', 'text': 'sunny'}]}],
 214,
 214)

In [4]:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
PYTORCH_CUDA_ALLOC_CONF=True
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
from trl import SFTConfig



# Hugging Face model id
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="auto",
    # attn_implementation="flash_attention_2", # not supported for training
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
processor = AutoProcessor.from_pretrained(model_id)

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=8,
        bias="none",
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM",
)

# adding lora layers in the model, with random initialized weights 
model_lora_v0 = get_peft_model(model, peft_config)
# print("model weight", model_lora_v0.base_model.model.vision_model.transformer.layers[0].self_attn.q_proj.lora_A.default.weight)


#load the lora weights
lora_path = "/home/sa5u24/VQA/fine-tuned-visionllama/checkpoint-6"
model_lora = PeftModel.from_pretrained(model_lora_v0, lora_path)

args = SFTConfig(
    output_dir="fine-tuned-visionllama", # directory to save and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=8,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=5,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    # tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    # push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    gradient_checkpointing_kwargs = {"use_reentrant": False}, # use reentrant checkpointing
    dataset_text_field="", # need a dummy field for collator
    dataset_kwargs = {"skip_prepare_dataset": True} # important for collator
)
args.remove_unused_columns=False

def collate_fn(examples):
    # Get the texts and images, and apply the chat template
    texts = [processor.apply_chat_template(example["messages"], tokenize=False) for example in examples]
    image_inputs = [process_vision_info(example["messages"])[0] for example in examples]

    # Tokenize the texts and process the images
    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)
    # print("batch", batch.input_ids.size(), batch.pixel_values.size())
    # ([8, 43]) torch.Size([8, 1, 4, 3, 560, 560])
    # print("batch", batch.keys()) 
    # batch dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'])

    # The labels are the input_ids, and we mask the padding tokens in the loss computation
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100  #
    # Ignore the image token index in the loss computation (model specific)
    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652,151653,151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch



The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
import nltk
nltk.download('wordnet')
import re
from torch.utils.data import DataLoader, Dataset
import evaluate
from nltk.translate.meteor_score import meteor_score, single_meteor_score
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

rouge = evaluate.load("rouge")

batch_size = 4
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle = False, collate_fn=collate_fn)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle = False, collate_fn=collate_fn)

all_gt = []
all_gens = []

all_gt_ = []
all_gens_ = []
unk_gt_count = 0

model_lora.eval()
with torch.no_grad():
    for batch in dataloader_val:
        
        output = model_lora.generate(**batch, max_new_tokens=30, eos_token_id=processor.tokenizer.eos_token_id)
        # print("output", len(output),output.size() )
        # print("label",batch['labels'] )

        generation = processor.batch_decode(output, skip_special_tokens=True)
        # print("batch_decode", len(generation), type(generation) )

        for each_gen in generation:
            #only the generated answer
            gen_ans = each_gen.split("assistant")[-1].strip()
            
            try:
                gt = re.search('answer\n\n(.*)assistant', each_gen).group(1)
            except:
                gt = ""
                unk_gt_count+=1
            
            all_gens.append(gen_ans)
            all_gt.append(gt)
            
            all_gens_.append(gen_ans.split())
            all_gt_.append([gt.split()])
            
        rouge_results = rouge.compute(predictions=all_gens, references=all_gt)
        
        m_score=0
        for line in zip(all_gt, all_gens):
            ref = word_tokenize(line[0])
            hypo = word_tokenize(line[1])
            m_score += meteor_score([ref], hypo)
        meteors = m_score/len(all_gt)
        bleu_score = corpus_bleu(all_gt_, all_gens_, weights=(1.0, 0.0, 0.0, 0.0))
        
            
    


[nltk_data] Downloading package wordnet to /home/sa5u24/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/sa5u24/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding

In [6]:
rouge_results, meteors, bleu_score

({'rouge1': 0.1386054256046663,
  'rouge2': 0.015633266143595705,
  'rougeL': 0.13763483870768017,
  'rougeLsum': 0.13837150536622814},
 0.17988559812911947,
 0.022629310344827583)

In [6]:
len(all_gens), len(all_gt), all_gens[1], all_gt[1], unk_gt_count

(214,
 214,
 'The image shows people in the background, but they are not skiing. They are standing on a snowy hill, but there is no',
 'no',
 4)

In [7]:
rouge_results = rouge.compute(predictions=all_gens, references=all_gt)
rouge_results

{'rouge1': 0.1481178710314811,
 'rouge2': 0.0156487437795849,
 'rougeL': 0.14743072392524897,
 'rougeLsum': 0.1475788670914207}

In [8]:
from nltk.translate.meteor_score import meteor_score, single_meteor_score
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
m_score=0
for line in zip(all_gt, all_gens):
    ref = word_tokenize(line[0])
    hypo = word_tokenize(line[1])
    m_score += meteor_score([ref], hypo)

print("meteor_score", m_score/len(all_gt))

[nltk_data] Downloading package punkt_tab to /home/sa5u24/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


meteor_score 0.18524082550328952


In [9]:
from nltk.translate.bleu_score import corpus_bleu
bleu_score = corpus_bleu(all_gt_, all_gens_, weights=(1.0, 0.0, 0.0, 0.0))
print("bleu_score", bleu_score)

bleu_score 0.022238425082026978


In [None]:
with lora pre-trained weight on val dataset

{'rouge1': 0.15045387699529927,
 'rouge2': 0.014528630766948526,
 'rougeL': 0.14957247124561862,
 'rougeLsum': 0.1492740451009666}
meteor_score 0.18661183211828292
bleu_score 0.023290203327171907

with lora pre-trained weight on traindataset
{'rouge1': 0.14348497273817096,
 'rouge2': 0.023673930927080054,
 'rougeL': 0.1435822234212641,
 'rougeLsum': 0.14408427723443865}
meteor_score 0.19548155570972522
bleu_score 0.020076150917272415

zeroshot on val dataset
{'rouge1': 0.1481178710314811,
 'rouge2': 0.0156487437795849,
 'rougeL': 0.14743072392524897,
 'rougeLsum': 0.1475788670914207}
meteor_score 0.18524082550328952
bleu_score 0.022238425082026978

In [20]:
from nltk.translate.bleu_score import corpus_bleu

# List of reference sentences (ground truth)
references = [
    [["this", "is", "a", "test"]],  # Multiple references for the first candidate
    [["another", "test", "sentence"]]  # Multiple references for the second candidate
]

# List of candidate sentences (model output)
candidates = [
    ["this", "is", "a", "test"],  # Candidate sentence 1
    ["another", "test", "sentence"]  # Candidate sentence 2
]

# Calculate the BLEU score for the corpus
bleu_score = corpus_bleu(references, candidates)

print("Corpus BLEU score:", bleu_score)

Corpus BLEU score: 0.8408964152537145


In [21]:
import nltk
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.chrf_score import chrf_precision_recall_fscore_support
from nltk.translate.meteor_score import single_meteor_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

BLEU_1 = sentence_bleu([gt.split()], gen_ans.split(), weights=(1, 0, 0, 0))
BLEU_2 = sentence_bleu([gt.split()], gen_ans.split(), weights=(1, 1, 0, 0))
BLEU_3 = sentence_bleu([gt.split()], gen_ans.split(), weights=(1, 1, 1, 0))
BLEU_4 = sentence_bleu([gt.split()], gen_ans.split(), weights=(1, 1, 1, 1))

print('BLEU_1:%.4f, BLEU_2:%.4f, BLEU_3:%.4f, BLEU_4:%.4f'%(BLEU_1, BLEU_2, BLEU_3, BLEU_4))

prec, rec, f1, tp = chrf_precision_recall_fscore_support(gt.split(), gen_ans.split(), n=1)
print('prec:%.4f, rec:%.4f, f1:%.4f, tp:%.4f'%(prec, rec, f1, tp))

meteor = single_meteor_score(gt.split(), gen_ans.split())
print('Meteor:%.4f'%meteor)



[nltk_data] Downloading package wordnet to /home/sa5u24/nltk_data...


BLEU_1:0.0000, BLEU_2:0.0000, BLEU_3:0.0000, BLEU_4:0.0000
prec:0.0000, rec:0.0000, f1:0.0000, tp:0.0000
Meteor:0.0000


In [35]:
#single example for model generation

from PIL import Image
import requests

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
image = Image.open(requests.get(url, stream=True).raw)

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
    ]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
print("input_text", input_text)
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
).to(model.device)

print("inputs", inputs.keys(), inputs.input_ids.size(), inputs.pixel_values.size() )

output = model_lora.generate(**inputs, max_new_tokens=30)
print(processor.decode(output[0]))



input_text <|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>


inputs dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask']) torch.Size([1, 28]) torch.Size([1, 1, 4, 3, 560, 560])
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here is a haiku for the image:

Whiskers, ears so bright,
Hopping through the countryside,
Spring's gentle delight.<|eot_id|>


In [10]:

#single example for model generation

from PIL import Image
import requests

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
image = Image.open(requests.get(url, stream=True).raw)

messages1 = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
    ]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
print("input_text", input_text)
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
)

print(inputs.input_ids.shape, inputs.pixel_values.shape)


input_text <|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>


torch.Size([1, 28]) torch.Size([1, 1, 4, 3, 560, 560])
