In [1]:
from datasets import load_dataset
ds = load_dataset("flaviagiammarino/path-vqa")
ds

DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 19654
    })
    validation: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 6259
    })
    test: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 6719
    })
})

In [1]:
import os
os.environ['HF_HOME'] = '/home/sa5u24/VQA'
hf_home = os.path.expanduser(
    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
print(hf_home)


/home/sa5u24/VQA


In [2]:
# note the image is not provided in the prompt its included as part of the "processor"

# prompt= """Create a Short Product description based on the provided ##PRODUCT NAME## and ##CATEGORY## and image.
# Only return description. The description should be SEO optimized and for a better mobile search experience.

# ##PRODUCT NAME##: {product_name}
# ##CATEGORY##: {category}"""

prompt= """Answer the question based on the provided ##Question## and image. ##Question##: {question}."""

from datasets import load_dataset

# Convert dataset to OAI messages
def format_data(sample):
    return {"messages": [
                {
                    "role": "question",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt.format(question=sample["question"]),
                        },{
                            "type": "image",
                            "image": sample["image"],
                        }
                    ],
                },
                {
                    "role": "answer",
                    "content": [{"type": "text", "text": sample["answer"]}],
                },
            ],
        }

# Load dataset from the hub
ds = load_dataset("flaviagiammarino/path-vqa")
# dataset_train = [format_data(sample) for sample in ds['train']]
dataset_val = [format_data(sample) for sample in ds["validation"]]


README.md:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

(…)-00000-of-00007-f2d0e9ef9f022d38.parquet:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

(…)-00001-of-00007-47d8e0220bf6c933.parquet:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

(…)-00002-of-00007-7fb5037c4c5da7be.parquet:   0%|          | 0.00/104M [00:00<?, ?B/s]

(…)-00003-of-00007-74b9b7b81cc55f90.parquet:   0%|          | 0.00/90.0M [00:00<?, ?B/s]

(…)-00004-of-00007-77eea90af4a55dce.parquet:   0%|          | 0.00/46.1M [00:00<?, ?B/s]

(…)-00005-of-00007-5332ec423be520bd.parquet:   0%|          | 0.00/55.8M [00:00<?, ?B/s]

(…)-00006-of-00007-637a58c700b604af.parquet:   0%|          | 0.00/57.3M [00:00<?, ?B/s]

(…)-00000-of-00003-90a5518d26493b67.parquet:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

(…)-00001-of-00003-cbfe947a3418595c.parquet:   0%|          | 0.00/45.7M [00:00<?, ?B/s]

(…)-00002-of-00003-9ec816895bd3bc20.parquet:   0%|          | 0.00/64.7M [00:00<?, ?B/s]

(…)-00000-of-00003-e9adadb4799f44d3.parquet:   0%|          | 0.00/41.2M [00:00<?, ?B/s]

(…)-00001-of-00003-7ea98873fc919813.parquet:   0%|          | 0.00/45.3M [00:00<?, ?B/s]

(…)-00002-of-00003-1628308435019820.parquet:   0%|          | 0.00/69.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19654 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6259 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6719 [00:00<?, ? examples/s]



In [3]:
dataset_val[0]["messages"],  len(dataset_val)

([{'role': 'question',
   'content': [{'type': 'text',
     'text': 'Answer the question based on the provided ##Question## and image. ##Question##: what have lost their nuclei?.'},
    {'type': 'image',
     'image': <PIL.JpegImagePlugin.JpegImageFile image mode=CMYK size=492x286>}]},
  {'role': 'answer', 'content': [{'type': 'text', 'text': 'neutrophils'}]}],
 6259)

In [4]:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
PYTORCH_CUDA_ALLOC_CONF=True
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
from trl import SFTConfig



# Hugging Face model id
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="auto",
    # attn_implementation="flash_attention_2", # not supported for training
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
processor = AutoProcessor.from_pretrained(model_id)

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=8,
        bias="none",
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM",
)

# adding lora layers in the model, with random initialized weights 
model_lora_v0 = get_peft_model(model, peft_config)
# print("model weight", model_lora_v0.base_model.model.vision_model.transformer.layers[0].self_attn.q_proj.lora_A.default.weight)


#load the lora weights
lora_path = "/home/sa5u24/VQA/fine-tuned-visionllama-pvqa/checkpoint-1955"
model_lora = PeftModel.from_pretrained(model_lora_v0, lora_path)

args = SFTConfig(
    output_dir="fine-tuned-visionllama", # directory to save and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=8,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=5,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    # tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    # push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    gradient_checkpointing_kwargs = {"use_reentrant": False}, # use reentrant checkpointing
    dataset_text_field="", # need a dummy field for collator
    dataset_kwargs = {"skip_prepare_dataset": True} # important for collator
)
args.remove_unused_columns=False

def collate_fn(examples):
    # Get the texts and images, and apply the chat template
    texts = [processor.apply_chat_template(example["messages"], tokenize=False) for example in examples]
    # print("texts", texts[0])
    image_inputs = [process_vision_info(example["messages"])[0] for example in examples]

    # Tokenize the texts and process the images
    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)
    # batch dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'])

    # The labels are the input_ids, and we mask the padding tokens in the loss computation
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100  #
    # Ignore the image token index in the loss computation (model specific)
    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652,151653,151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    batch["labels"] = labels
    
    batch['gt']= [example["messages"][1]["content"][0]['text']for example in examples]

    return batch



The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
import nltk
nltk.download('wordnet')
import re
from torch.utils.data import DataLoader, Dataset
import evaluate
from nltk.translate.meteor_score import meteor_score, single_meteor_score
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

rouge = evaluate.load("rouge")

batch_size = 4
# dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle = False, collate_fn=collate_fn)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle = False, collate_fn=collate_fn)

all_gt = []
all_gens = []

all_gt_ = []
all_gens_ = []
unk_gt_count = 0
acc = 0
model_lora.eval()
with torch.no_grad():
    
    for (i,batch) in enumerate(dataloader_val):
        
        output = model_lora.generate(input_ids = batch['input_ids'],
                                     
                                     max_new_tokens=30,
                                     eos_token_id=processor.tokenizer.eos_token_id)
                                     
        
        # output = model_lora.generate(
        #     **batch, max_new_tokens=50, eos_token_id=processor.tokenizer.eos_token_id)
    
        generation = processor.batch_decode(output, skip_special_tokens=True)
        gt = batch['gt']


        for k in range(len(generation)):
            gen_ans = generation[k].split("assistant")[-1].strip()
            gt_ = gt[k]

            if gt_.lower()=='yes' or gt_.lower()=='no':
                if gen_ans.lower()==gt_.lower():
                    acc += 1
            else:        
                all_gens.append(gen_ans)
                all_gens_.append(gen_ans.split())
                all_gt.append(gt_)
        if i>2:
                break       



[nltk_data] Downloading package wordnet to /home/sa5u24/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/sa5u24/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [12]:
len(all_gens), len(all_gt), all_gens, all_gt

(10,
 10,
 ['The correct answer is: Neutrophils.\n\nIn the image, it appears to be a microscopic image of neutrophils,',
  'The correct answer is "neutrophils".\n\nIn the provided image, it appears to be a microscopic image of a blood smear',
  'Based on the provided question, it seems like the answer is related to the histopathological features of acute viral hepatitis. Acute',
  'Based on the provided information, the cells appear to have wavy nuclei.',
  "I don't see an image provided. Could you provide the image or more context about the area you're referring to?",
  'The image appears to be a medical illustration or a photograph of a human abdomen with various tumors or masses scattered throughout the peritoneal',
  "It's difficult to pinpoint the exact location without more context or a clear description of the image. However, based on common medical imaging",
  "I don't see an image provided. Can you please provide the image or describe it so I can better understand and provide 

In [14]:
rouge_results = rouge.compute(predictions=all_gens, references=all_gt)
    
m_score=0
for line in zip(all_gt, all_gens):
    ref = word_tokenize(line[0])
    hypo = word_tokenize(line[1])
    m_score += meteor_score([ref], hypo)
meteors = m_score/len(all_gt)

bleu_score = corpus_bleu(all_gt, all_gens_, weights=(1.0, 0.0, 0.0, 0.0))
    
print(rouge_results, meteors, bleu_score, acc)

{'rouge1': 0.08205128205128205, 'rouge2': 0.03666666666666667, 'rougeL': 0.08441558441558442, 'rougeLsum': 0.08258408258408259} 0.15397361380091645 0.01485148514851485 0


In [26]:
from evaluate import load
bert_score = load("bertscore")
bert_avg1 = 0
bert_avg2 =0
bert_avg3 = 0
a = bert_score.compute(references = all_gt,predictions = all_gens,model_type = 'bert-base-uncased')
bert_avg1+= a['precision'][0]
bert_avg2+= a['recall'][0]
bert_avg3+= a['f1'][0]

print("BERTScore {}".format(round(bert_avg2/len(all_gt),3)), a)

BERTScore 0.081 {'precision': [0.48905348777770996, 0.4112253487110138, 0.36092254519462585, 0.34743326902389526, 0.2422865629196167, 0.3820139467716217, 0.24373994767665863, 0.278962105512619, 0.5160807371139526, 0.18653450906276703], 'recall': [0.806541919708252, 0.8162267804145813, 0.4444631040096283, 0.6153796315193176, 0.43657997250556946, 0.5710670948028564, 0.5034968256950378, 0.2525448203086853, 0.797782838344574, 0.22030799090862274], 'f1': [0.6088970899581909, 0.5469103455543518, 0.39836010336875916, 0.444122314453125, 0.31162962317466736, 0.45779022574424744, 0.3284696042537689, 0.2650969624519348, 0.626732349395752, 0.20201942324638367], 'hashcode': 'bert-base-uncased_L9_no-idf_version=0.3.12(hug_trans=4.45.1)'}


In [24]:
len(all_gt), a['precision'][0]

(10, 0.48905348777770996)

In [9]:
all_gt, all_gens

(['neutrophils',
  'neutrophils',
  'yes',
  'predominantly lymphocytic infiltrate',
  'wavy nuclei',
  'yes',
  'no',
  'abdomen',
  'peritoneal carcinomatosis',
  'yes',
  'abdomen',
  'hemorrhage secondary to ruptured aneurysm',
  'yes',
  'no',
  'by predominantly lymphocytic infiltrate',
  'abdomen'],
 ['Reticulocytes',
  'Based on the provided question and the image, the answer',
  'Based on the description, it appears that the two small',
  'Based on the general knowledge of acute viral hepatitis, I',
  'Based on the provided information, it appears that the cells',
  "Based on the provided information, I'm unable to determine",
  'A histology question!\n\nYes, individual myocardial fibers',
  "I don't see an image provided. Could you please",
  "I don't see an image provided. Can you please",
  'Based on the image, it appears to show a condition',
  'Based on the provided text, it seems there is no',
  "I don't see an image provided. Could you please",
  "Based on the provided 

In [None]:
({'rouge1': 0.01923076923076923,
  'rouge2': 0.0,
  'rougeL': 0.01923076923076923,
  'rougeLsum': 0.01923076923076923},
 0.0,
 0.02013422818791946)

{'rouge1': 0.011363636363636364, 'rouge2': 0.0, 'rougeL': 0.011363636363636364, 'rougeLsum': 0.011363636363636364} 0.0 0.013605442176870753


In [51]:
len(all_gens), len(all_gt), len(all_gens_)

(16, 16, 16)

In [13]:
import nltk
nltk.download('wordnet')
import re
from torch.utils.data import DataLoader, Dataset
import evaluate
from nltk.translate.meteor_score import meteor_score, single_meteor_score
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

batch_size = 4

dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle = False, collate_fn=collate_fn)

for (i,batch) in enumerate(dataloader_val):
    print("i", i)
    print(batch['input_ids'][0], batch['labels'][0], batch['gt'][0])
    break

i 0
tensor([128000, 128000, 128006,   7998, 128007,    271,  16533,    279,   3488,
          3196,    389,    279,   3984,   7860,  14924,    567,    323,   2217,
            13,   7860,  14924,    567,     25,   1148,    617,   5675,    872,
         97192,   4710, 128256, 128009, 128006,   9399, 128007,    271,    818,
           332,  22761,   8839, 128009, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004]) tensor([128000, 128000, 128006,   7998, 128007,    271,  16533,    279,   3488,
          3196,    389,    279,   3984,   7860,  14924,    567,    323,   2217,
            13,   7860,  14924,    567,     25,   1148,    617,   5675,    872,
         97192,   4710,   -100, 128009, 128006,   9399, 128007,    271,    818,
           332,  22761,   8839, 128009,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100]) neutrophils


[nltk_data] Downloading package wordnet to /home/sa5u24/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/sa5u24/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [52]:
rouge_results = rouge.compute(predictions=all_gens, references=all_gt)
    
m_score=0
for line in zip(gt, all_gens):
    ref = word_tokenize(line[0])
    hypo = word_tokenize(line[1])
    m_score += meteor_score([ref], hypo)
meteors = m_score/len(all_gt)

bleu_score = corpus_bleu(all_gt, all_gens_, weights=(1.0, 0.0, 0.0, 0.0))
    
           

In [53]:
rouge_results, meteors, bleu_score

({'rouge1': 0.01923076923076923,
  'rouge2': 0.0,
  'rougeL': 0.01923076923076923,
  'rougeLsum': 0.01923076923076923},
 0.0,
 0.02013422818791946)

In [10]:
for i in range (len(all_gens)):
    print("prediction:",all_gens[i])
    print("gt:", all_gt[i] )

prediction: The image depicts a cell with a long tail and two bulbous structures at the end, which is characteristic of a neutrophil
gt: neutrophils
prediction: The image shows a neutrophil with its nucleus lost, which is a characteristic of apoptosis.
gt: neutrophils
prediction: **Step 1: Identify the condition**

The image shows a pulmonary arteriole with a large, pink, amorphous
gt: yes
prediction: **Step 1: Identify the key characteristics of the image.**

The image shows a predominantly lymphocytic infiltrate in the
gt: predominantly lymphocytic infiltrate
prediction: The image shows a microscopic view of cells with wavy nuclei. The cells are likely to be cancerous, as indicated by the
gt: wavy nuclei
prediction: The cells in the image have wavy nuclei.
gt: yes
prediction: To determine if individual myocardial fibers have wavy nuclei, we need to analyze the provided microscopic image. The image shows a microscopic
gt: no
prediction: question

Answer the question based on the provi

In [27]:
example = dataloader_val.dataset[0] 
texts = processor.apply_chat_template(example["messages"], tokenize=False)
gt = example["messages"][1]["content"][0]['text']
gt

'neutrophils'

In [6]:
len(all_gens), len(all_gt), all_gens[1], all_gt[1], unk_gt_count

(214,
 214,
 'The image shows people in the background, but they are not skiing. They are standing on a snowy hill, but there is no',
 'no',
 4)

In [7]:
rouge_results = rouge.compute(predictions=all_gens, references=all_gt)
rouge_results

{'rouge1': 0.1481178710314811,
 'rouge2': 0.0156487437795849,
 'rougeL': 0.14743072392524897,
 'rougeLsum': 0.1475788670914207}

In [8]:
from nltk.translate.meteor_score import meteor_score, single_meteor_score
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
m_score=0
for line in zip(all_gt, all_gens):
    ref = word_tokenize(line[0])
    hypo = word_tokenize(line[1])
    m_score += meteor_score([ref], hypo)

print("meteor_score", m_score/len(all_gt))

[nltk_data] Downloading package punkt_tab to /home/sa5u24/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


meteor_score 0.18524082550328952


In [9]:
from nltk.translate.bleu_score import corpus_bleu
bleu_score = corpus_bleu(all_gt_, all_gens_, weights=(1.0, 0.0, 0.0, 0.0))
print("bleu_score", bleu_score)

bleu_score 0.022238425082026978


In [None]:
with lora pre-trained weight on val dataset

{'rouge1': 0.15045387699529927,
 'rouge2': 0.014528630766948526,
 'rougeL': 0.14957247124561862,
 'rougeLsum': 0.1492740451009666}
meteor_score 0.18661183211828292
bleu_score 0.023290203327171907

with lora pre-trained weight on traindataset
{'rouge1': 0.14348497273817096,
 'rouge2': 0.023673930927080054,
 'rougeL': 0.1435822234212641,
 'rougeLsum': 0.14408427723443865}
meteor_score 0.19548155570972522
bleu_score 0.020076150917272415

zeroshot on val dataset
{'rouge1': 0.1481178710314811,
 'rouge2': 0.0156487437795849,
 'rougeL': 0.14743072392524897,
 'rougeLsum': 0.1475788670914207}
meteor_score 0.18524082550328952
bleu_score 0.022238425082026978

training 10 epochs and test on val_data
'rouge1': 0.14341272794751442,
  'rouge2': 0.016518267372332758,
  'rougeL': 0.14293373496184048,
  'rougeLsum': 0.14295980134037134},
 0.1821632742536163,
 0.02035006326444538

In [20]:
from nltk.translate.bleu_score import corpus_bleu

# List of reference sentences (ground truth)
references = [
    [["this", "is", "a", "test"]],  # Multiple references for the first candidate
    [["another", "test", "sentence"]]  # Multiple references for the second candidate
]

# List of candidate sentences (model output)
candidates = [
    ["this", "is", "a", "test"],  # Candidate sentence 1
    ["another", "test", "sentence"]  # Candidate sentence 2
]

# Calculate the BLEU score for the corpus
bleu_score = corpus_bleu(references, candidates)

print("Corpus BLEU score:", bleu_score)

Corpus BLEU score: 0.8408964152537145


In [21]:
import nltk
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.chrf_score import chrf_precision_recall_fscore_support
from nltk.translate.meteor_score import single_meteor_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

BLEU_1 = sentence_bleu([gt.split()], gen_ans.split(), weights=(1, 0, 0, 0))
BLEU_2 = sentence_bleu([gt.split()], gen_ans.split(), weights=(1, 1, 0, 0))
BLEU_3 = sentence_bleu([gt.split()], gen_ans.split(), weights=(1, 1, 1, 0))
BLEU_4 = sentence_bleu([gt.split()], gen_ans.split(), weights=(1, 1, 1, 1))

print('BLEU_1:%.4f, BLEU_2:%.4f, BLEU_3:%.4f, BLEU_4:%.4f'%(BLEU_1, BLEU_2, BLEU_3, BLEU_4))

prec, rec, f1, tp = chrf_precision_recall_fscore_support(gt.split(), gen_ans.split(), n=1)
print('prec:%.4f, rec:%.4f, f1:%.4f, tp:%.4f'%(prec, rec, f1, tp))

meteor = single_meteor_score(gt.split(), gen_ans.split())
print('Meteor:%.4f'%meteor)



[nltk_data] Downloading package wordnet to /home/sa5u24/nltk_data...


BLEU_1:0.0000, BLEU_2:0.0000, BLEU_3:0.0000, BLEU_4:0.0000
prec:0.0000, rec:0.0000, f1:0.0000, tp:0.0000
Meteor:0.0000


In [35]:
#single example for model generation

from PIL import Image
import requests

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
image = Image.open(requests.get(url, stream=True).raw)

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
    ]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
print("input_text", input_text)
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
).to(model.device)

print("inputs", inputs.keys(), inputs.input_ids.size(), inputs.pixel_values.size() )

output = model_lora.generate(**inputs, max_new_tokens=30)
print(processor.decode(output[0]))



input_text <|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>


inputs dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask']) torch.Size([1, 28]) torch.Size([1, 1, 4, 3, 560, 560])
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here is a haiku for the image:

Whiskers, ears so bright,
Hopping through the countryside,
Spring's gentle delight.<|eot_id|>


In [10]:

#single example for model generation

from PIL import Image
import requests

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
image = Image.open(requests.get(url, stream=True).raw)

messages1 = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
    ]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
print("input_text", input_text)
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
)

print(inputs.input_ids.shape, inputs.pixel_values.shape)


input_text <|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>


torch.Size([1, 28]) torch.Size([1, 1, 4, 3, 560, 560])
