In [1]:
#!/usr/bin/env python3
import os
# set CUDA_VISIBLE_DEVICES=1
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import json
import torch
import argparse
from tqdm import tqdm
import pandas as pd
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoProcessor, BitsAndBytesConfig, Qwen2VLForConditionalGeneration, HfArgumentParser, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path="/data1/dnapolitano/MM/VQAv2/validation.json"
image_path="/data2/dnapolitano/MM/data/VQA/Images/mscoco/val2014"

In [3]:
with open(data_path, "r") as f:
    data = json.load(f)
len(data)

88319

In [4]:
from transformers import BitsAndBytesConfig, AutoProcessor, AutoConfig, Qwen2_5_VLForConditionalGeneration

In [5]:
model_path="/data2/dnapolitano/MM/scripts/Qwen2-VL-Finetune/output/lora_vision_test"

In [6]:
model_base="Qwen/Qwen2.5-VL-3B-Instruct"

In [13]:
lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
if hasattr(lora_cfg_pretrained, 'quantization_config'):
    del lora_cfg_pretrained.quantization_config
# processor = AutoProcessor.from_pretrained(model_base)
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    model_base, min_pixels=min_pixels, max_pixels=max_pixels
)

In [19]:
kwargs = {"device_map": "auto"}
kwargs['torch_dtype'] = torch.float16
kwargs['attn_implementation'] = 'flash_attention_2'
kwargs['cache_dir'] = "/data1/hf_cache/models"

In [20]:
print('Loading Qwen2.5-VL from base model...')

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)

Loading Qwen2.5-VL from base model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.16s/it]


In [21]:
token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
if model.lm_head.weight.shape[0] != token_num:
    model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
    model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))

In [22]:
print('Loading additional Qwen2.5-VL weights...')
non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_state_dict.bin'), map_location='cpu')
non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
if any(k.startswith('model.model.') for k in non_lora_trainables):
    non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
model.load_state_dict(non_lora_trainables, strict=False)

Loading additional Qwen2-VL weights...


_IncompatibleKeys(missing_keys=['visual.blocks.0.attn.qkv.weight', 'visual.blocks.0.attn.qkv.bias', 'visual.blocks.0.attn.proj.weight', 'visual.blocks.0.attn.proj.bias', 'visual.blocks.0.mlp.gate_proj.weight', 'visual.blocks.0.mlp.gate_proj.bias', 'visual.blocks.0.mlp.up_proj.weight', 'visual.blocks.0.mlp.up_proj.bias', 'visual.blocks.0.mlp.down_proj.weight', 'visual.blocks.0.mlp.down_proj.bias', 'visual.blocks.1.attn.qkv.weight', 'visual.blocks.1.attn.qkv.bias', 'visual.blocks.1.attn.proj.weight', 'visual.blocks.1.attn.proj.bias', 'visual.blocks.1.mlp.gate_proj.weight', 'visual.blocks.1.mlp.gate_proj.bias', 'visual.blocks.1.mlp.up_proj.weight', 'visual.blocks.1.mlp.up_proj.bias', 'visual.blocks.1.mlp.down_proj.weight', 'visual.blocks.1.mlp.down_proj.bias', 'visual.blocks.2.attn.qkv.weight', 'visual.blocks.2.attn.qkv.bias', 'visual.blocks.2.attn.proj.weight', 'visual.blocks.2.attn.proj.bias', 'visual.blocks.2.mlp.gate_proj.weight', 'visual.blocks.2.mlp.gate_proj.bias', 'visual.blocks.2

In [24]:
from peft import PeftModel

In [25]:
print('Loading LoRA weights...')
model = PeftModel.from_pretrained(model, model_path)

This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



Loading LoRA weights...


In [26]:
print('Merging LoRA weights...')
model = model.merge_and_unload()

Merging LoRA weights...


In [None]:
# predictions = {}
# ground_truth = {}
# for el in tqdm(data):
#     # print(el)
#     question_id = el["id"]
#     image = el["image"]
#     conversation = el["conversations"]
#     ground_truth[question_id] = conversation[-1]["value"]
#     question = conversation[0]["value"]
#     question = question.replace("<image>\n", "")
#     file_path = image_path + "/" + image
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "image", "image": f"{file_path}"},
#                 {"type": "text", "text": f"Provide a direct answer to the following question:\n {question}"},
#             ]
#         }
#     ]
#     # print(messages)
#     # Preparation for inference
#     text = processor.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )
#     image_inputs, video_inputs = process_vision_info(messages)
#     inputs = processor(
#         text=[text],
#         images=image_inputs,
#         videos=video_inputs,
#         padding=True,
#         return_tensors="pt",
#     )
#     inputs = inputs.to("cuda")

#     # Inference: Generation of the output
#     generated_ids = model.generate(**inputs, max_new_tokens=128)
#     generated_ids_trimmed = [
#         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
#     ]
#     output_text = processor.batch_decode(
#         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
#     )[0]
#     print(f"Question: {question}")
#     print(f"Answer: {output_text}")
#     print(f"Ground Truth: {ground_truth[question_id]}")
#     predictions[question_id] = output_text[0]
#     # print("-----------------------------------------------------")
#     break

  0%|          | 0/88319 [00:01<?, ?it/s]

Question: Why is there a gap between the roof and wall?
Answer: for ventilation
Ground Truth: yes





In [None]:
import torch
from tqdm import tqdm

# Define batch size
BATCH_SIZE = 128  # Adjust based on your GPU memory
# tokenizer.padding_side  = 'left'
processor.tokenizer.padding_side = "left"

predictions = {}
ground_truth = {}
questions = {}

# Process data in batches
for i in tqdm(range(484, len(data), BATCH_SIZE)):
    batch_data = data[i:i+BATCH_SIZE]
    
    batch_messages = []
    batch_ids = []
    
    # Prepare batch inputs
    for el in batch_data:
        question_id = el["id"]
        image = el["image"]
        conversation = el["conversations"]
        
        # Store ground truth
        ground_truth[question_id] = conversation[-1]["value"]
        
        # Prepare question
        question = conversation[0]["value"]
        question = question.replace("<image>\n", "")
        questions[question_id]=question
        file_path = image_path + "/" + image
        
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"{file_path}"},
                    {"type": "text", "text": f"Provide a direct answer to the following question:\n {question}"},
                ]
            }
        ]
        
        batch_messages.append(messages)
        batch_ids.append(question_id)
    
    # Process batch
    batch_texts = [
        processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        ) for messages in batch_messages
    ]
    
    # Process all images and videos in the batch
    # batch_image_inputs = []
    # batch_video_inputs = []
    
    # for messages in batch_messages:
    #     image_inputs, video_inputs = process_vision_info(messages)
    #     batch_image_inputs.append(image_inputs[0] if image_inputs else None)
    #     batch_video_inputs.extend(video_inputs)
    batch_image_inputs, batch_video_inputs = process_vision_info(batch_messages)
    
    # Handle inputs with processor
    inputs = processor(
        text=batch_texts,
        images=batch_image_inputs, # if batch_image_inputs else None,
        videos=batch_video_inputs, # if batch_video_inputs else None,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    
    # Batch inference
    with torch.no_grad():  # Add this to save memory
        generated_ids = model.generate(**inputs, max_new_tokens=10)
    
    # Process generated outputs
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    
    # Store predictions
    for idx, question_id in enumerate(batch_ids):
        if idx < len(output_texts):
            # Make sure we're getting the first element if output_texts[idx] is a list
            if isinstance(output_texts[idx], list):
                predictions[question_id] = output_texts[idx][0]
            else:
                predictions[question_id] = output_texts[idx]
    # break