In [1]:
!pip install --upgrade pip
!pip install -q accelerate -U
!pip install -q bitsandbytes -U
!pip install -q trl -U
!pip install -q peft -U
!pip install -q transformers -U
!pip install -q datasets -U
!pip install qwen-vl-utils

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed pip-25.0.1
Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.10-py3-none-any.whl.metadata (6.3 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-14.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Downloading qwen_vl_utils-0.0.10-py3-none-any.whl (6.7 kB)
Downloading av-14.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.7/39.7 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages

In [2]:
# System prompt placed in the "system" turn of every chat example built below.
# The four sentences are joined with newlines into one prompt string.
system_message = "\n".join(
    [
        "You are a Vision Language Model specialized in interpreting visual data from chart images.",
        "Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.",
        "The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.",
        "Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary.",
    ]
)

In [3]:
from datasets import load_dataset

# Hub identifier of the dataset used throughout this notebook.
dataset_id = "HuggingFaceM4/ChartQA"
# Load every split of the dataset; the result is inspected in the next cell.
dataset = load_dataset(path=dataset_id)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 28299/28299 [00:02<00:00, 11858.73 examples/s]
Generating val split: 100%|██████████| 1920/1920 [00:00<00:00, 17219.10 examples/s]
Generating test split: 100%|██████████| 2500/2500 [00:00<00:00, 12490.62 examples/s]


In [4]:
# Display the loaded DatasetDict to inspect its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'query', 'label', 'human_or_machine'],
        num_rows: 28299
    })
    val: Dataset({
        features: ['image', 'query', 'label', 'human_or_machine'],
        num_rows: 1920
    })
    test: Dataset({
        features: ['image', 'query', 'label', 'human_or_machine'],
        num_rows: 2500
    })
})

In [5]:
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, BitsAndBytesConfig,AutoModelForCausalLM,TrainingArguments

# Checkpoint that is loaded and fine-tuned in the following cells.
model_id = "Qwen/Qwen2-VL-7B-Instruct"

# Make GPU 0 the current CUDA device for the rest of the notebook.
torch.cuda.set_device(0)

# bitsandbytes settings passed to `from_pretrained` as `quantization_config`:
# 4-bit NF4 weights with double quantization and float16 compute.
bnb_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [6]:
# Device used by later cells; falls back to CPU when CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the quantized model. `device_map` already places every weight on GPU 0
# during loading, so no `.to(device)` is chained here: moving a
# bitsandbytes-quantized model with `.to()` is redundant and is rejected with a
# ValueError by transformers versions that do not support it.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map={"": 0},  # Explicitly map to GPU 0
    quantization_config=bnb_config,
)

# Processor bundling the tokenizer and image preprocessing for this checkpoint.
processor = Qwen2VLProcessor.from_pretrained(model_id)


Using device: cuda


Downloading shards: 100%|██████████| 5/5 [07:31<00:00, 90.25s/it] 
Loading checkpoint shards: 100%|██████████| 5/5 [00:17<00:00,  3.54s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
from qwen_vl_utils import process_vision_info

def collate_fn(data):
    """Convert one dataset row into chat-formatted model inputs plus loss labels.

    Works on a single example (the notebook passes it to ``Dataset.map``).

    Args:
        data: One row; reads ``data["image"]``, ``data["query"]`` and
            ``data["label"][0]``. The rendered chat string is written back to
            ``data["text"]`` as a side effect.

    Returns:
        The processor output (PyTorch tensors) with an added ``"labels"``
        entry: a copy of ``input_ids`` in which padding tokens and the
        image-related special tokens are replaced by -100.
    """
    message = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": data["image"]},
                {"type": "text", "text": data["query"]},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": data["label"][0]}],
        },
    ]

    # Stored on the input row on purpose: the mapped dataset exposes it as "text".
    data["text"] = processor.apply_chat_template(message, tokenize=False)
    image_inputs = [process_vision_info(message)[0]]

    # Use a separate name for the processor output instead of rebinding `data`.
    model_inputs = processor(
        text=data["text"],
        images=image_inputs,
        return_tensors="pt",
        padding=True,
    )

    labels = model_inputs["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    if isinstance(processor, Qwen2VLProcessor):
        # Look the special tokens up instead of hard-coding their IDs; for the
        # Qwen2-VL tokenizer these resolve to 151652, 151653 and 151655.
        image_tokens = processor.tokenizer.convert_tokens_to_ids(
            ["<|vision_start|>", "<|vision_end|>", "<|image_pad|>"]
        )
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]

    # Image placeholder positions must not contribute to the loss.
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100

    model_inputs["labels"] = labels
    return model_inputs

In [8]:
# Small subsets keep this run quick. The training subset is drawn from the
# "train" split (not "test") so held-out test examples never leak into training.
formatted_dataset = dataset["train"].select(range(50)).map(collate_fn)
# Evaluation subset, taken from the validation split.
formatted_dataset_test = dataset["val"].select(range(100)).map(collate_fn)

Map: 100%|██████████| 50/50 [00:04<00:00, 10.13 examples/s]
Map: 100%|██████████| 100/100 [00:11<00:00,  8.64 examples/s]


In [9]:
from qwen_vl_utils import process_vision_info

def collate_fn_new(batch):
    """Collate a list of dataset rows into one padded batch with loss labels.

    Args:
        batch: Iterable of rows; each row provides ``"image"``, ``"query"``
            and ``"label"`` (only ``label[0]`` is used as the answer).

    Returns:
        The processor output for the whole batch (PyTorch tensors, padded)
        with an added ``"labels"`` entry: a copy of ``input_ids`` in which
        padding tokens and the image-related special tokens are replaced by -100.
    """
    batch_texts = []
    batch_images = []

    for data in batch:
        message = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_message}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": data["image"]},
                    {"type": "text", "text": data["query"]},
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": data["label"][0]}],
            },
        ]

        # Render the conversation to text and collect its image input(s).
        batch_texts.append(processor.apply_chat_template(message, tokenize=False))
        batch_images.append(process_vision_info(message)[0])

    # Tokenize the texts and preprocess the images for the whole batch at once.
    processed_data = processor(
        text=batch_texts,
        images=batch_images,
        return_tensors="pt",
        padding=True,
    )

    labels = processed_data["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    if isinstance(processor, Qwen2VLProcessor):
        # Look the special tokens up instead of hard-coding their IDs; for the
        # Qwen2-VL tokenizer these resolve to 151652, 151653 and 151655.
        image_tokens = processor.tokenizer.convert_tokens_to_ids(
            ["<|vision_start|>", "<|vision_end|>", "<|image_pad|>"]
        )
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]

    # Image placeholder positions must not contribute to the loss.
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100

    processed_data["labels"] = labels
    return processed_data


In [10]:
# Inspect the mapped training subset: original columns plus the fields added by `collate_fn`.
formatted_dataset

Dataset({
    features: ['image', 'query', 'label', 'human_or_machine', 'text', 'input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw', 'labels'],
    num_rows: 50
})

In [11]:
from peft import LoraConfig, get_peft_model

# Configure LoRA: rank-8 adapters on the attention query/value projections.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Apply PEFT model adaptation.
# No `.to(device)` here: the quantized base model was already placed on the GPU
# when it was loaded, and bitsandbytes-quantized weights should not be moved
# again with `.to()`.
# NOTE(review): the SFTTrainer cell below is also given `peft_config`, so the
# adapter configuration is applied a second time there — confirm that is intended.
peft_model = get_peft_model(model, peft_config)

# Print trainable parameters
peft_model.print_trainable_parameters()

trainable params: 2,523,136 || all params: 8,293,898,752 || trainable%: 0.0304


In [12]:
from trl import SFTConfig

# Training hyperparameters.
# Effective batch size per device = per_device_train_batch_size (1)
# x gradient_accumulation_steps (32) = 32.
# NOTE(review): with the 50-example training subset this gives fewer than 10
# optimizer steps in total, so `eval_steps=10` / `warmup_steps=10` may never be
# reached — confirm these values once the full dataset is used.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    gradient_accumulation_steps=32,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=10,
    weight_decay=0.01,
    eval_strategy='steps',        # replaces the deprecated `evaluation_strategy`
    eval_steps=10,                # evaluate every 10 steps
    logging_steps=1,
    logging_strategy="steps",     # log at steps instead of silent mode
    gradient_checkpointing=True,  # recompute activations in the backward pass to save memory
    save_steps=500,               # checkpoint every 500 steps
    # The collator reads the raw "image"/"query"/"label" columns, so the trainer
    # must not drop them. Set at construction time instead of mutating the
    # arguments object afterwards.
    remove_unused_columns=False,
)



In [13]:
from trl import SFTTrainer

from datasets import Dataset

# Build the supervised fine-tuning trainer.
# `processing_class` replaces the deprecated `tokenizer` keyword, which emits a
# FutureWarning on this call and is rejected by newer TRL releases.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    eval_dataset=formatted_dataset_test,
    data_collator=collate_fn_new,  # batches are rebuilt from image/query/label
    peft_config=peft_config,
    processing_class=processor.tokenizer,
)

  trainer = SFTTrainer(
Converting train dataset to ChatML: 100%|██████████| 50/50 [00:07<00:00,  6.60 examples/s]
Applying chat template to train dataset: 100%|██████████| 50/50 [00:02<00:00, 16.77 examples/s]
Truncating train dataset: 100%|██████████| 50/50 [00:03<00:00, 12.66 examples/s]
Converting eval dataset to ChatML: 100%|██████████| 100/100 [00:09<00:00, 10.13 examples/s]
Applying chat template to eval dataset: 100%|██████████| 100/100 [00:06<00:00, 14.30 examples/s]
Truncating eval dataset: 100%|██████████| 100/100 [00:02<00:00, 43.96 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [14]:
import gc

# ✅ Check dataset before training
print(f"Training dataset size: {len(trainer.train_dataset)}")
print(f"Eval dataset size: {len(trainer.eval_dataset)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")

# ✅ Run a single batch manually to verify dataset & model
print("Testing a single batch forward pass...")
try:
    batch = next(iter(trainer.get_train_dataloader()))
    print("Batch keys:", batch.keys())  # Check if inputs are correctly formatted
    # This is only a smoke test, so run it without autograd: otherwise the
    # activation graph stays alive through `outputs` and competes with training
    # for GPU memory.
    with torch.no_grad():
        outputs = model(**batch)
    print("Single batch forward pass success.")
except Exception as e:
    # Best-effort check: report the problem but still attempt training below.
    print("Error in single batch forward pass:", e)
finally:
    # Drop the references (the names stay defined, as None) and return the
    # cached GPU blocks to the allocator before the real training run starts.
    batch = outputs = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("Starting training...")
trainer.train(resume_from_checkpoint=False)

# ✅ Print trainer state
print("Trainer state after training:")
print(trainer.state)

Training dataset size: 50
Eval dataset size: 100
Batch size: 1
Testing a single batch forward pass...
Batch keys: dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw', 'labels'])
Single batch forward pass success.
Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: CUDA out of memory. Tried to allocate 260.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 10.40 GiB is allocated by PyTorch, and 79.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# NOTE(review): the previous cell already calls trainer.train(resume_from_checkpoint=False);
# this unexecuted cell would launch training again on the same trainer — confirm it is still needed.
trainer.train()