## Setup

In [None]:
!pip install torch==2.6.0 torchvision torchaudio


In [None]:
!pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.3.12/flash_attn-2.8.0+cu124torch2.6-cp310-cp310-linux_x86_64.whl


In [None]:
!pip install -U trl>=0.9.6 transformers>=4.42 peft>=0.12.0 accelerate>=0.33.0 bitsandbytes>=0.43.3 datasets>=2.18 qwen-vl-utils pillow


In [None]:
!pip install -U scipy

In [6]:
import os, torch
from datasets import load_dataset
import torch.nn.functional as F
from transformers import (AutoProcessor, AutoTokenizer, BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor, EarlyStoppingCallback)
from qwen_vl_utils import process_vision_info
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTConfig, SFTTrainer
from PIL import Image, ImageOps
import re

  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing

In [2]:
# --------- Config ---------
MODEL_ID    = "Qwen/Qwen2.5-VL-7B-Instruct"
DATASET_REPO = "AI-4-Everyone/Visual-TableQA"
#DATASET_REPO = "hewei2001/ReachQA"
OUTPUT_DIR_TIER_A   = "qwen-vl-sft-lora-tableqa-tierA" #for Visual-TableQA
#OUTPUT_DIR_TIER_A   = "qwen-vl-sft-lora-reachqa-tierA" #for ReachQA

# --------- Load data ---------
ds = load_dataset(DATASET_REPO)

"""
from datasets import ClassLabel
labels = sorted(set(ds["train"]["qa_type"]))
ds = ds.cast_column("qa_type", ClassLabel(names=labels))
split = ds["train"].train_test_split(test_size=0.1, seed=42, stratify_by_column="qa_type")
ds["train"], ds["validation"] = split["train"], split["test"]
"""

train = ds.get("train")

evald = ds.get("validation")


In [3]:
system_message = """You are a Vision Language Model specialized in interpreting visual data from charts and diagrams images.
Answer the questions strictly from the image, with clear, rigorous step-by-step justification. Stay concise, but include all reasoning that’s relevant."""

In [4]:
def to_pil(img):
    return img if isinstance(img, Image.Image) else Image.fromarray(img)

In [5]:
def format_data(sample):
    return [
        {"role": "system",
         "content": [{"type": "text", "text": system_message}],
        },
        {"role": "user",
         "content": [
                {"type": "image",
                 "image": to_pil(sample["image"]),},
                {"type": "text",
                 "text": sample["question"],
                },
            ],
        },
        {"role": "assistant",
         "content": [{"type": "text", "text": sample["answer"]}],
        },
    ]

In [6]:
def collate_fn(examples):
    
    msgs = [format_data(sample) for sample in examples]
    
    texts = [processor.apply_chat_template(m, tokenize=False, add_generation_prompt=True) for m in msgs]
        
    image_inputs, _ = process_vision_info(msgs)
    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True, truncation=False, max_length=None,)

    labels = batch["input_ids"].detach().clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    
    # collect image/video token ids in a model-agnostic way
    img_ids = set()
    for attr in ("image_token_id", "video_token_id"):
        v = getattr(model.config, attr, None)
        if v is not None:
            img_ids.add(v)
    # fallback to common special tokens if present
    for tok in ("<image>", "<img>", "<video>", '<|vision_start|>', '<|vision_end|>'):
        tid = processor.tokenizer.convert_tokens_to_ids(tok)
        if tid not in (None, -1, processor.tokenizer.unk_token_id):
            img_ids.add(tid)

    if img_ids:
        mask = torch.isin(labels, torch.tensor(sorted(img_ids), dtype=labels.dtype, device=labels.device))
        labels[mask] = -100
        
    batch["labels"] = labels
    return batch

In [7]:
use_bf16 = torch.cuda.is_bf16_supported()

## Tier A

In [None]:
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16 if use_bf16 else torch.float16,
    # Optional (if your setup supports it):
    attn_implementation="flash_attention_2",
    low_cpu_mem_usage=True,
)
model.config.use_cache = False                      # crucial for training memory
model.gradient_checkpointing_enable()
model.config.pretraining_tp = 1  # ensure tensor parallelism is disabled

min_pixels = 256*28*28
max_pixels = 2560*28*28
processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)

In [15]:
# --------- LoRA config (r=16, alpha=8) ---------
TARGETS = [
  "q_proj", "v_proj", "k_proj","o_proj",
  "gate_proj", "up_proj", "down_proj",
  # optional but sometimes helpful
  #"lm_head",
  "multi_modal_projector",   # Qwen2 / Qwen2.5 VL (HF naming)
  ]

r, lora_alpha = 16, 8

peft_cfg = LoraConfig(r=r, lora_alpha=lora_alpha, lora_dropout=0.05, bias="none",
                      target_modules=TARGETS, task_type="CAUSAL_LM")

In [16]:
# Apply PEFT model adaptation

peft_model = get_peft_model(model, peft_cfg)

# Print trainable parameters

peft_model.print_trainable_parameters()

trainable params: 47,589,376 || all params: 8,339,756,032 || trainable%: 0.5706


In [17]:
from trl import SFTConfig
from transformers import EarlyStoppingCallback

args = SFTConfig(
    output_dir=OUTPUT_DIR_TIER_A,

    num_train_epochs=2,       #1 for ReachQA due to datasets sizes difference 

    # ---- batch / memory ----
    per_device_train_batch_size=3,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,   # global batch ~= 24
    dataloader_pin_memory=True,

    # ---- stability & speed ----
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    bf16=use_bf16,
    fp16=not use_bf16,
    tf32=True,

    # ---- optimization (LoRA-friendly) ----
    learning_rate=1e-4,             
    lr_scheduler_type="cosine",
    warmup_ratio=0.07,              # 5–10% warmup works well
    max_grad_norm=1.0,              
    weight_decay=0.01,              
    optim="adamw_torch_fused",
    adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-8,

    # ---- logging / eval / save ----
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=100,                  
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=3,

    remove_unused_columns=False,
    dataset_kwargs={"skip_prepare_dataset": True},
    dataset_text_field="",  # keep TRL from looking for "text"
)

# Add early stopping (patience 2 evals)
callbacks = [EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)]


### Training

In [18]:
trainer = SFTTrainer(
    model=peft_model,
    args=args,
    train_dataset=train,
    eval_dataset=evald,
    data_collator=collate_fn,
    callbacks=callbacks,
)

In [None]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
100,0.6018,0.627113,3.011601,5979027.0,0.827814
200,0.5863,0.606202,3.059967,11945729.0,0.830864


In [None]:
trainer.save_model(OUTPUT_DIR_TIER_A)      # saves the adapters
processor.save_pretrained(OUTPUT_DIR_TIER_A)

#### Resume training

In [19]:
trainer.train(resume_from_checkpoint=True) 

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
500,0.5389,0.577489,3.019494,5962153.0,0.834886


TrainOutput(global_step=549, training_loss=0.14997396399631743, metrics={'train_runtime': 13927.6756, 'train_samples_per_second': 0.946, 'train_steps_per_second': 0.039, 'total_flos': 1.706012246911703e+18, 'train_loss': 0.14997396399631743, 'epoch': 1.9981785063752278})

In [None]:
trainer.save_model(OUTPUT_DIR_TIER_A)      # saves the adapters
processor.save_pretrained(OUTPUT_DIR_TIER_A)

## B Tiers

In [8]:
OUTPUT_DIR_TIER_B= "qwen-vl-sft-lora-tableqa-tierB" 
#OUTPUT_DIR_TIER_B= "qwen-vl-sft-lora-reachqa-tierB" #for ReachQA
MERGED_DIR = "qwen-vl-merged-tierA-tableqa"             # <- new folder to save merged base
#MERGED_DIR = "qwen-vl-merged-tierA-reachqa"             # #for ReachQA

### Merge Tier A adapter and base model (run once)

In [14]:
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16 if use_bf16 else torch.float16,
    # Optional (if your setup supports it):
    attn_implementation="flash_attention_2",
    low_cpu_mem_usage=True,
)
model.config.use_cache = False                      
model.gradient_checkpointing_enable()
model.config.pretraining_tp = 1  

`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 5 files: 100%|██████████| 5/5 [00:50<00:00, 10.10s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.44it/s]


In [15]:
peft_model = PeftModel.from_pretrained(model, OUTPUT_DIR_TIER_A, is_trainable=False)
merged = peft_model.merge_and_unload() 
merged.save_pretrained(MERGED_DIR)

### (re)load merged base for tier B phase

In [9]:
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MERGED_DIR, device_map="auto", torch_dtype="auto",
    attn_implementation="flash_attention_2", low_cpu_mem_usage=True
)
model.config.use_cache = False
model.gradient_checkpointing_enable()
model.config.pretraining_tp = 1

min_pixels = 256*28*28
max_pixels = 2560*28*28
processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [10]:
tierB_cfg = LoraConfig(
    r=8, lora_alpha=32, lora_dropout=0.10, bias="none",
    task_type="CAUSAL_LM",
    target_modules=["attn.qkv", "attn.proj"],
)

peft_model = get_peft_model(model, tierB_cfg)

In [11]:
for _, p in peft_model.named_parameters():
    p.requires_grad_(False)

# capture indices from your paths: model.visual.blocks.<idx>.
layer_re = re.compile(r"model\.visual\.blocks\.(\d+)\.")

# discover how many blocks the vision tower has
vision_layers = sorted({int(m.group(1)) for n, _ in peft_model.named_modules()
                        if (m := layer_re.search(n))})
N = 4                                 # tune last 4; adjust 2–6 if needed
lastN = set(vision_layers[-N:])

def is_lastN_vision_attn_lora(name: str) -> bool:
    if "lora_" not in name:
        return False
    m = layer_re.search(name)
    if not m:
        return False
    idx = int(m.group(1))
    if idx not in lastN:
        return False
    # only attention projections
    return ("attn.qkv" in name) or ("attn.proj" in name)

for n, p in peft_model.named_parameters():
    if is_lastN_vision_attn_lora(n):
        p.requires_grad_(True)

#(Optional) keep the multimodal projector from Tier A trainable a bit more
for n, p in peft_model.named_parameters():
    if "multi_modal_projector" in n and ("lora_" in n) and ("default" in n):
        p.requires_grad_(True)

peft_model.print_trainable_parameters()

trainable params: 245,760 || all params: 8,294,132,736 || trainable%: 0.0030


In [12]:
from trl import SFTConfig, SFTTrainer
from transformers import EarlyStoppingCallback

args_tierB = SFTConfig(
    output_dir=OUTPUT_DIR_TIER_B,
    num_train_epochs=1,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    bf16=use_bf16, fp16=not use_bf16, tf32=True,

    learning_rate=2e-5,            # ↓ a bit for vision LoRA; safe starting point
    lr_scheduler_type="cosine",
    warmup_ratio=0.07,
    max_grad_norm=1.0,
    weight_decay=0.01,
    optim="adamw_torch_fused",

    logging_steps=20,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=3,

    remove_unused_columns=False,
    dataset_kwargs={"skip_prepare_dataset": True},
    dataset_text_field="",
)

callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]

In [13]:
trainer = SFTTrainer(
    model=peft_model,
    args=args_tierB,
    train_dataset=train,
    eval_dataset=evald,
    data_collator=collate_fn,
    processing_class=processor,
    callbacks=callbacks,
)

In [14]:
trainer.train()
trainer.save_model(OUTPUT_DIR_TIER_B)      # saves the adapters
processor.save_pretrained(OUTPUT_DIR_TIER_B)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
100,0.5721,0.607478,2.971453,5979027.0,0.831044
200,0.5843,0.607474,2.972311,11945729.0,0.831172


[]

## Final Model Loading and Inferance

In [14]:
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MERGED_DIR, device_map="auto", torch_dtype="auto",
    attn_implementation="flash_attention_2", low_cpu_mem_usage=True
)

model = PeftModel.from_pretrained(base, OUTPUT_DIR_TIER_B)

del base                       
torch.cuda.empty_cache()    
model.eval()
min_pixels = 256*28*28
max_pixels = 2560*28*28
processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)

`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 5 files: 100%|██████████| 5/5 [00:59<00:00, 11.94s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.35it/s]


In [17]:
def generate_text_from_sample(model, sample, max_new_tokens=5000, device="cuda"):
    # Prepare the text input by applying the chat template
    text_input = processor.apply_chat_template(format_data(sample)[:2], tokenize=False, add_generation_prompt=True)
    # Process the visual input from the sample
    image_inputs, _ = process_vision_info(format_data(sample)[:2])
    
    # Prepare the inputs for the model
    model_inputs = processor(text=[text_input], images=image_inputs, return_tensors="pt").to(device)  # Move inputs to the specified device

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]  # Return the first decoded output text

In [None]:
output = generate_text_from_sample(model, train[0])
output