In [1]:
!pip install --upgrade --quiet transformers datasets accelerate peft bitsandbytes pillow --no-deps

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import __version__ as transformers_version
from accelerate import __version__ as accelerate_version
from peft import __version__ as peft_version

def display_library_versions():
    """Print the installed versions of key libraries."""
    print(f"Transformers library version: {transformers_version}")
    print(f"Accelerate library version  : {accelerate_version}")
    print(f"PEFT library version        : {peft_version}")

if __name__ == "__main__":
    display_library_versions()

2025-05-13 00:34:18.332157: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747096458.519099      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747096458.573457      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Transformers library version: 4.51.3
Accelerate library version  : 1.6.0
PEFT library version        : 0.15.2


In [3]:
import os
from datasets import load_dataset, Features, Value, Image
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType,
)
import bitsandbytes as bnb

In [4]:
import torch
from PIL import Image
from datasets import load_dataset
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
)
from peft import LoraConfig, get_peft_model
from torch.nn.utils.rnn import pad_sequence

# ─── 1) Ingest CSV and split into train/validation ───────────────────────
def load_and_split(csv_path, test_frac=0.10, seed=42):
    dataset = load_dataset("csv", data_files=csv_path, split="train")
    shuffled = dataset.shuffle(seed=seed)
    return shuffled.train_test_split(test_size=test_frac)

splits = load_and_split("/kaggle/input/vqa-train-dataset/blip_vqa_train.csv")
train_ds, val_ds = splits["train"], splits["test"]
print(f"Training samples: {len(train_ds)} | Validation samples: {len(val_ds)}")

# ─── 2) Initialize processor and define constants ───────────────────────
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", use_fast=True)
IMAGE_ROOT = "/kaggle/input/abo-small/images/small"
PAD_ID = processor.tokenizer.pad_token_id

# ─── 3) Load model config and set decoder start token ────────────────────
model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    low_cpu_mem_usage=True
)
DECODER_START = model.config.decoder_start_token_id or processor.tokenizer.cls_token_id

Generating train split: 0 examples [00:00, ? examples/s]

Training samples: 67477 | Validation samples: 7498


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

In [5]:
# ─── FILTER OUT EMPTY ANSWERS ────────────────────────────────────────
def has_valid_answer(record):
    ans = record.get("answer", "")
    return bool(ans and ans.strip())

training_data   = train_ds.filter(has_valid_answer,    num_proc=1)
validation_data = val_ds.filter(has_valid_answer,      num_proc=1)

# ─── BATCH PREPROCESSING FUNCTION ───────────────────────────────────
def batch_preprocessor(samples):
    # tokenize the questions
    q_tokens = processor.tokenizer(
        samples["question"],
        truncation=True,
        max_length=128,
        padding=False,
    )
    # tokenize the answers
    a_tokens = processor.tokenizer(
        samples["answer"],
        truncation=True,
        max_length=32,
        padding=False,
    )
    labels = a_tokens["input_ids"]

    # build decoder inputs and masks
    decoder_inputs = [[DECODER_START] + lbl[:-1] for lbl in labels]
    decoder_masks  = [[1] * len(seq) for seq in decoder_inputs]

    return {
        "image_path":                    samples["image_path"],
        "input_ids":               q_tokens["input_ids"],
        "attention_mask":          q_tokens["attention_mask"],
        "labels":                  labels,
        "decoder_input_ids":       decoder_inputs,
        "decoder_attention_mask":  decoder_masks,
    }

# ─── APPLY MAPPING TO DATASETS ───────────────────────────────────────
tokenized_train = training_data.map(
    batch_preprocessor,
    batched=True,
    batch_size=32,
    num_proc=1,
    remove_columns=["image_path", "question", "answer"],
    load_from_cache_file=True,
    keep_in_memory=False,
)

tokenized_val = validation_data.map(
    batch_preprocessor,
    batched=True,
    batch_size=32,
    num_proc=1,
    remove_columns=["image_path", "question", "answer"],
    load_from_cache_file=True,
    keep_in_memory=False,
)

# ─── CONFIGURE TORCH FORMAT ─────────────────────────────────────────
for ds in (tokenized_train, tokenized_val):
    ds.set_format(
        type="torch",
        columns=[
            "image_path",
            "input_ids", "attention_mask",
            "labels",
            "decoder_input_ids", "decoder_attention_mask",
        ],
    )


Filter:   0%|          | 0/67477 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Map:   0%|          | 0/67467 [00:00<?, ? examples/s]

Map:   0%|          | 0/7498 [00:00<?, ? examples/s]

In [6]:
# ── 5. Model + PEFT (LoRA) setup
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,        # HuggingFace helper to reduce peak CPU RAM
).to(device)


peft_config = LoraConfig(
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["query", "value","key","dense"],
)

peft_model = get_peft_model(model, peft_config)
peft_model = peft_model.to(device)
peft_model.print_trainable_parameters()

trainable params: 7,692,288 || all params: 392,364,860 || trainable%: 1.9605


In [7]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
  tokenizer=processor.tokenizer,
  model=peft_model,            # you’ve already provided decoder_input_ids
  label_pad_token_id=-100,
  padding="longest",           # dynamic pad each training batch
  return_tensors="pt",
)

In [8]:
from transformers import DataCollatorWithPadding

data_collator_pad = DataCollatorWithPadding(
    tokenizer=processor.tokenizer,
    padding="longest",      # pad each batch to its own max length
    return_tensors="pt",    # return PyTorch tensors
)

In [9]:
from torch.nn.utils.rnn import pad_sequence
from PIL import Image
import os

def collate_fn(batch):
    # A) load & preprocess *this batch* of images on CPU
    imgs = [
        Image.open(os.path.join(IMAGE_ROOT, ex["image_path"])).convert("RGB")
        for ex in batch
    ]
    # processor call returns CPU tensors
    pixel_values = processor(
        images=imgs,
        return_tensors="pt",
    ).pixel_values  # shape: (B, C, H, W), on CPU

    # B) gather & pad text fields (all on CPU)
    input_ids         = [torch.tensor(f["input_ids"],         dtype=torch.long) for f in batch]
    attention_mask    = [torch.tensor(f["attention_mask"],    dtype=torch.long) for f in batch]
    labels            = [torch.tensor(f["labels"],            dtype=torch.long) for f in batch]
    decoder_input_ids = [torch.tensor(f["decoder_input_ids"], dtype=torch.long) for f in batch]
    decoder_attn_mask = [torch.tensor(f["decoder_attention_mask"], dtype=torch.long) for f in batch]

    input_ids         = pad_sequence(input_ids,        batch_first=True, padding_value=PAD_ID)
    attention_mask    = pad_sequence(attention_mask,   batch_first=True, padding_value=0)
    labels            = pad_sequence(labels,           batch_first=True, padding_value=-100)
    decoder_input_ids = pad_sequence(decoder_input_ids, batch_first=True, padding_value=PAD_ID)
    decoder_attn_mask = pad_sequence(decoder_attn_mask, batch_first=True, padding_value=0)

    return {
        "pixel_values":           pixel_values,
        "input_ids":              input_ids,
        "attention_mask":         attention_mask,
        "labels":                 labels,
        "decoder_input_ids":      decoder_input_ids,
        "decoder_attention_mask": decoder_attn_mask,
    }

In [10]:
import os, warnings

# 1) Turn off tokenizers’ internal thread-pool when forked
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 2) Suppress HF “attention_mask” reminder
warnings.filterwarnings(
    "ignore",
    message=r"We strongly recommend passing in an `attention_mask`"
)

# 3) Suppress the DataParallel gather warning
warnings.filterwarnings(
    "ignore",
    message=r"Was asked to gather along dimension 0, but all input tensors were scalars"
)

warnings.filterwarnings(
    "ignore",
    message="To copy construct from a tensor.*"
)


In [11]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback, TrainerState, TrainerControl
from transformers import EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="vqa_peft_out",

    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,

    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    num_train_epochs=5,

    # ← run eval & save every 500 steps (instead of only at epoch end)
    do_eval=True,

    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,

    # ← logging every 20 steps
    logging_strategy="steps",
    logging_steps=50,
    logging_first_step=True,

    # ← rolling back to the best checkpoint by eval_loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # ← actually generate answers to compute your metric
    predict_with_generate=True,
    generation_max_length=32,

    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    remove_unused_columns=False,
    report_to=["none"],

    label_names=["labels"],
    dataloader_prefetch_factor=2,

    
    bf16=True,
)


# 3) Instantiate your Trainer, adding PrintLogsCallback *before* the bar redraws
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=collate_fn,
    tokenizer=processor.tokenizer,
    callbacks=[ EarlyStoppingCallback(early_stopping_patience=3) ],
)

  trainer = Seq2SeqTrainer(


In [12]:
trainer.train()

Step,Training Loss,Validation Loss
200,2.0958,2.007812
400,1.6809,1.598633
600,1.4368,1.449219
800,1.3529,1.357422
1000,1.3289,1.293945
1200,1.2315,1.246094
1400,1.1398,1.207031
1600,1.0992,1.172852
1800,1.1194,1.151367
2000,1.0409,1.131836


TrainOutput(global_step=5275, training_loss=1.1406139384626777, metrics={'train_runtime': 9624.7015, 'train_samples_per_second': 35.049, 'train_steps_per_second': 0.548, 'total_flos': 7913666491933584.0, 'train_loss': 1.1406139384626777, 'epoch': 5.0})

In [13]:
tokenized_val[0]

{'image_path': '68/68b7a03d.jpg',
 'input_ids': tensor([ 101, 2054, 2828, 1997, 5860, 1029,  102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([ 101, 3729, 1011, 1054,  102]),
 'decoder_input_ids': tensor([ 101,  101, 3729, 1011, 1054]),
 'decoder_attention_mask': tensor([1, 1, 1, 1, 1])}

In [14]:
# 8) Merge your PEFT adapters & save a standalone model + processor
merged = trainer.model.merge_and_unload()
merged.save_pretrained("vqa-blip-base-final-model")
processor.save_pretrained("vqa-blip-base-final-model")


[]