In [28]:
# Imported locally so this cell also runs on a fresh kernel: no cell above it
# has brought `os` into scope yet.
import os

# Request expandable segments from PyTorch's CUDA caching allocator.
# NOTE(review): presumably this has to be set before CUDA memory is first
# allocated to take effect — confirm against the PyTorch memory notes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [29]:
import os
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
from PIL import Image
import json

# Reduce fragmentation: configure the CUDA caching allocator via its
# environment variable. This plain assignment replaces whatever value the
# variable already held.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# Report whether a GPU is usable and, if so, which one.
cuda_ok = torch.cuda.is_available()
print("✅ CUDA available:", cuda_ok)
if cuda_ok:
    print("🖥️ Device:", torch.cuda.get_device_name(0))

✅ CUDA available: True
🖥️ Device: NVIDIA GeForce RTX 3080 Laptop GPU


In [30]:
class OKVQADataset(Dataset):
    """Map-style dataset pairing COCO images with a question and one answer.

    A sample is created for every question whose ``question_id`` has an
    annotation with at least one answer and whose image file exists in
    ``image_folder``; all other questions are skipped silently. Only the
    first listed answer of each annotation is used as the target.
    """

    def __init__(self, questions_path, annotations_path, image_folder, processor,
                 max_samples=8000):
        """Index up to ``max_samples`` usable (image, question, answer) samples.

        Args:
            questions_path: JSON file with a top-level ``"questions"`` list of
                dicts carrying ``question_id``, ``image_id`` and ``question``.
            annotations_path: JSON file with a top-level ``"annotations"`` list
                of dicts carrying ``question_id`` and ``answers``.
            image_folder: Directory holding the ``.jpg`` files. File names get
                the ``COCO_train2014`` prefix when this path contains
                ``"train"`` and ``COCO_val2014`` otherwise.
            processor: Callable used by ``__getitem__`` to encode each sample;
                it is only stored here.
            max_samples: Maximum number of samples to keep, in question order.
                Defaults to 8000 (the former hard-coded limit); ``None`` keeps
                every usable sample. Expected to be non-negative.
        """
        self.processor = processor
        self.image_folder = image_folder

        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(questions_path, "r", encoding="utf-8") as f:
            questions = json.load(f)["questions"]
        with open(annotations_path, "r", encoding="utf-8") as f:
            annotations = json.load(f)["annotations"]

        # First answer per question; annotations without any answer are
        # dropped here so their questions are skipped instead of crashing.
        answers_by_qid = {
            a["question_id"]: a["answers"][0]["answer"]
            for a in annotations
            if a["answers"]
        }

        # The prefix depends only on the folder, so compute it once.
        prefix = "COCO_train2014" if "train" in image_folder else "COCO_val2014"

        self.samples = []
        for q in questions:
            # Stop as soon as the limit is reached; this yields the same
            # samples as building the full list and slicing, without touching
            # the disk for questions that would be discarded anyway.
            if max_samples is not None and len(self.samples) >= max_samples:
                break

            qid = q["question_id"]
            if qid not in answers_by_qid:
                continue

            filename = f"{prefix}_{q['image_id']:012}.jpg"
            path = os.path.join(image_folder, filename)
            if not os.path.exists(path):
                continue

            self.samples.append({
                "image_path": path,
                "question": q["question"],
                "answer": answers_by_qid[qid]
            })

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load and encode the sample at ``idx``.

        Returns:
            A dict with every entry of the processor output, each squeezed
            along dimension 0 (the processor is asked for ``"pt"`` tensors).
        """
        item = self.samples[idx]

        # The context manager closes the underlying file deterministically;
        # convert()/resize() return new in-memory images that outlive it.
        # NOTE(review): the processor may apply its own resize afterwards —
        # confirm this 224x224 pre-resize is intended.
        with Image.open(item["image_path"]) as img:
            image = img.convert("RGB").resize((224, 224))

        # NOTE(review): presumably `text_target` makes the tokenizer emit the
        # encoded answer as `labels`, padded to max_length with pad tokens —
        # confirm for the installed transformers version, including whether
        # those pad positions count towards the loss.
        encoding = self.processor(
            image,
            item["question"],
            text_target=item["answer"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=32
        )

        # Fall back to deriving the mask from padding when none is returned.
        if "attention_mask" not in encoding:
            input_ids = encoding["input_ids"]
            pad_token_id = self.processor.tokenizer.pad_token_id
            encoding["attention_mask"] = (input_ids != pad_token_id).long()

        return {k: v.squeeze(0) for k, v in encoding.items()}

In [31]:
# Inputs for the training split: question file, annotation file, image folder.
questions_path = "OpenEnded_mscoco_train2014_questions.json"
annotations_path = "mscoco_train2014_annotations.json"
image_folder = "train2014"

# Processor loaded from the pretrained BLIP VQA base checkpoint.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Index the usable samples; each one is encoded lazily through `processor`.
dataset = OKVQADataset(
    questions_path=questions_path,
    annotations_path=annotations_path,
    image_folder=image_folder,
    processor=processor,
)

In [32]:
import torch

# Prefer the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# No earlier cell creates `model`, so on a fresh kernel this cell used to fail
# with a NameError. Load the pretrained base weights (same checkpoint name the
# processor is loaded from) only when no model exists yet; an already-loaded
# model is left untouched.
if "model" not in globals():
    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

model.to(device)

print("✅ Moved model to:", next(model.parameters()).device)

✅ Moved model to: cuda:0


In [33]:
import torch

# Query availability once and reuse the answer for the report below.
has_cuda = torch.cuda.is_available()
print("🔍 CUDA available:", has_cuda)

if not has_cuda:
    print("⚠️ No CUDA GPU detected — running on CPU.")
else:
    print("✅ CUDA device name:", torch.cuda.get_device_name(0))
    total_memory_mb = torch.cuda.get_device_properties(0).total_memory // (1024 * 1024)
    print("💾 Total GPU memory (MB):", total_memory_mb)
    print("🆔 CUDA device index:", torch.cuda.current_device())


# `model` is not created in this cell; it has to exist in the kernel already.
print("Model is on:", next(model.parameters()).device)


🔍 CUDA available: True
✅ CUDA device name: NVIDIA GeForce RTX 3080 Laptop GPU
💾 Total GPU memory (MB): 8191
🆔 CUDA device index: 0
Model is on: cuda:0


In [34]:
# ========================
# 3. Load dataset
# ========================
# Training-split inputs; set image_folder to "val2014" if you're using the
# validation set.
questions_path = "OpenEnded_mscoco_train2014_questions.json"
annotations_path = "mscoco_train2014_annotations.json"
image_folder = "train2014"

# (Re)build `dataset` from the paths above with the existing `processor`.
dataset = OKVQADataset(
    questions_path=questions_path,
    annotations_path=annotations_path,
    image_folder=image_folder,
    processor=processor,
)

In [35]:
import os

# Spot-check that one known training image sits where the file-name scheme
# (prefix + 12-digit zero-padded id) says it should.
sample_id = 9  # From COCO_train2014_000000000009.jpg
filename = "COCO_train2014_{:012}.jpg".format(sample_id)
path = os.path.join("train2014", filename)
print("Exists:", os.path.exists(path))


Exists: True


In [36]:
# Hyperparameters gathered in one mapping so they can be inspected before the
# arguments object is built from them.
training_config = {
    "output_dir": "./blip-vqa-checkpoints",   # Folder for saving checkpoints
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 4,                    # Update if you want more epochs
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "fp16": True,
    "logging_steps": 10,
    "save_steps": 500,
    # The dataset yields ready-made tensors, so no column is to be dropped.
    "remove_unused_columns": False,
    "report_to": "none",
    "lr_scheduler_type": "cosine",
}

training_args = TrainingArguments(**training_config)

In [38]:
from transformers import BlipForQuestionAnswering
import gc
import torch

# Checkpoint used both for the initial weights and for resuming the run.
CHECKPOINT_DIR = "./blip-vqa-checkpoints/checkpoint-2000"

# Release any model/trainer left over from earlier cells BEFORE loading a new
# one. While those names are still bound their tensors stay alive, so
# collecting garbage or emptying the cache alone frees nothing and the new
# model ends up on the GPU next to the old one.
for _stale_name in ("trainer", "model"):
    globals().pop(_stale_name, None)
gc.collect()               # first reclaim the now-unreferenced objects
torch.cuda.empty_cache()   # then hand PyTorch's cached blocks back
# NOTE(review): objects still referenced elsewhere (for example by the
# notebook's output history or the traceback of a previously failed cell) are
# not freed by this; restart the kernel if GPU memory stays high.

# ✅ Load model from checkpoint
model = BlipForQuestionAnswering.from_pretrained(CHECKPOINT_DIR)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train(resume_from_checkpoint=CHECKPOINT_DIR)

There were missing keys in the checkpoint model loaded: ['text_decoder.cls.predictions.decoder.bias'].


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 13.91 GiB is allocated by PyTorch, and 603.39 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [34]:
# ========================
# 6. Save the fine-tuned model
# ========================
# Write the model first and then the processor into the same directory.
save_dir = "./blip-vqa-okvqa"
for artifact in (model, processor):
    artifact.save_pretrained(save_dir)
print("✅ Model saved to ./blip-vqa-okvqa")

✅ Model saved to ./blip-vqa-okvqa
