In [1]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Single source of truth for the checkpoint so the model and the processor
# can never be loaded from two different ids by accident.
MODEL_ID = "Salesforce/blip2-opt-2.7b"

# Configure 8-bit quantization to reduce memory usage.
# NOTE: the earlier `bnb_8bit_compute_dtype` keyword is not a documented
# BitsAndBytesConfig option (only the 4-bit path exposes a compute dtype via
# `bnb_4bit_compute_dtype`), so it has been dropped rather than left in as a
# setting that looks meaningful but is not.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load model with quantization and automatic device mapping
# (device_map="auto" lets the loader decide where the weights live).
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
)

processor = AutoProcessor.from_pretrained(MODEL_ID)

Using device: cuda




Loading weights:   0%|          | 0/1247 [00:00<?, ?it/s]

The image processor of type `BlipImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


In [2]:
import zipfile
import os
import shutil
from PIL import Image
from pathlib import Path

# Define paths
zip_file_path = "uic.zip"
extract_to_path = "data/uic"
dataset_base = "data/uic/UIC(underwater image captioning dataset)"
image_dir = os.path.join(dataset_base, "uic_224x224_image")
captions_file = os.path.join(dataset_base, "UIC-captions.txt")

if os.path.exists(zip_file_path):
    # Only wipe the previous extraction when the archive is actually available
    # to replace it; otherwise the only copy of the data would be destroyed.
    if os.path.exists(extract_to_path):
        print(f"Removing existing {extract_to_path}...")
        shutil.rmtree(extract_to_path)

    os.makedirs(extract_to_path, exist_ok=True)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_path)
    print(f"Successfully extracted {zip_file_path} to {extract_to_path}")
elif os.path.isdir(extract_to_path):
    # No archive, but a previous extraction is still on disk: reuse it.
    print(f"{zip_file_path} not found; reusing existing data in {extract_to_path}")
else:
    # Fail here with an actionable message instead of later, when the
    # captions file cannot be opened.
    raise FileNotFoundError(
        f"Neither {zip_file_path} nor an extracted dataset at {extract_to_path} was found"
    )

# Load captions from UIC-captions.txt
def load_captions(captions_path):
    """Parse a UIC captions file into a mapping of image filename -> captions.

    Each non-empty line is expected to look like ``<image>#<index> <caption>``.
    The identifier and the caption may be separated by any run of whitespace
    (a space or a tab). Lines that contain no caption text are skipped.

    Args:
        captions_path: Path to the UTF-8 encoded captions text file.

    Returns:
        dict[str, list[str]]: For every image filename (the part of the
        identifier before ``#``), its captions in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    image_captions = {}

    with open(captions_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # split(None, 1) separates on the first run of *any* whitespace, so
            # tab-separated "image#idx<TAB>caption" files parse as well.
            parts = raw_line.strip().split(None, 1)
            if len(parts) != 2:
                # Blank line, or an identifier with no caption text.
                continue

            img_id, caption = parts
            # Extract image filename (before the #)
            img_filename = img_id.split('#')[0]
            image_captions.setdefault(img_filename, []).append(caption)

    return image_captions

# Parse the captions and list the .jpg files in sorted order
captions_dict = load_captions(captions_file)
image_paths = sorted(name for name in os.listdir(image_dir) if name.endswith('.jpg'))

# One record per image; images without any caption are left out
dataset = [
    {
        'image_path': os.path.join(image_dir, name),
        'image_filename': name,
        'captions': captions_dict[name],
    }
    for name in image_paths
    if captions_dict.get(name)
]

# Short summary plus the first record as a sanity check
first_entry = dataset[0]
print(f"Loaded {len(dataset)} images with captions")
print("Example entry:")
print(f"  Image: {first_entry['image_filename']}")
print(f"  Number of captions: {len(first_entry['captions'])}")
print(f"  First caption: {first_entry['captions'][0]}")

Removing existing data/uic...
Successfully extracted uic.zip to data/uic
Loaded 3176 images with captions
Example entry:
  Image: uic_img_1.jpg
  Number of captions: 5
  First caption: A dark brown turtle paddles through the water with its limbs .


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from PIL import Image
import random

# Create PyTorch Dataset class
class UICDataset(Dataset):
    """Map-style dataset that turns UIC records into model-ready tensors.

    Args:
        data: Sequence of dicts with at least ``'image_path'`` (path to an
            image file) and ``'captions'`` (non-empty list of strings).
        processor: Callable invoked with ``images``, ``text``, ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors``; it must
            return a mapping of batched tensors that includes ``input_ids``.
        max_length: Fixed length every caption is padded/truncated to.
    """

    # Label value used to mark positions that should not contribute to the
    # language-modeling loss.
    IGNORE_INDEX = -100

    def __init__(self, data, processor, max_length=128):
        self.data = data
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the processed tensors (plus ``labels``) for record ``idx``."""
        item = self.data[idx]
        image = Image.open(item['image_path']).convert('RGB')
        # Use the first caption for training
        caption = item['captions'][0]

        # Process image and text
        encoding = self.processor(
            images=image,
            text=caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Remove the batch dimension the processor adds for a single sample
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}

        # Labels mirror the input ids, but padded positions are masked out.
        # Every sample is padded to max_length, so without the mask the loss
        # would also be computed on the padding tokens.
        labels = encoding["input_ids"].clone()
        attention_mask = encoding.get("attention_mask")
        if attention_mask is not None:
            labels = labels.masked_fill(attention_mask == 0, self.IGNORE_INDEX)
        encoding["labels"] = labels

        return encoding

# Split dataset into train and validation (90/10 split).
# A copy is shuffled with a private, seeded RNG so the notebook-level `dataset`
# list is never mutated: re-running this cell always yields the same split
# instead of re-shuffling an already shuffled list.
shuffled_dataset = list(dataset)
random.Random(42).shuffle(shuffled_dataset)
split_idx = int(0.9 * len(shuffled_dataset))
train_dataset = shuffled_dataset[:split_idx]
eval_dataset = shuffled_dataset[split_idx:]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

# Create dataset objects
train_data = UICDataset(train_dataset, processor)
eval_data = UICDataset(eval_dataset, processor)

# Configure LoRA for PEFT.
# No `task_type` is set: the checkpoint's language model is OPT (decoder-only),
# so the SEQ_2_SEQ_LM wrapper (which forwards encoder-decoder arguments) does
# not describe this model. The generic PEFT wrapper is used instead.
lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=32,  # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention query/value projections
    lora_dropout=0.05,
    bias="none",
)

# Apply PEFT to model.
# The base model was loaded 8-bit quantized with device_map="auto", so it is
# already placed on its device(s); it is deliberately not moved with `.to()`.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
print(f"PEFT model on device: {next(peft_model.parameters()).device}")

# Training arguments
training_args = TrainingArguments(
    output_dir="./blip2-uic-peft",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-4,
    warmup_steps=100,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=False,
    # Keep every key the dataset returns (e.g. pixel_values) in the batch.
    remove_unused_columns=False,
    # Name the label column explicitly so evaluation reports `eval_loss`
    # (needed by metric_for_best_model) without relying on the Trainer
    # inferring it from the wrapped model's forward signature.
    label_names=["labels"],
)

# Create trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

# Start training
print("Starting PEFT fine-tuning...")
trainer.train()

# Save the fine-tuned adapter together with the processor
peft_model.save_pretrained("./blip2-uic-finetuned")

processor.save_pretrained("./blip2-uic-finetuned")
print("Fine-tuned model saved to ./blip2-uic-finetuned")

Training samples: 2858
Validation samples: 318
trainable params: 5,242,880 || all params: 3,750,004,736 || trainable%: 0.1398
PEFT model on device: cuda:0
Starting PEFT fine-tuning...




Step,Training Loss,Validation Loss


In [None]:
# Test the fine-tuned model on a sample image
from peft import PeftModel
from PIL import Image

# Load a test image
test_sample = eval_dataset[0]
test_image = Image.open(test_sample['image_path']).convert('RGB')

# Prepare image for inference (image only, no text prompt)
inputs = processor(images=test_image, return_tensors="pt")

# Move inputs to the same device as the model. A dedicated name is used so the
# notebook-wide `device` variable from the first cell is not overwritten.
model_device = next(peft_model.parameters()).device
inputs = {k: v.to(model_device) for k, v in inputs.items()}

# The model may still be in training mode after fine-tuning; switch to eval
# mode so dropout (including LoRA dropout) is disabled during generation.
peft_model.eval()

# Generate caption
with torch.no_grad():
    generated_ids = peft_model.generate(**inputs, max_length=128)

# strip() removes leading/trailing whitespace from the decoded text
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

print("Test Image:", test_sample['image_filename'])
print("Generated Caption:", generated_caption)
print("\nGround Truth Captions:")
for i, caption in enumerate(test_sample['captions'], 1):
    print(f"  {i}. {caption}")

: 

: 