In [4]:
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

# The processor and the model are loaded from the same pretrained checkpoint
# identifier, so spell it once and reuse it for both calls.
blip2_checkpoint = "Salesforce/blip2-opt-2.7b"
processor = AutoProcessor.from_pretrained(blip2_checkpoint)
model = AutoModelForImageTextToText.from_pretrained(blip2_checkpoint)

Loading weights:   0%|          | 0/1247 [00:00<?, ?it/s]

In [None]:
import zipfile
import os
from PIL import Image
from pathlib import Path

# Where the archive lives and where its contents end up once unpacked
zip_file_path = "uic.zip"
extract_to_path = "data/uic"
dataset_base = "data/uic/UIC(underwater image captioning dataset)"

# Both dataset artefacts sit directly under the dataset base folder
image_dir, captions_file = (
    os.path.join(dataset_base, leaf)
    for leaf in ("uic_224x224_image", "UIC-captions.txt")
)

# Unpack only when the archive is present and the dataset folder is not there yet,
# so re-running the cell does not extract a second time.
needs_extraction = os.path.exists(zip_file_path) and not os.path.exists(dataset_base)
if needs_extraction:
    os.makedirs(extract_to_path, exist_ok=True)
    with zipfile.ZipFile(zip_file_path, mode='r') as archive:
        archive.extractall(path=extract_to_path)
    print(f"Successfully extracted {zip_file_path} to {extract_to_path}")

# Load captions from UIC-captions.txt
def load_captions(captions_path):
    """Parse a UIC captions file into a mapping of image filename -> captions.

    Each non-blank line is expected to look like ``<image>#<index> <caption>``.
    The identifier and the caption may be separated by any run of whitespace
    (one or more spaces or tabs), so tab-delimited caption files parse the same
    way as space-delimited ones. Lines that hold an identifier but no caption
    text are skipped.

    Args:
        captions_path: Path to the captions text file (read as UTF-8).

    Returns:
        dict mapping each image filename (the part of the identifier before
        the first ``#``) to the list of its captions, in file order.

    Raises:
        OSError: If the captions file cannot be opened.
    """
    image_captions = {}

    with open(captions_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() drops the newline and surrounding blanks; split(None, 1)
            # then cuts at the first whitespace run, whatever its width or kind.
            parts = line.strip().split(None, 1)
            if len(parts) != 2:
                # Blank line, or an identifier with no caption text after it
                continue

            img_id, caption = parts
            # Identifier is "<filename>#<caption index>"; keep only the filename
            img_filename = img_id.split('#')[0]
            image_captions.setdefault(img_filename, []).append(caption)

    return image_captions

# Load the dataset: parse the captions file and list the .jpg files in sorted order
captions_dict = load_captions(captions_file)
image_paths = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))

# Create dataset as a list of dicts, one per image that has at least one caption
dataset = []
for img_filename in image_paths:
    captions = captions_dict.get(img_filename, [])
    if not captions:
        # Image file present on disk but absent from the captions file
        continue
    img_path = os.path.join(image_dir, img_filename)
    dataset.append({
        'image_path': img_path,
        'image_filename': img_filename,
        'captions': captions
    })

print(f"Loaded {len(dataset)} images with captions")

# Fail with an actionable message instead of a bare IndexError on dataset[0]
# when nothing in the image directory matched the captions file.
if not dataset:
    raise RuntimeError(
        f"No captioned images found: no .jpg file in {image_dir!r} "
        f"matches an entry in {captions_file!r}"
    )

example = dataset[0]
print("Example entry:")
print(f"  Image: {example['image_filename']}")
print(f"  Number of captions: {len(example['captions'])}")
print(f"  First caption: {example['captions'][0]}")

Successfully extracted uic.zip to data/uic
Loaded 3176 images with captions
Example entry:
  Image: uic_img_1.jpg
  Number of captions: 5
  First caption: A dark brown turtle paddles through the water with its limbs .
