In [1]:
!pip install datasets torch numpy transformers



In [2]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from datasets import load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration

# Custom dataset class
class FootballImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(
            images=item["image"], padding="max_length", return_tensors="pt"
        )
        # Remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]
        return encoding

def collator(batch):
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], padding=True, return_tensors="pt"
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
    return processed_batch

# Load the dataset
dataset = load_dataset("tomytjandra/h-and-m-fashion-caption-12k")

# Split the dataset into train, validation, and test sets
train_size = int(0.8 * len(dataset['train']))
val_size = int(0.1 * len(dataset['train']))
test_size = len(dataset['train']) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset['train'], [train_size, val_size, test_size])

# Initialize the processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Create datasets
train_dataset = FootballImageCaptioningDataset(train_dataset, processor)
val_dataset = FootballImageCaptioningDataset(val_dataset, processor)
test_dataset = FootballImageCaptioningDataset(test_dataset, processor)

# Create DataLoader for each subset
batch_size = 2
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=collator)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size, collate_fn=collator)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size, collate_fn=collator)

# Move model to the appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Training loop with early stopping
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Early stopping parameters
patience = 3
min_delta = 0.001
best_val_loss = float('inf')
epochs_no_improve = 0

# Training loop
for epoch in range(4):
    print(f"Epoch {epoch + 1}/50")

    # Training phase
    model.train()
    running_train_loss = 0.0

    for idx, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        train_loss = outputs.loss
        running_train_loss += train_loss.item()

        # Print the batch training loss
        print(f"Batch {idx + 1}/{len(train_dataloader)} - Training Loss: {train_loss.item():.4f}")

        # Backward pass and optimization
        train_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Calculate average training loss for this epoch
    avg_train_loss = running_train_loss / len(train_dataloader)
    print(f"Average Training Loss for Epoch {epoch + 1}: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    running_val_loss = 0.0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
            val_loss = outputs.loss
            running_val_loss += val_loss.item()

    # Calculate average validation loss for this epoch
    avg_val_loss = running_val_loss / len(val_dataloader)
    print(f"Validation Loss for Epoch {epoch + 1}: {avg_val_loss:.4f}")

    # Early stopping check
    if avg_val_loss < best_val_loss - min_delta:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print("Early stopping triggered. Training stopped to prevent overfitting.")
        break

# After training, you can evaluate on the test set to see the model's performance on unseen data


Epoch 1/50


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Batch 4956/4975 - Training Loss: 0.9339
Batch 4957/4975 - Training Loss: 0.6665
Batch 4958/4975 - Training Loss: 0.4894
Batch 4959/4975 - Training Loss: 1.0001
Batch 4960/4975 - Training Loss: 0.4361
Batch 4961/4975 - Training Loss: 0.6849
Batch 4962/4975 - Training Loss: 0.8695
Batch 4963/4975 - Training Loss: 0.8750
Batch 4964/4975 - Training Loss: 0.6406
Batch 4965/4975 - Training Loss: 0.8629
Batch 4966/4975 - Training Loss: 0.8260
Batch 4967/4975 - Training Loss: 1.0050
Batch 4968/4975 - Training Loss: 0.8100
Batch 4969/4975 - Training Loss: 1.2503
Batch 4970/4975 - Training Loss: 0.2608
Batch 4971/4975 - Training Loss: 0.8673
Batch 4972/4975 - Training Loss: 0.6038
Batch 4973/4975 - Training Loss: 0.5367
Batch 4974/4975 - Training Loss: 0.4726
Batch 4975/4975 - Training Loss: 1.1184
Average Training Loss for Epoch 3: 0.7127
Validation Loss for Epoch 3: 0.8162
Epoch 4/50
Batch 1/4975 - Training Loss: 0.5633
Batch 2/4

In [6]:
# After the training loop (at the end of your training script)
model_save_path = "/content/drive/MyDrive/BLIP Fashion Captioning"  # Specify your save path here

# Save the fine-tuned model
model.save_pretrained(model_save_path)

# Optionally, save the processor if you want to reuse it later
processor.save_pretrained(model_save_path)

print(f"Model and processor saved to {model_save_path}")

Model and processor saved to /content/drive/MyDrive/BLIP Fashion Captioning


In [7]:
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the fine-tuned model and processor
model_path = "/content/drive/MyDrive/BLIP Fashion Captioning"  # Replace with the path where your fine-tuned model is saved
processor = BlipProcessor.from_pretrained(model_path)
model = BlipForConditionalGeneration.from_pretrained(model_path)

# Move the model to the appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def generate_caption(image_path):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")  # Ensure the image is in RGB format

    # Process the image
    encoding = processor(images=image, return_tensors="pt").to(device)

    # Generate caption
    with torch.no_grad():  # Disable gradient calculation
        outputs = model.generate(**encoding)

    # Decode the generated tokens to text
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

# Example usage
image_path = "/content/0211143036.jpg"  # Replace with your image path
caption = generate_caption(image_path)
print(f"Generated Caption: {caption}")




Generated Caption: solid white shirt in a cotton weave with notch lapels and a tie at the waist long sleeves


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
