<h1 style="text-align: center;">Fine-Tuning BLIP Using HuggingFace Transformers</h1>

### Import Necessary Libraries

In [1]:
import torch
import os
import matplotlib.pyplot as plt
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from datasets import Image as HFImage
from textwrap import wrap

### Load Dataset

In [None]:
# Build the dataset from the CSV metadata file: select the "train" split and
# cast its "image" column to the HuggingFace Image feature.
dataset = (
    load_dataset("csv", data_files=r"Dataset/metadata.csv")["train"]
    .cast_column("image", HFImage())
)

### Dataset Snippet

In [None]:
# Preview the first record: print its caption, then leave the image as the
# last expression so the notebook renders it as the cell output.
first_example = dataset[0]
print(first_example["text"])
first_example["image"]

### Create PyTorch Dataset

In [6]:
class ImageCaptioningDataset(Dataset):
    """PyTorch dataset that turns image/caption records into processor encodings.

    Args:
        dataset: Indexable, sized collection whose items expose an ``"image"``
            and a ``"text"`` entry.
        processor: Callable invoked as
            ``processor(images=..., text=..., padding="max_length", return_tensors="pt")``.
            It is expected to return a mapping of tensors that carry a leading
            batch dimension of size 1.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return its tensors without the batch dimension."""
        item = self.dataset[idx]
        encoding = self.processor(
            images=item["image"],
            text=item["text"],
            padding="max_length",
            return_tensors="pt",
        )
        # Drop only the leading batch dimension. A bare ``squeeze()`` would also
        # collapse any other size-1 dimension and silently change the tensor
        # rank, which breaks batching in the DataLoader.
        return {key: value.squeeze(0) for key, value in encoding.items()}

### Load Model And Processor

In [None]:
# Load the pretrained BLIP captioning processor and model from HuggingFace.
BASE_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Use the GPU only when one is available so this cell also runs on CPU-only
# machines instead of failing on a hard-coded 'cuda'.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(BASE_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BASE_CHECKPOINT).to(device)
print(f"Model Parameters: {sum(p.numel() for p in model.parameters()):,}")

In [8]:
# Wrap the loaded dataset for PyTorch and iterate over it in shuffled
# batches of two.
train_dataset = ImageCaptioningDataset(dataset=dataset, processor=processor)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

### Train The Model

In [None]:
# Training hyper-parameters.
NUM_EPOCHS = 5
LEARNING_RATE = 5e-5

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
model.train()

for epoch in range(NUM_EPOCHS):
    print("Epoch:", epoch)
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device)

        # The dataset pads every caption to the maximum length. Pass the
        # attention mask so the decoder does not attend to padding, and set
        # the padded label positions to -100 so the loss ignores them.
        attention_mask = batch.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)
            labels = input_ids.masked_fill(attention_mask == 0, -100)
        else:
            # No mask available: fall back to using the input ids as labels.
            labels = input_ids

        outputs = model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        print("Loss:", loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

### Saving The Fine-tuned Model

In [None]:
# Persist the fine-tuned model first, then the processor that goes with it.
# The inference cell loads from these exact locations, so keep them in sync.
model_output_path = "model-weights-base-finetuned.pth"
processor_output_path = "processor-config-base-finetuned.json"

model.save_pretrained(model_output_path)
processor.save_pretrained(processor_output_path)

## Inference

In [None]:
# Path to the folder containing test images for inference. Built with
# os.path.join so it resolves on POSIX systems too; a literal backslash
# separator only works on Windows.
input_folder = os.path.join("Dataset", "test")

# File extensions (with the dot) that are treated as images.
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
# Image width in pixels per character of caption wrap width.
PIXELS_PER_WRAP_CHAR = 60

# Use the GPU only when one is available instead of a hard-coded 'cuda'.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load base model from Huggingface
base_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
base_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load the fine-tuned model locally
processor = BlipProcessor.from_pretrained("processor-config-base-finetuned.json")
model = BlipForConditionalGeneration.from_pretrained("model-weights-base-finetuned.pth").to(device)


def _generate_caption(caption_model, caption_processor, image, target_device):
    """Generate a caption for ``image`` and return it as cleaned-up text.

    The decoded text has special tokens skipped, every occurrence of
    "arafed" removed, and surrounding whitespace stripped.
    """
    inputs = caption_processor(image, return_tensors="pt").to(target_device)
    out = caption_model.generate(**inputs, max_new_tokens=50)
    decoded = caption_processor.decode(out[0], skip_special_tokens=True)
    return decoded.replace("arafed", "").strip()


def _wrap_caption(caption, image_width):
    """Wrap ``caption`` to a character width derived from the image width.

    The width is clamped to at least 1 because ``textwrap.wrap`` raises
    ``ValueError`` for a width of 0, which happens for images narrower than
    ``PIXELS_PER_WRAP_CHAR`` pixels.
    """
    char_width = max(1, image_width // PIXELS_PER_WRAP_CHAR)
    return "\n".join(wrap(caption, width=char_width))


# Iterate through all image files in the input folder, in a stable order.
for filename in sorted(os.listdir(input_folder)):
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue

    # Open the image and close the underlying file as soon as it is converted.
    # NOTE: `img` stays bound to the last image after the loop; the
    # conditional-captioning cell below reads it.
    image_path = os.path.join(input_folder, filename)
    with Image.open(image_path) as source_image:
        img = source_image.convert("RGB")

    # Caption the same image with the fine-tuned and the base model.
    result = _generate_caption(model, processor, img, device)
    result_base = _generate_caption(base_model, base_processor, img, device)

    # Wrap both captions so they fit above the image.
    image_width, _image_height = img.size
    wrapped_result = _wrap_caption(result, image_width)
    wrapped_result_base = _wrap_caption(result_base, image_width)

    title_text = f"Fine-Tuned Model Result:\n{wrapped_result}\n\nBase Model Result:\n{wrapped_result_base}"

    # Display the image with both captions as the title.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(img)
    ax.set_title(
        title_text,
        fontsize=12,
        fontweight='bold',
        loc='center',
    )
    ax.axis('off')
    plt.show()
    # Release the figure so looping over many images does not accumulate them.
    plt.close(fig)
        

### Conditional Image Captioning

In [None]:
# Conditional captioning: the model starts generating the caption from the
# text input. If no text input is provided, the decoder starts with the [BOS]
# token only.
# NOTE: `img`, `processor` and `model` come from the inference cell above;
# `img` is the last image processed by its loop.

# Use the GPU only when one is available instead of a hard-coded 'cuda'.
device = "cuda" if torch.cuda.is_available() else "cpu"

text = "from"
inputs = processor(img, text, return_tensors="pt").to(device)

out = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(out[0], skip_special_tokens=True).replace("arafed", "").strip())
img