In [None]:
# Fetch the Flickr8k dataset from Kaggle via the opendatasets helper.
# NOTE(review): later cells read from /content/flickr8k, so this presumably
# runs with /content as the working directory — confirm.
import opendatasets as od
od.download("https://www.kaggle.com/datasets/adityajn105/flickr8k")


In [None]:
import os
import cv2
import matplotlib.pyplot as plt
from collections import defaultdict

# Locations of the caption listing and the image folder inside the dataset.
captions_path = '/content/flickr8k/captions.txt'
images_dir = '/content/flickr8k/Images'

# Read every line of the captions file. The encoding is given explicitly so
# the decoded text does not depend on the platform's default locale.
with open(captions_path, 'r', encoding='utf-8') as f:
    captions = f.readlines()

# Drop the first line, which is the header rather than a caption record.
captions = captions[1:]


In [None]:
import os
import json


# Keep exactly one record per image: the first caption encountered for it.
image_caption_map = {}

for raw_line in captions:
    record = raw_line.strip()
    if record:
        # Split only on the first comma; the caption itself may contain commas.
        name_field, caption_text = record.split(',', 1)
        # Discard any "#0", "#1", ... suffix so all captions of an image share a key.
        image_key = name_field.split('#')[0].strip()
        if image_key in image_caption_map:
            continue
        image_caption_map[image_key] = {
            "image": os.path.join(images_dir, image_key),
            "caption": caption_text.strip(),
        }

# Flatten the mapping into a list of {"image", "caption"} records.
final_data = list(image_caption_map.values())

# Preview the first few records.
for preview in final_data[:5]:
    print(preview)


In [None]:
# Report how many image-caption records were kept (one per distinct image).
record_count = len(final_data)
print(record_count)

In [None]:
# Mount Google Drive at /content/drive; the training cell writes its
# checkpoints and final artifacts under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import matplotlib.pyplot as plt
import cv2

def display_image_with_caption(data_entry):
    """Show the image referenced by ``data_entry`` with its caption as title.

    Args:
        data_entry: Mapping with an ``"image"`` key (path of the image file)
            and a ``"caption"`` key (text used as the figure title).

    Returns:
        None. When OpenCV cannot read the file, a message is printed and no
        figure is drawn.
    """
    path_on_disk, title_text = data_entry["image"], data_entry["caption"]

    bgr_pixels = cv2.imread(path_on_disk)
    if bgr_pixels is not None:
        # OpenCV yields BGR channel order; convert to RGB before plotting.
        rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)

        plt.figure(figsize=(8, 6))
        plt.imshow(rgb_pixels)
        plt.title(title_text, fontsize=12)
        plt.axis('off')
        plt.show()
    else:
        print(f"Couldn't load image from path: {path_on_disk}")

In [None]:
from transformers import pipeline

# Build an image-to-text pipeline backed by the pretrained BLIP base
# captioning checkpoint.
pipe = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)


In [None]:
# Caption one sample image with the pretrained pipeline.
sample_entry = final_data[1598]
result = pipe(sample_entry['image'])

# Show the text of the first generated caption.
first_prediction = result[0]
print(first_prediction['generated_text'])

In [None]:
# Show the sample image at index 1598 together with its reference caption.
sample_entry = final_data[1598]
display_image_with_caption(sample_entry)

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer

# Load the processor and model for image captioning using BLIP
# NOTE(review): the following cell loads the same checkpoint into the same
# `processor` and `model` names again, so this cell looks redundant — confirm
# before removing.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer
from PIL import Image
from torch.utils.data import Dataset
import torch
import os
import random

# Disable Weights and Biases logging
# (set through the environment before any Trainer is constructed below)
os.environ["WANDB_DISABLED"] = "true"

# Load the processor and base BLIP model for image captioning.
# Both come from the same checkpoint name so the text/image preprocessing
# matches the weights being fine-tuned.
model_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)

# Custom dataset class for image-caption pairs
class CaptionDataset(Dataset):
    """Map-style dataset yielding processor-encoded image/caption pairs.

    Each element of ``data`` is a mapping with an ``"image"`` key (path to an
    image file) and a ``"caption"`` key (the reference caption text).
    """

    # Label value skipped by PyTorch's cross-entropy loss (its default
    # ``ignore_index``); used to exclude padding positions from the loss.
    IGNORE_INDEX = -100

    def __init__(self, data, processor, max_length=128):
        """Store the records and the processor used to encode them.

        Args:
            data: Sequence of ``{"image": path, "caption": text}`` records.
            processor: Callable accepting ``images``, ``text``,
                ``return_tensors``, ``padding``, ``truncation`` and
                ``max_length`` keyword arguments and returning a mapping with
                ``pixel_values``, ``input_ids`` and ``attention_mask`` tensors.
            max_length: Length every caption is padded/truncated to.
        """
        self.data = data
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of image/caption records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx`` into model-ready tensors.

        Returns:
            Dict with ``pixel_values``, ``input_ids``, ``attention_mask`` and
            ``labels``. ``labels`` equals ``input_ids`` except that padded
            positions (``attention_mask == 0``) are replaced by
            ``IGNORE_INDEX`` so they do not contribute to the training loss.
        """
        item = self.data[idx]
        # Use a context manager so the underlying file handle is released as
        # soon as the pixels have been converted, instead of lingering until
        # garbage collection.
        with Image.open(item["image"]) as img:
            image = img.convert("RGB")
        caption = item["caption"]

        # Preprocess the image and caption using the processor
        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

        # Remove only the leading batch axis; a bare squeeze() would also drop
        # any other axis that happens to have size 1.
        pixel_values = inputs["pixel_values"].squeeze(0)
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # Targets mirror the caption tokens, with padding masked out of the loss.
        labels = input_ids.masked_fill(attention_mask == 0, self.IGNORE_INDEX)

        return {
            "pixel_values": pixel_values,      # Image tensor
            "input_ids": input_ids,            # Tokenized caption
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Draw a random subset of up to 500 items from final_data for training.
# A dedicated, seeded generator makes the subset identical across kernel
# restarts without touching the global `random` state, and the size is capped
# so a dataset with fewer than 500 entries does not raise ValueError.
sample_size = min(500, len(final_data))
sampled_data = random.Random(42).sample(final_data, sample_size)

# Create the dataset using the sampled data
train_dataset = CaptionDataset(sampled_data, processor)


In [None]:
from transformers import TrainingArguments, Trainer, BlipProcessor, BlipForConditionalGeneration
from transformers import TFAutoModel
import os
import matplotlib.pyplot as plt  # Needed for plotting

# Imported here so this cell does not depend on an earlier cell's import.
import torch

# 1) Define training configuration
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/DEPI_Project/blip-finetuned",  # Directory to save model checkpoints
    per_device_train_batch_size=4,     # Batch size per GPU/CPU
    num_train_epochs=1,                # Total number of training epochs
    save_steps=125,                    # Save model every 125 steps
    save_total_limit=2,                # Keep only the last 2 checkpoints
    logging_dir='./logs',              # Directory for logs
    logging_steps=10,                  # Log training loss every 10 steps
    remove_unused_columns=False,       # Needed when using image inputs
    fp16=torch.cuda.is_available(),    # Mixed precision only when a CUDA GPU is present
    run_name="blip-captioning",        # Name of the training run
    report_to="none",                  # Disable logging to external services (like WandB)
)

# 2) Initialize Trainer
trainer = Trainer(
    model=model,                # The BLIP model loaded earlier
    args=training_args,         # Training arguments defined above
    train_dataset=train_dataset, # The dataset to train on
)

# 3) Start training and save training metrics
train_result = trainer.train()
trainer.save_metrics("train", train_result.metrics)

# Collect (step, loss) pairs from the trainer's log history; entries lacking
# either key are skipped.
logs = trainer.state.log_history
loss_records = [rec for rec in logs if "loss" in rec and "step" in rec]
steps = [rec["step"] for rec in loss_records]
losses = [rec["loss"] for rec in loss_records]

# Draw the loss curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(steps, losses, marker='o')
ax.set_title("Training Loss over Steps")
ax.set_xlabel("Step")
ax.set_ylabel("Loss")
ax.grid(True)
plt.show()

# The TensorFlow counterpart of the captioning architecture that was trained.
# TFAutoModel resolves BLIP to the base model class rather than the
# conditional-generation one, so the explicit class is used for conversion.
from transformers import TFBlipForConditionalGeneration

# 4) Delete .safetensors files if they exist (to avoid conflicts)
model_path = "/content/drive/MyDrive/DEPI_Project/blip-finetuned"
# Make sure the target directory exists before listing it.
os.makedirs(model_path, exist_ok=True)
for file in os.listdir(model_path):
    if file.endswith(".safetensors"):
        os.remove(os.path.join(model_path, file))

# 5) Save the model in PyTorch format
#    (safe_serialization=False writes the PyTorch weights file instead of
#    .safetensors, which the from_pt conversion below reads)
model.save_pretrained(model_path, safe_serialization=False)

# 6) Save the processor (tokenizer + image processor)
processor.save_pretrained(model_path)

# 7) Convert PyTorch model to TensorFlow format
tf_model = TFBlipForConditionalGeneration.from_pretrained(
    model_path,    # Path to the saved PyTorch model
    from_pt=True   # Convert from PyTorch to TensorFlow
)
# NOTE(review): `save_format` is kept from the original call; confirm that
# save_pretrained actually honours it.
tf_model.save_pretrained(model_path, save_format="h5")

# 8) Confirm saved files
print(f"✅ Artifacts saved to {model_path}:")
for file in os.listdir(model_path):
    if file.endswith((".json", ".bin", ".h5", ".txt")):
        print(f"- {file}")
