The goal of this project is to fine-tune an existing image captioning model so that it generates captions in Brazilian Portuguese. To do that, the Flickr8k dataset is used and the following steps are performed:

1. Translate captions using a translation model
2. Fine-tune an existing image captioning model
3. Evaluate model performance

# 1. Importing Libraries

Install the required libraries that are not available by default on Kaggle

In [None]:
!pip install evaluate
!pip install rouge_score

In [None]:
import pandas as pd
import numpy as np
from PIL import Image
import torch
from datasets import load_dataset, Dataset
from transformers import pipeline
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    TrainingArguments,
    Trainer,
)
import evaluate  # Hugging Face Evaluate library
import nltk
from torch.utils.data import RandomSampler
import warnings

# Fetch the NLTK "punkt" tokenizer data
# (presumably needed by the METEOR metric used later - TODO confirm)
nltk.download('punkt')
# Silence every Python warning for the rest of the session (keeps cell output clean,
# but also hides deprecation notices)
warnings.filterwarnings("ignore")

# 2. Step A - Translating captions to Brazilian Portuguese

In [None]:
# Load the captions table. pandas parses the file as CSV; later cells read
# its "image" and "caption" columns.
captions_file = "/kaggle/input/flickr8k/captions.txt"
#captions_file = "/kaggle/input/flickr8k-portuguese-captions/captions_pt.csv" # uncomment if captions were already translated
captions_df = pd.read_csv(captions_file)

# Preview the first rows
captions_df.head()

In [None]:
# Set up the translation pipeline, on the GPU when one is available
# (the pipeline runs on CPU unless a device is given).
translator = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-tc-big-en-pt",
    device=0 if torch.cuda.is_available() else -1,
)

# This checkpoint serves two Portuguese variants and, per its model card,
# expects a sentence-initial target-language token: ">>pob<<" (Brazilian) or
# ">>por<<" (European). Without it the variant is left to the model.
TARGET_LANG_TOKEN = ">>pob<<"
source_texts = [f"{TARGET_LANG_TOKEN} {text}" for text in captions_df["caption"]]

# Translate in batches: one pipeline call over the whole list instead of one
# call per row, which lets the pipeline batch the forward passes.
translations = translator(source_texts, batch_size=32)
captions_df["caption_pt"] = [item["translation_text"] for item in translations]

# Save the table (original columns plus "caption_pt") for the following steps
captions_df.to_csv("/kaggle/working/captions_pt.csv", index=False)

In [None]:
# Reload the translated captions CSV from the working directory and preview
# the first rows to check the translated content
captions_pt_df = pd.read_csv("/kaggle/working/captions_pt.csv")
#captions_pt_df = captions_df # uncomment if captions already translated
captions_pt_df.head()

# 3. Step B - Fine-Tuning Image Captioning Model

First, generate a `Dataset`

In [None]:
# Translated captions; the table must provide the columns: image, caption_pt
captions = pd.read_csv("/kaggle/working/captions_pt.csv")
#captions = captions_df # uncomment if captions already translated


def _to_image_path(file_name):
    """Return the full path of a Flickr8k image given its file name."""
    return f"/kaggle/input/flickr8k/Images/{file_name}"


# Replace each image file name with the full path of the image file
captions["image"] = captions["image"].apply(_to_image_path)

In [None]:
# generate dataset

# helper function
def preprocess(example):
    """Attach the decoded RGB image to a dataset row.

    Opens the file whose path is stored under ``example["image"]``, converts
    it to RGB and stores the resulting PIL image under
    ``example["pil_image"]``. The same (mutated) row is returned.
    """
    image_path = example["image"]
    rgb_image = Image.open(image_path).convert("RGB")
    example["pil_image"] = rgb_image
    return example

# Turn the captions table into a Dataset and run `preprocess` on every row,
# dropping the "image" path column once the decoded image has been attached.
# NOTE(review): this decodes every image up front; with a large table that is
# likely slow and storage-heavy - consider lazy loading if it becomes a problem.
dataset = Dataset.from_pandas(captions).map(preprocess, remove_columns=["image"])

Second, load the model to be fine-tuned

In [None]:
# Processor and model are both loaded from the same pretrained BLIP
# image-captioning checkpoint
base_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(base_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(base_checkpoint)

Third, prepare and run the training

In [None]:
# collation function
def collate_fn(batch):
    """Collate dataset rows into one batch of BLIP training inputs.

    Args:
        batch: list of rows, each providing a PIL image under ``"pil_image"``
            and its caption text under ``"caption_pt"``.

    Returns:
        dict with the tensors ``pixel_values``, ``input_ids``,
        ``attention_mask`` and ``labels``. ``labels`` is a copy of
        ``input_ids`` in which padding positions are replaced by -100.
    """
    images = [item["pil_image"] for item in batch]
    texts = [item["caption_pt"] for item in batch]

    # Process images and captions together; captions are padded to the
    # longest one in the batch.
    inputs = processor(images, texts, return_tensors="pt", padding="longest", truncation=True)

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Labels mirror the input ids (autoregressive training), but padding
    # positions are set to -100, the index the cross-entropy loss ignores.
    # Otherwise the model is also trained to predict padding tokens.
    # masked_fill returns a new tensor, so input_ids is left untouched.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    return {
        "pixel_values": inputs["pixel_values"],
        "input_ids": input_ids,
        # Lets the text decoder skip padded positions in attention
        "attention_mask": attention_mask,
        "labels": labels,
    }


In [None]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # optimisation
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # keep the dataset columns that collate_fn reads ("pil_image", "caption_pt")
    remove_unused_columns=False,
    # checkpointing: save every 100 steps, keep at most 2 checkpoints
    output_dir="./results",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # no evaluation during training
    eval_strategy="no",
    # logging: every 10 steps, no external reporting integration
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# Wire the model, the data and the custom collator into the Trainer
trainer = Trainer(
    data_collator=collate_fn,
    train_dataset=dataset,
    args=training_args,
    model=model,
)

# Run the fine-tuning
print("Start training...")
trainer.train()
print("Training completed.")

In [None]:
# Write the fine-tuned model and then its processor to the same directory,
# so both can later be reloaded from that single path
for artifact in (model, processor):
    artifact.save_pretrained('./results/fine_tuned_blip_pt')

# 4. Testing Fine-Tuned Model

In [None]:
# Reload the processor and the model from the fine-tuned output directory
fine_tuned_dir = './results/fine_tuned_blip_pt'
processor = BlipProcessor.from_pretrained(fine_tuned_dir)
model = BlipForConditionalGeneration.from_pretrained(fine_tuned_dir)

# Caption one sample image in pt-BR
sample_image_path = "/kaggle/input/flickr8k/Images/1020651753_06077ec457.jpg"
img = Image.open(sample_image_path).convert("RGB")
inputs = processor(img, return_tensors="pt")

# Inference only: no gradients needed
with torch.no_grad():
    ids = model.generate(**inputs, max_new_tokens=30)

# Decode the first (only) generated sequence, dropping special tokens
caption = processor.decode(ids[0], skip_special_tokens=True)
print(caption)

In [None]:
# Display the sample image (as the cell's last expression) so the generated
# caption can be checked against it visually
img

# 5. Evaluate using BLEU score

In [None]:
# Locations of the fine-tuned model and of the translated captions
model_path = "./results/fine_tuned_blip_pt"
captions_path = "/kaggle/input/flickr8k-portuguese-captions/captions_pt.csv"

# Load the processor and the model, placing the model on the GPU when available
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained(model_path)
model = BlipForConditionalGeneration.from_pretrained(model_path).to(device)

# Load the captions table and turn image file names into full paths
df = pd.read_csv(captions_path)
df["image"] = df["image"].apply(
    lambda file_name: f"/kaggle/input/flickr8k/Images/{file_name}"  # adjust if needed
)
dataset = Dataset.from_pandas(df)

# function to generate caption
def generate_caption(image_path):
    """Generate a caption for the image stored at ``image_path``.

    The image is opened and converted to RGB, run through the module-level
    ``processor`` and ``model`` (inputs are moved to the model's device), and
    the first generated sequence is decoded without special tokens.

    Returns:
        The decoded caption string.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = processor(rgb_image, return_tensors="pt").to(model.device)

    # Inference only: no gradients needed
    with torch.no_grad():
        generated_ids = model.generate(**model_inputs, max_new_tokens=30)

    first_sequence = generated_ids[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

# Prepare the data for evaluation: one entry per evaluated caption row.
# `references` holds a single-item list per row (the format BLEU is given
# below); `predictions` holds the matching generated caption.
references = []
predictions = []

# The captions table can list the same image on several rows (Flickr8k
# provides multiple captions per image - TODO confirm for this file), so each
# image is captioned once and the result is reused for its other rows.
generated_by_image = {}

# Use a few samples only; never ask for more rows than the table contains.
num_eval_samples = min(100, len(dataset))

# NOTE(review): no held-out split is visible in this notebook, so these rows
# were probably seen during fine-tuning - confirm, and prefer held-out images
# when reporting scores.
for example in dataset.select(range(num_eval_samples)):
    gold_caption = example["caption_pt"]
    image_path = example["image"]

    if image_path not in generated_by_image:
        generated_by_image[image_path] = generate_caption(image_path)
    pred_caption = generated_by_image[image_path]

    # Compare case-insensitively
    references.append([gold_caption.lower()])
    predictions.append(pred_caption.lower())

# Compute the BLEU score
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)

print(f"BLEU score: {results['bleu']:.4f}")


# 6. Evaluate with other metrics

In [None]:
# Load the two additional metrics
meteor = evaluate.load("meteor")
rouge = evaluate.load("rouge")

# ROUGE is given one plain reference string per prediction, so take the first
# entry of each per-prediction reference list
rouge_references = [refs[0] for refs in references]

meteor_score = meteor.compute(predictions=predictions, references=references)
rouge_score = rouge.compute(predictions=predictions, references=rouge_references)

print(f"METEOR score: {meteor_score['meteor']:.4f}")
print(f"ROUGE-L score: {rouge_score['rougeL']:.4f}")

# 7. Exporting the fine-tuned model to be used in the app

In [None]:
# Recursively pack the fine-tuned model directory into mymodel.zip
!zip -r mymodel.zip ./results/fine_tuned_blip_pt