In [None]:
from google.colab import drive

# Mount Google Drive; a later cell writes the generated captions under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers accelerate torch torchvision bitsandbytes peft



In [None]:
import os
from transformers import BlipProcessor, BlipForConditionalGeneration
from peft import PeftModel
import torch

# Unzip the lora weights
!unzip -o /content/lora.zip -d /content/lora_weights

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP base model and processor
model_id = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_id)
base_model = BlipForConditionalGeneration.from_pretrained(model_id).to(device)

# The adapter files are in a subfolder after unzipping
adapter_path = "/content/lora_weights/blip_lora_best"

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, adapter_path)
model.to(device)
model.eval()

print("BLIP base model with LoRA adapter loaded successfully.")

Archive:  /content/lora.zip
  inflating: /content/lora_weights/blip_lora_best/README.md  
  inflating: /content/lora_weights/blip_lora_best/adapter_config.json  
  inflating: /content/lora_weights/blip_lora_best/adapter_model.safetensors  


Loading weights:   0%|          | 0/473 [00:00<?, ?it/s]

BlipForConditionalGeneration LOAD REPORT from: Salesforce/blip-image-captioning-base
Key                                       | Status     |  | 
------------------------------------------+------------+--+-
text_decoder.bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


BLIP base model with LoRA adapter loaded successfully.


In [None]:
def generate_raw_caption(image, max_new_tokens=60):
    """Generate a caption for one image with the LoRA-adapted BLIP model.

    Relies on the notebook-level ``processor``, ``model`` and ``device``
    created in the model-loading cell.

    Args:
        image: Image to caption, in a form accepted by the BLIP processor
            (the notebook passes RGB PIL images).
        max_new_tokens: Upper bound on the number of tokens to generate.
            Defaults to 60, the value that used to be hard-coded.

    Returns:
        The decoded caption string with special tokens removed.
    """
    # Preprocess the image into model inputs on the target device.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # A single image was passed, so only the first sequence is decoded.
    return processor.decode(output[0], skip_special_tokens=True)

In [None]:
# Sample images photo_0001.jpg .. photo_0007.jpg (looked up under /content/ later).
image_files = [f"photo_{index:04d}.jpg" for index in range(1, 8)]
print(image_files)

['photo_0001.jpg', 'photo_0002.jpg', 'photo_0003.jpg', 'photo_0004.jpg', 'photo_0005.jpg', 'photo_0006.jpg', 'photo_0007.jpg']


In [None]:
from google import genai
from google.genai import types

# 1. Setup Client
# Never paste the key into the notebook: read it from the GEMINI_API_KEY
# environment variable, or ask for it with a non-echoing prompt when unset.
import os
from getpass import getpass

client = genai.Client(
    api_key=os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
)

def rewrite_caption_french_cloud(raw_caption, model_name="gemini-2.5-flash"):
    """Turn a raw caption into a French caption via the Gemini API.

    The French caption-writing rules below are prepended to the prompt,
    followed by an instruction to translate ``raw_caption``. Uses the
    notebook-level ``client``.

    Args:
        raw_caption: Caption text to send to the model (the notebook passes
            the English caption produced by BLIP).
        model_name: Gemini model identifier. Defaults to "gemini-2.5-flash",
            the value that used to be hard-coded.

    Returns:
        The response text with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the API response contains no text.
    """
    # Caption-writing rules (in French) placed at the top of every prompt.
    rules = """
RÈGLES DE GÉNÉRATION DE LÉGENDES

1. Format
- La sortie doit contenir une seule phrase.
- La phrase doit être rédigée au présent de l’indicatif.
- La longueur maximale est de 50 mots.

2. Contenu autorisé
- Décrire uniquement les éléments directement visibles dans l’image.
- Ne pas formuler d’hypothèses, d’interprétations ou de déductions.
- Éviter toute expression d’incertitude (ex. : probablement, semble, pourrait).

3. Structure obligatoire
- La description doit suivre l’ordre suivant :
  sujet principal → action observable → contexte (lieu, objets visibles).
- Cet ordre ne doit pas être modifié.

4. Action
- Décrire uniquement des actions clairement observables.
- En l’absence d’action évidente, utiliser une description statique
  (ex. : est debout, est assis, se tient immobile).

5. Ton et style
- Employer un ton neutre, factuel et objectif.
- Utiliser un vocabulaire simple et descriptif.
- Éviter les adjectifs subjectifs ou évaluatifs.

6. Restrictions
- Ne pas inclure de descriptions graphiques ou choquantes
  (blessures détaillées, sang, violence explicite).
- Ne pas mentionner d’éléments hors champ ou non visibles.
- Ne pas identifier des personnes, marques ou entités spécifiques.


    """

    # NOTE(review): the instruction reads "Traduit" (the imperative would be
    # "Traduis"); left unchanged so the prompt sent to the model is identical.
    prompt = f"{rules}\n\nTraduit cette phrase : {raw_caption}"

    response = client.models.generate_content(
        model=model_name,
        contents=prompt
    )

    # The SDK exposes no text when the response has no text part (for example
    # a blocked or empty candidate); fail with a clear message instead of an
    # AttributeError on None.strip().
    text = response.text
    if text is None:
        raise RuntimeError(
            f"Gemini returned no text for caption: {raw_caption!r}"
        )

    return text.strip()

# Smoke test on a single hand-written caption.
sample_caption = "a man is cooking food on a grill"
print(rewrite_caption_french_cloud(sample_caption))
# The exact wording is up to the model, e.g. "Un homme fait cuire des aliments sur un gril."

Un homme cuisine de la nourriture sur un grill.


In [None]:
from PIL import Image

# Caption every sample image. A failure on one file is reported and the loop
# continues with the next file.
for filename in image_files:
    image_path = f"/content/{filename}"
    try:
        # Close the file handle as soon as the RGB copy has been made.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        raw_caption = generate_raw_caption(image)

        print(f"Image: {filename}")
        print(f"Generated Caption: {raw_caption}\n")
    except Exception as e:
        print(f"Error processing {filename}: {e}")

Image: photo_0001.jpg
Generated Caption: a train on a track near a tree

Image: photo_0002.jpg
Generated Caption: a person on a surfboard in the water

Image: photo_0003.jpg
Generated Caption: a stop sign with stickers on it

Image: photo_0004.jpg
Generated Caption: a man riding a snowboard down a snow covered slope

Image: photo_0005.jpg
Generated Caption: a small toy duck sitting on a wooden bench

Image: photo_0006.jpg
Generated Caption: a man standing on a snow covered ski slope holding a ski pole

Image: photo_0007.jpg
Generated Caption: a group of three people on the snow skis



In [None]:
!mkdir -p /content/coco
!wget -q http://images.cocodataset.org/zips/val2014.zip
!unzip -q val2014.zip -d /content/coco


In [None]:
!wget -q http://images.cocodataset.org/annotations/annotations_trainval2014.zip
!unzip -q annotations_trainval2014.zip -d /content/coco


replace /content/coco/annotations/instances_train2014.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/coco/annotations/instances_val2014.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/coco/annotations/person_keypoints_train2014.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/coco/annotations/person_keypoints_val2014.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/coco/annotations/captions_train2014.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/coco/annotations/captions_val2014.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
import json
from PIL import Image
import os

ANN_PATH = "/content/coco/annotations/captions_val2014.json"
IMG_DIR = "/content/coco/val2014"

# Parse the caption annotation file once.
with open(ANN_PATH, "r") as ann_file:
    coco_data = json.load(ann_file)

# image_id -> list of reference captions, in annotation order.
imgid_to_captions = {}
for ann in coco_data["annotations"]:
    image_id = ann["image_id"]
    if image_id not in imgid_to_captions:
        imgid_to_captions[image_id] = []
    imgid_to_captions[image_id].append(ann["caption"])

# image_id -> file name inside IMG_DIR.
imgid_to_filename = dict(
    (img["id"], img["file_name"]) for img in coco_data["images"]
)

print("Images:", len(imgid_to_filename))
# Caption count of the first image in the mapping, as a sanity check.
first_caption_list = next(iter(imgid_to_captions.values()))
print("Captions per image:", len(first_caption_list))


Images: 40504
Captions per image: 5


In [None]:
from tqdm import tqdm
import torch

# Length budget for generated captions. In this cell it is passed to
# generate() as max_new_tokens, i.e. it bounds tokens there, not words.
MAX_WORDS: int = 50
# One record per evaluated image: image_id, generated caption, reference captions.
results: list = []

def truncate(text, max_words=50):
    """Return ``text`` cut down to its first ``max_words`` whitespace-separated words.

    Runs of whitespace are collapsed because the words are re-joined with
    single spaces.
    """
    words = text.split()
    leading_window = slice(max_words)
    return " ".join(words[leading_window])

# Evaluate on the first 1000 images of the annotation file.
image_ids = list(imgid_to_filename)[:1000]  # start with 1k

for img_id in tqdm(image_ids):
    img_path = os.path.join(IMG_DIR, imgid_to_filename[img_id])
    # Close each file right after conversion so handles do not accumulate
    # over the whole run.
    with Image.open(img_path) as source_image:
        image = source_image.convert("RGB")

    inputs = processor(images=image, return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_WORDS
        )

    caption = processor.decode(
        output_ids[0],
        skip_special_tokens=True
    )

    results.append({
        "image_id": img_id,
        # Pass the cap explicitly so it always follows MAX_WORDS.
        "generated": truncate(caption, MAX_WORDS),
        "references": imgid_to_captions[img_id]
    })


100%|██████████| 1000/1000 [07:22<00:00,  2.26it/s]


In [None]:
import json

SAVE_PATH = "/content/drive/MyDrive/AICV_captioning/blip_base_lora_captions.json"

# Create the target folder first: without it open() fails and the captions
# from the long generation run would not be saved.
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)

with open(SAVE_PATH, "w") as f:
    json.dump(results, f, indent=2)

print("Saved:", SAVE_PATH)

Saved: /content/drive/MyDrive/AICV_captioning/blip_base_lora_captions.json


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
!git clone https://github.com/cocodataset/cocoapi.git
!pip install -q pycocotools


Cloning into 'cocoapi'...
remote: Enumerating objects: 975, done.[K
remote: Total 975 (delta 0), reused 0 (delta 0), pack-reused 975 (from 1)[K
Receiving objects: 100% (975/975), 11.72 MiB | 11.99 MiB/s, done.
Resolving deltas: 100% (576/576), done.


In [None]:
!pip install pycocoevalcap



In [None]:
# NOTE(review): the file name says "blip2", but the captions come from the
# BLIP base checkpoint with a LoRA adapter loaded earlier in this notebook.
PRED_PATH = "/content/blip2_predictions.json"

# Reshape the result records into image_id/caption entries for the COCO evaluation.
predictions = []
for record in results:
    predictions.append({
        "image_id": record["image_id"],
        "caption": record["generated"],
    })

with open(PRED_PATH, "w") as pred_file:
    json.dump(predictions, pred_file)

print("Saved predictions:", PRED_PATH)


Saved predictions: /content/blip2_predictions.json


In [None]:
import evaluate

# Corpus-level BLEU and METEOR from the `evaluate` package.
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

# One generated caption and one list of reference captions per image.
preds = [r["generated"] for r in results]
refs = [r["references"] for r in results]

# max_order selects the highest n-gram order: 1 for BLEU-1, 4 for BLEU-4.
bleu1 = bleu.compute(predictions=preds, references=refs, max_order=1)["bleu"]
bleu4 = bleu.compute(predictions=preds, references=refs, max_order=4)["bleu"]
meteor_score = meteor.compute(predictions=preds, references=refs)["meteor"]


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
import json

RESULTS_PATH = "/content/drive/MyDrive/AICV_captioning/blip_base_lora_captions.json"

# Reload the saved caption records so the evaluation cells can run without
# regenerating the captions.
with open(RESULTS_PATH, "r") as results_file:
    results = json.load(results_file)

print("Loaded results:", len(results))
first_record = results[0]
print(first_record.keys())


Loaded results: 1000
dict_keys(['image_id', 'generated', 'references'])


In [None]:
# Parallel lists: generated caption and reference captions for each record.
preds, refs = [], []
for record in results:
    preds.append(record["generated"])
    refs.append(record["references"])

print(len(preds), len(refs))


1000 1000


In [None]:
# Same export as the earlier predictions cell, rebuilt from the current `results`.
PRED_PATH = "/content/blip2_predictions.json"

predictions = []
for record in results:
    entry = {
        "image_id": record["image_id"],
        "caption": record["generated"],
    }
    predictions.append(entry)

with open(PRED_PATH, "w") as pred_file:
    json.dump(predictions, pred_file)

print("Saved:", PRED_PATH)


Saved: /content/blip2_predictions.json


In [None]:
ANN_PATH = "/content/coco/annotations/captions_val2014.json"

with open(ANN_PATH, "r") as ann_file:
    coco_gt = json.load(ann_file)

# Ids of the images that actually have a generated caption.
eval_image_ids = {record["image_id"] for record in results}

# Keep only the images and annotations that belong to the evaluated subset.
filtered_images = list(
    filter(lambda img: img["id"] in eval_image_ids, coco_gt["images"])
)
filtered_annotations = list(
    filter(lambda ann: ann["image_id"] in eval_image_ids, coco_gt["annotations"])
)

# Same top-level layout as the original annotation file, restricted to the subset.
filtered_coco_gt = dict(
    info=coco_gt.get("info", {}),
    licenses=coco_gt.get("licenses", []),
    images=filtered_images,
    annotations=filtered_annotations,
)

FILTERED_GT_PATH = "/content/captions_val2014_filtered.json"

with open(FILTERED_GT_PATH, "w") as filtered_file:
    json.dump(filtered_coco_gt, filtered_file)

print("Filtered images:", len(filtered_images))
print("Filtered annotations:", len(filtered_annotations))


Filtered images: 1000
Filtered annotations: 5005


In [None]:
!pip install -q evaluate
import evaluate

bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

bleu1 = bleu.compute(predictions=preds, references=refs, max_order=1)["bleu"]
bleu4 = bleu.compute(predictions=preds, references=refs, max_order=4)["bleu"]
meteor_score = meteor.compute(predictions=preds, references=refs)["meteor"]


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [35]:
!pip install -q pycocotools pycocoevalcap

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

# Ground truth restricted to the evaluated images, plus the generated captions.
coco = COCO(FILTERED_GT_PATH)
coco_res = coco.loadRes(PRED_PATH)

coco_eval = COCOEvalCap(coco, coco_res)

try:
    # evaluate() runs Bleu, METEOR, Rouge, CIDEr, and SPICE
    coco_eval.evaluate()
except Exception as e:
    # Metrics finished before the failure stay in coco_eval.eval. Include the
    # exception type because the message alone can be empty.
    print(f"\nEvaluation warning: {type(e).__name__}: {e}")
    print("Continuing with metrics calculated before error...")

for metric, score in coco_eval.eval.items():
    print(f"{metric}: {score:.4f}")

# Do not report a made-up 0 when CIDEr was never computed.
if "CIDEr" not in coco_eval.eval:
    raise RuntimeError(
        "CIDEr was not computed: the evaluation failed before reaching it."
    )
cider_score = coco_eval.eval["CIDEr"]

loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
computing Bleu score...
{'testlen': 9819, 'reflen': 9707, 'guess': [9819, 8819, 7819, 6819], 'correct': [6144, 2737, 1031, 368]}
ratio: 1.0115380653135868
Bleu_1: 0.626
Bleu_2: 0.441
Bleu_3: 0.295
Bleu_4: 0.193
computing METEOR score...
METEOR: 0.205
computing Rouge score...
ROUGE_L: 0.453
computing CIDEr score...
CIDEr: 0.690
computing SPICE score...

Continuing with metrics calculated before error...
Bleu_1: 0.6257
Bleu_2: 0.4407
Bleu_3: 0.2947
Bleu_4: 0.1928
METEOR: 0.2053
ROUGE_L: 0.4534
CIDEr: 0.6897


In [36]:
# Summary table of the headline scores.
# NOTE(review): bleu1, bleu4 and meteor_score come from the `evaluate`
# metrics, while cider_score comes from COCOEvalCap. COCOEvalCap also
# reports its own Bleu/METEOR values, which differ from these, so the
# table mixes two scoring implementations.
metric_names = ("BLEU-1", "BLEU-4", "METEOR", "CIDEr")
metric_values = (bleu1, bleu4, meteor_score, cider_score)
metrics = dict(zip(metric_names, metric_values))

metrics


{'BLEU-1': 0.5875012681343208,
 'BLEU-4': 0.16188373338890472,
 'METEOR': np.float64(0.37485571574607324),
 'CIDEr': np.float64(0.6897176782692018)}