In [1]:
pip install transformers torch scikit-learn  rouge_score




In [2]:
pip install bert-score


Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to c:\users\tito\appdata\local\temp\pip-req-build-4l9h6gi8
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Note: you may need to restart the kernel to use updated packages.


  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git 'C:\Users\Tito\AppData\Local\Temp\pip-req-build-4l9h6gi8'


In [4]:
pip install git+https://github.com/salaniz/pycocoevalcap

Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to c:\users\tito\appdata\local\temp\pip-req-build-ionfkfzh
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Note: you may need to restart the kernel to use updated packages.


  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap 'C:\Users\Tito\AppData\Local\Temp\pip-req-build-ionfkfzh'


In [None]:
import os
import json
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import torch
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

# ---- Métricas externas ----
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from bert_score import score as bert_score

# Runtime configuration: device, dataset locations and output files.
device = "cuda" if torch.cuda.is_available() else "cpu"
image_dir = "./COCO/images/val2017"
instances_path = "./COCO/annotations/instances_val2017.json"
captions_path = "./COCO/annotations/captions_val2017.json"
output_json = "captions_llava_val2017.json"
output_coco_json = "captions_llava_val2017_coco.json"

# COCO category names treated as vehicles throughout this cell.
vehicle_classes = ["car", "motorcycle", "bus", "truck", "bicycle"]

# Load the LLaVA-NeXT processor and model (half precision) onto `device`.
llava_checkpoint = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = LlavaNextProcessor.from_pretrained(llava_checkpoint)
model = LlavaNextForConditionalGeneration.from_pretrained(
    llava_checkpoint,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model = model.to(device)

# Load the COCO instance and caption annotations.
coco_instances = COCO(instances_path)
coco_captions = COCO(captions_path)

# Lookup tables between image ids and file names (both directions).
image_id_to_filename = {}
for image_record in coco_instances.dataset["images"]:
    image_id_to_filename[image_record["id"]] = image_record["file_name"]
filename_to_image_id = dict(
    zip(image_id_to_filename.values(), image_id_to_filename.keys())
)

# Collect every image that contains at least one annotated vehicle category.
vehicle_cat_ids = coco_instances.getCatIds(catNms=vehicle_classes)
img_ids = set()
for cat_id in vehicle_cat_ids:
    img_ids.update(coco_instances.getImgIds(catIds=[cat_id]))

selected_images = [image_id_to_filename[img_id] for img_id in img_ids]

# ----------------------------
# Limpieza de captions
# ----------------------------


def clean_caption(raw_caption: str) -> str:
    """Return the model answer without the echoed ``[INST] ... [/INST]`` prompt.

    Everything up to and including the last ``[/INST]`` marker is dropped.
    When the marker is absent the whole text is kept. Leading and trailing
    whitespace is stripped in both cases.
    """
    # rpartition yields ('', '', raw_caption) when the marker is missing, so
    # the third element is always the part we want to keep.
    _prefix, _marker, answer = raw_caption.rpartition("[/INST]")
    return answer.strip()


# Generate captions with LLaVA
captions = {}      # file name -> cleaned model answer ("" when generation failed)
coco_results = []  # COCO results format: [{"image_id": ..., "caption": ...}]

# The instruction is identical for every image, so the chat prompt is rendered
# once instead of being rebuilt on every iteration.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "List all vehicles visible in the image using only the following options: car, bus, truck, motorcycle, bicycle. Respond only with a comma-separated list of these words. Do not include any other text."},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

for filename in tqdm(selected_images, desc="Generando captions"):
    try:
        image_path = os.path.join(image_dir, filename)
        # Use a context manager so the file handle is released as soon as the
        # pixels have been decoded into the RGB copy.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        inputs = processor(images=image, text=prompt,
                           return_tensors="pt").to(device)

        # Passing pad_token_id explicitly avoids the per-image
        # "Setting `pad_token_id` to `eos_token_id`" log line.
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=processor.tokenizer.eos_token_id,
        )
        raw_caption = processor.decode(output[0], skip_special_tokens=True)
        caption = clean_caption(raw_caption)

        captions[filename] = caption

        # Store the same answer in COCO results format
        image_id = filename_to_image_id[filename]
        coco_results.append({"image_id": image_id, "caption": caption})

    except Exception as e:
        # Best effort: report the failure and record an empty answer so the
        # image still takes part in the evaluation below.
        print(f"Error con {filename}: {e}")
        captions[filename] = ""

# Save the per-file answers (explicit encoding; the reader cell opens it as UTF-8)
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(captions, f, indent=2)

# Save the COCO-format results
with open(output_coco_json, "w", encoding="utf-8") as f:
    json.dump(coco_results, f, indent=2)

print(f"\n✅ Captions guardados en {output_json}")
print(f"✅ Formato COCO guardado en {output_coco_json}")

# ----------------------------
# Evaluación multi-label (vehículos)
# ----------------------------


def extract_vehicles_from_caption(caption, classes=None):
    """Return the vehicle class names mentioned in ``caption``.

    Matching is case-insensitive and done on whole words, so "carpet" or
    "business" no longer count as "car" / "bus". A regular plural of a class
    name ("cars", "buses") is accepted as a mention of that class.

    Args:
        caption: Text produced by the model.
        classes: Class names to look for. Defaults to the module-level
            ``vehicle_classes`` list.

    Returns:
        The matched class names, each at most once, in the order of ``classes``.
    """
    import re  # local import keeps this helper self-contained

    if classes is None:
        classes = vehicle_classes

    text = caption.lower()
    found = []
    for name in classes:
        # Names ending in "s" take an "es" plural ("buses"); the others take "s".
        plural = "(?:es)?" if name.endswith("s") else "s?"
        if re.search(r"\b" + re.escape(name) + plural + r"\b", text):
            found.append(name)
    return found


# One binary indicator vector per image, with one slot per entry of vehicle_classes.
y_true_all = []
y_pred_all = []

for filename, caption in captions.items():
    image_id = filename_to_image_id[filename]

    # Ground truth: names of the vehicle categories annotated on this image.
    ann_ids = coco_instances.getAnnIds(
        imgIds=[image_id], catIds=vehicle_cat_ids)
    gt_vehicles = {
        coco_instances.loadCats(ann["category_id"])[0]["name"]
        for ann in coco_instances.loadAnns(ann_ids)
    }

    # Prediction: class names found in the model's answer.
    pred_vehicles = extract_vehicles_from_caption(caption)

    y_true_all.append([int(v in gt_vehicles) for v in vehicle_classes])
    y_pred_all.append([int(v in pred_vehicles) for v in vehicle_classes])

if not y_true_all:
    print("\n⚠️ No se encontraron vehículos en las imágenes seleccionadas.")
else:
    # Sample-averaged multi-label metrics; undefined ratios are scored as 0.
    metric_options = {"average": "samples", "zero_division": 0}
    precision = precision_score(y_true_all, y_pred_all, **metric_options)
    recall = recall_score(y_true_all, y_pred_all, **metric_options)
    f1 = f1_score(y_true_all, y_pred_all, **metric_options)

    print("\n🔹 Vehicle detection (multi-label)")
    print(f"Precision: {precision:.2%}, Recall: {recall:.2%}, F1: {f1:.2%}")


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  3.84it/s]


loading annotations into memory...
Done (t=1.96s)
creating index...
index created!
loading annotations into memory...
Done (t=0.06s)
creating index...
index created!


Generando captions:   0%|          | 0/870 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generando captions:   0%|          | 1/870 [00:56<13:42:09, 56.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generando captions:   0%|          | 2/870 [02:03<15:03:49, 62.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generando captions:   0%|          | 3/870 [02:58<14:17:45, 59.36s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generando captions:   0%|          | 4/870 [03:59<14:23:19, 59.81s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generando captions:   1%|          | 5/870 [05:10<15:19:27, 63.78s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generando captions:   1%|          | 6/870 [06:11<15:07:15, 63.00s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generando captions:   1%|          | 7/870 [0


✅ Captions guardados en captions_llava_val2017.json
✅ Formato COCO guardado en captions_llava_val2017_coco.json

🔹 Vehicle detection (multi-label)
Precision: 89.16%, Recall: 89.81%, F1: 86.82%


In [1]:
import json

# Read the saved answers: a mapping of file name -> comma-separated label string.
with open("captions_llava_val2017.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Tally every non-empty, whitespace-trimmed label across all images.
per_class = {}
for objs in data.values():
    for piece in objs.split(","):
        label = piece.strip()
        if not label:
            continue
        per_class[label] = per_class.get(label, 0) + 1

# Every counted label contributes exactly one detection to the total.
total_detections = sum(per_class.values())

print("Detecciones totales:", total_detections)
print("Por clase:")
for cls in per_class:
    count = per_class[cls]
    print(f"  {cls}: {count}")


Detecciones totales: 4505
Por clase:
  car: 3282
  bus: 349
  truck: 378
  motorcycle: 341
  bicycle: 150
  motor: 5


In [10]:
pip install accelerate


Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.10.1-py3-none-any.whl (374 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.10.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install huggingface_hub[hf_xet]

In [None]:
# Clone the DeepSeek-VL repository into a `DeepSeek-VL` folder under the
# current working directory.
!git clone https://github.com/deepseek-ai/DeepSeek-VL

Cloning into 'DeepSeek-VL'...


In [5]:
# Switch the kernel's working directory to the cloned DeepSeek-VL checkout.
# NOTE(review): the change persists for later cells, so relative paths such as
# ./COCO/... resolve inside DeepSeek-VL unless the directory is restored.
cd DeepSeek-VL

c:\Users\Tito\Desktop\Master\TFM\development\DeepSeek-VL


In [6]:
pip install -e .

Obtaining file:///C:/Users/Tito/Desktop/Master/TFM/development/DeepSeek-VL
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Collecting sentencepiece (from deepseek_vl==1.0.0)
  Downloading sentencepiece-0.2.1-cp310-cp310-win_amd64.whl.metadata (10 kB)
Collecting attrdict (from deepseek_vl==1.0.0)
  Downloading attrdict-2.0.1-py2.py3-none-any.whl.metadata (6.7 kB)
Downloading attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)
Downloading sentencepiece-0.2.1-cp310-cp310-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:

In [8]:
pip install --upgrade transformers


Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install git+https://github.com/huggingface/transformers accelerate


Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to c:\users\tito\appdata\local\temp\pip-req-build-rwt41tbu
  Resolved https://github.com/huggingface/transformers to commit e11a00a16f925b7d3b52f5007bdce3464edb361f
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting huggingface-hub==1.0.0.rc2 (from transformers==4.57.0.dev0)
  Using cached huggingface_hub-1.0.0rc2-py3-none-any.whl.metadata (14 kB)
Using cached huggingface_hub-1.0.0rc2-py3-none-any.whl (528 kB)
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml): started
  Building wheel for transformers (pyproject.toml): finished with stat

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers 'C:\Users\Tito\AppData\Local\Temp\pip-req-build-rwt41tbu'


In [15]:
pip install qwen-vl-utils[decord]==0.0.8

Collecting qwen-vl-utils==0.0.8 (from qwen-vl-utils[decord]==0.0.8)Note: you may need to restart the kernel to use updated packages.

  Downloading qwen_vl_utils-0.0.8-py3-none-any.whl.metadata (3.6 kB)
Collecting av (from qwen-vl-utils==0.0.8->qwen-vl-utils[decord]==0.0.8)
  Downloading av-15.1.0-cp310-cp310-win_amd64.whl.metadata (4.7 kB)
Collecting decord (from qwen-vl-utils[decord]==0.0.8)
  Downloading decord-0.6.0-py3-none-win_amd64.whl.metadata (422 bytes)
Downloading qwen_vl_utils-0.0.8-py3-none-any.whl (5.9 kB)
Downloading av-15.1.0-cp310-cp310-win_amd64.whl (31.3 MB)
   ---------------------------------------- 0.0/31.3 MB ? eta -:--:--
   --- ------------------------------------ 2.4/31.3 MB 12.2 MB/s eta 0:00:03
   ------------ --------------------------- 10.0/31.3 MB 24.9 MB/s eta 0:00:01
   ---------------------------------- ----- 27.3/31.3 MB 44.3 MB/s eta 0:00:01
   ---------------------------------------  31.2/31.3 MB 47.2 MB/s eta 0:00:01
   ----------------------------

In [None]:
import os
import json
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
from pycocotools.coco import COCO
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# -----------------------------
# Configuration
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
image_dir = "./COCO/images/val2017"
instances_path = "./COCO/annotations/instances_val2017.json"
captions_path = "./COCO/annotations/captions_val2017.json"
output_json = "captions_qwen_val2017.json"
output_coco_json = "captions_qwen_val2017_coco.json"

# COCO category names treated as vehicles throughout this cell.
vehicle_classes = ["car", "motorcycle", "bus", "truck", "bicycle"]

# -----------------------------
# Load the Qwen2.5-VL-7B processor and model
# -----------------------------
qwen_checkpoint = "Qwen/Qwen2.5-VL-7B-Instruct"
processor = AutoProcessor.from_pretrained(qwen_checkpoint)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    qwen_checkpoint,
    torch_dtype="auto",
    device_map="auto",
)

# -----------------------------
# Load the COCO annotations
# -----------------------------
coco_instances = COCO(instances_path)
coco_captions = COCO(captions_path)

# Lookup tables between image ids and file names (both directions).
image_id_to_filename = {}
for image_record in coco_instances.dataset["images"]:
    image_id_to_filename[image_record["id"]] = image_record["file_name"]
filename_to_image_id = dict(
    zip(image_id_to_filename.values(), image_id_to_filename.keys())
)

# Collect every image that contains at least one annotated vehicle category.
vehicle_cat_ids = coco_instances.getCatIds(catNms=vehicle_classes)
img_ids = set()
for cat_id in vehicle_cat_ids:
    img_ids.update(coco_instances.getImgIds(catIds=[cat_id]))
selected_images = [image_id_to_filename[img_id] for img_id in img_ids]

# -----------------------------
# Función de limpieza
# -----------------------------
def clean_caption(raw_caption: str) -> str:
    """Return the model answer with surrounding whitespace removed.

    This definition replaces the ``clean_caption`` declared earlier in the
    notebook, so it keeps that helper's behaviour as well: when the text
    contains an ``[INST] ... [/INST]`` prompt block, everything up to and
    including the last ``[/INST]`` marker is dropped. Text without the marker
    is only stripped.
    """
    # rpartition yields ('', '', raw_caption) when the marker is missing.
    _prefix, _marker, answer = raw_caption.rpartition("[/INST]")
    return answer.strip()

# -----------------------------
# Generate captions with Qwen
# -----------------------------
captions = {}      # file name -> cleaned model answer ("" when generation failed)
coco_results = []  # COCO results format: [{"image_id": ..., "caption": ...}]

prompt_template = (
    "List all vehicles visible in the image using only these options: "
    "car, bus, truck, motorcycle, bicycle. Respond with a comma-separated list only."
)

# Qwen2.5-VL expects the prompt to be rendered through the chat template: that
# is what inserts the image placeholder the processor aligns with the image
# features, and what appends the assistant generation prompt. The instruction
# is the same for every image, so it is rendered once.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_template},
        ],
    }
]
chat_prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

for filename in tqdm(selected_images, desc="Generando captions"):
    try:
        image_path = os.path.join(image_dir, filename)
        # Use a context manager so the file handle is released as soon as the
        # pixels have been decoded into the RGB copy.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        # The model is loaded with device_map="auto", so follow the model's
        # own device instead of assuming one.
        inputs = processor(
            images=[image], text=[chat_prompt], return_tensors="pt"
        ).to(model.device)

        output_ids = model.generate(**inputs, max_new_tokens=50)

        # Decode only the newly generated tokens. The returned sequence starts
        # with the prompt, which itself names every vehicle class and would
        # make the keyword matcher report all classes for every image.
        prompt_length = inputs["input_ids"].shape[1]
        raw_caption = processor.decode(
            output_ids[0][prompt_length:], skip_special_tokens=True
        )
        caption = clean_caption(raw_caption)

        captions[filename] = caption

        # Store the same answer in COCO results format
        image_id = filename_to_image_id[filename]
        coco_results.append({"image_id": image_id, "caption": caption})

    except Exception as e:
        # Best effort: report the failure and record an empty answer so the
        # image still takes part in the evaluation below.
        print(f"Error con {filename}: {e}")
        captions[filename] = ""

# Save both JSON files
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(captions, f, indent=2)

with open(output_coco_json, "w", encoding="utf-8") as f:
    json.dump(coco_results, f, indent=2)

print(f"\n✅ Captions guardados en {output_json}")
print(f"✅ Formato COCO guardado en {output_coco_json}")

# -----------------------------
# Evaluación multi-label
# -----------------------------
def extract_vehicles_from_caption(caption, classes=None):
    """Return the vehicle class names mentioned in ``caption``.

    Matching is case-insensitive and done on whole words, so "carpet" or
    "business" no longer count as "car" / "bus". A regular plural of a class
    name ("cars", "buses") is accepted as a mention of that class.

    Args:
        caption: Text produced by the model.
        classes: Class names to look for. Defaults to the module-level
            ``vehicle_classes`` list.

    Returns:
        The matched class names, each at most once, in the order of ``classes``.
    """
    import re  # local import keeps this helper self-contained

    if classes is None:
        classes = vehicle_classes

    text = caption.lower()
    found = []
    for name in classes:
        # Names ending in "s" take an "es" plural ("buses"); the others take "s".
        plural = "(?:es)?" if name.endswith("s") else "s?"
        if re.search(r"\b" + re.escape(name) + plural + r"\b", text):
            found.append(name)
    return found

# One binary indicator vector per image, with one slot per entry of vehicle_classes.
y_true_all = []
y_pred_all = []

for filename, caption in captions.items():
    image_id = filename_to_image_id[filename]

    # Ground truth: names of the vehicle categories annotated on this image.
    ann_ids = coco_instances.getAnnIds(imgIds=[image_id], catIds=vehicle_cat_ids)
    gt_vehicles = {
        coco_instances.loadCats(ann["category_id"])[0]["name"]
        for ann in coco_instances.loadAnns(ann_ids)
    }

    # Prediction: class names found in the model's answer.
    pred_vehicles = extract_vehicles_from_caption(caption)

    y_true_all.append([int(v in gt_vehicles) for v in vehicle_classes])
    y_pred_all.append([int(v in pred_vehicles) for v in vehicle_classes])

if not y_true_all:
    print("\n⚠️ No se encontraron vehículos en las imágenes seleccionadas.")
else:
    # Sample-averaged multi-label metrics; undefined ratios are scored as 0.
    metric_options = {"average": "samples", "zero_division": 0}
    precision = precision_score(y_true_all, y_pred_all, **metric_options)
    recall = recall_score(y_true_all, y_pred_all, **metric_options)
    f1 = f1_score(y_true_all, y_pred_all, **metric_options)
    print("\n🔹 Vehicle detection (multi-label)")
    print(f"Precision: {precision:.2%}, Recall: {recall:.2%}, F1: {f1:.2%}")


Loading checkpoint shards:  20%|██        | 1/5 [00:14<00:57, 14.42s/it]