In [16]:
# 🔧 Kurulum
#!pip install transformers accelerate -q
#!pip install torchvision pandas tqdm -q

# 📚 Kütüphaneler
import sys
import os
import pandas as pd
from PIL import Image
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForConditionalGeneration
from tqdm import tqdm
import matplotlib as plt
import time
import gc

In [2]:
import torch
def print_gpu_info():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        device_index = torch.cuda.current_device()
        device_name = torch.cuda.get_device_name(device_index)
        total_memory = torch.cuda.get_device_properties(device_index).total_memory / (1024**2)  # MB
        allocated = torch.cuda.memory_allocated(device_index) / (1024**2)  # MB
        reserved = torch.cuda.memory_reserved(device_index) / (1024**2)  # MB

        print(f"🖥️  CUDA Device         : {device_name}")
        print(f"🧠  Total GPU Memory    : {total_memory:.2f} MB")
        print(f"📦  Allocated Memory    : {allocated:.2f} MB")
        print(f"🕳️  Reserved (cached)   : {reserved:.2f} MB")
    else:
        print("❌ CUDA is not available. Using CPU.")


In [3]:
print_gpu_info()

🖥️  CUDA Device         : NVIDIA GeForce RTX 4060 Laptop GPU
🧠  Total GPU Memory    : 8187.50 MB
📦  Allocated Memory    : 0.00 MB
🕳️  Reserved (cached)   : 0.00 MB


In [4]:
# 🚀 Hardware
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [5]:
# 🧠 Model 
model_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)


# Data loader

In [6]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_image_dir = "train/train"
test_image_dir = "test/test"

In [7]:
class CaptionDataset(Dataset):
    def __init__(self, dataframe, processor, image_folder):
        self.data = dataframe
        self.processor = processor
        self.image_folder = image_folder

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_name = str(row["image_id"]) + ".jpg"  # Örn: 0 → 0.jpg
        image_path = os.path.join(self.image_folder, image_name)
        
        image = Image.open(image_path).convert("RGB")
        caption = row["caption"]
        
        encoding = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=32
        )
        
        return {k: v.squeeze(0) for k, v in encoding.items()}

# Train

In [8]:
train_dataset = CaptionDataset(train_df, processor, train_image_dir)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

In [9]:
len(train_dataset)

21367

In [10]:
train_df

Unnamed: 0,image_id,caption
0,0,The image features a comic-style panel depicti...
1,1,"Colorful postcard featuring ""Greetings from Ch..."
2,2,Two vending machines display a variety of drin...
3,3,A man speaks at the eGovernment Conference 201...
4,4,A close-up of several silver coins stacked tog...
...,...,...
21362,21362,A female athlete in a UC San Diego softball un...
21363,21363,The image showcases well-manicured nails featu...
21364,21364,"The image shows various stainless steel pots, ..."
21365,21365,"The image showcases several bottles of wine, p..."


In [13]:
history = {"epoch": [], "loss": [], "time": []}

model.train()
for epoch in range(1):
    start_time = time.time()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        labels = input_ids.clone().detach()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            labels=labels
        )

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    epoch_time = time.time() - start_time
    avg_loss = total_loss / len(train_loader)

    print(f"📘 Epoch {epoch+1} — Loss: {avg_loss:.4f} — Time: {epoch_time:.2f}s")

    history["epoch"].append(epoch + 1)
    history["loss"].append(avg_loss)
    history["time"].append(epoch_time)


Epoch 1: 100%|██████████| 10684/10684 [55:39<00:00,  3.20it/s]

📘 Epoch 1 — Loss: 3.0897 — Time: 3339.60s





# Test

In [19]:
gc.collect()
torch.cuda.empty_cache()

In [20]:
def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=30)
    return processor.decode(output[0], skip_special_tokens=True)

In [21]:
results = []
model.eval()

for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    image_id = str(row["image_id"]) + ".jpg" 
    image_path = os.path.join(test_image_dir, image_id)
    caption = generate_caption(image_path)
    results.append({
        "id": idx + 1,
        "image_id": image_id,
        "image_model_response": caption
    })

100%|██████████| 3771/3771 [31:51<00:00,  1.97it/s]


In [23]:
# 💾 CSV Kaydı
submission_df = pd.DataFrame(results)
submission_df[["image_id", "image_model_response"]].rename(columns={"image_model_response": "caption"}).to_csv("submission.csv", index=False)
submission_df.to_csv("caption_results.csv", index=False)

In [24]:
from scipy.linalg import sqrtm
from numpy import cov, trace, iscomplexobj

In [25]:
import numpy as np
import pandas as pd
from scipy.linalg import sqrtm
from numpy import cov, trace, iscomplexobj
from sentence_transformers import SentenceTransformer

In [26]:

def calculate_fgd(solution_embed: np.ndarray, submission_embed: np.ndarray) -> float:
    fgd_list = []
    for _idx, (sol_emb_sample, sub_emb_sample) in enumerate(zip(solution_embed, submission_embed)):
        sol_emb_sample_rshaped, sub_emb_sample_rshaped = sol_emb_sample.reshape((1, 384)), sub_emb_sample.reshape((1, 384))
        e1 = np.concatenate([sol_emb_sample_rshaped, sol_emb_sample_rshaped])
        e2 = np.concatenate([sub_emb_sample_rshaped, sub_emb_sample_rshaped])

        mu1, sigma1 = e1.mean(axis=0), cov(e1, rowvar=False)
        mu2, sigma2 = e2.mean(axis=0), cov(e2, rowvar=False)

        ssdiff = np.sum((mu1 - mu2)**2.0)
        covmean = sqrtm(sigma1.dot(sigma2))

        if iscomplexobj(covmean):
            covmean = covmean.real

        fgd = ssdiff + trace(sigma1 + sigma2 - 2.0 * covmean)
        fgd_list.append(fgd)

        if _idx % 100 == 0:
            print(f"Processed {_idx} samples", end="\r")

    return float(np.mean(fgd_list))


def evaluate_fgd(train_csv_path, submission_csv_path):
    # Load CSVs
    gt = pd.read_csv(train_csv_path)
    pred = pd.read_csv(submission_csv_path)

    # Sort by image_id for alignment
    gt_sorted = gt.sort_values("image_id").reset_index(drop=True)
    pred_sorted = pred.sort_values("image_id").reset_index(drop=True)

    # Load GTE-small model
    model = SentenceTransformer("thenlper/gte-small", device="cpu") 

    # Embed captions
    gt_embed = model.encode(gt_sorted["caption"].tolist(), convert_to_numpy=True, normalize_embeddings=True)
    pred_embed = model.encode(pred_sorted["caption"].tolist(), convert_to_numpy=True, normalize_embeddings=True)

    # Calculate FGD score
    score = calculate_fgd(gt_embed, pred_embed)
    print(f"\n🔍 FGD Score: {score:.6f} (lower is better)")
    return score


In [27]:
score = evaluate_fgd("train.csv", "submission.csv")
print(f"Final FGD Score: {score}")

Processed 3700 samples
🔍 FGD Score: 0.449373 (lower is better)
Final FGD Score: 0.4493729586714745
