<a href="https://colab.research.google.com/github/BerhanDemiralp/FileOrganizationProject/blob/master/ImageCaptioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project Setup and Data Download

Make the notebook suitable for GitHub: strip the `widgets` entry from the notebook metadata and save a cleaned copy.

In [14]:
import nbformat

input_path = "ImageCaptioning.ipynb"
output_path = "ImageCaptioning_new.ipynb"

# Parse the source notebook as an nbformat v4 document.
with open(input_path, "r", encoding="utf-8") as src:
    nb = nbformat.read(src, as_version=4)

# Drop the "widgets" metadata entry when present; a missing key is a no-op.
nb.metadata.pop("widgets", None)

# Write the cleaned copy to a separate file so the original stays untouched.
with open(output_path, "w", encoding="utf-8") as dst:
    nbformat.write(nb, dst)

print(f"✅ Temiz notebook kaydedildi: {output_path}")


In [15]:
# !pip install kaggle
# from google.colab import files
# files.upload()  # Select the kaggle.json file here

In [16]:
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c obss-intern-competition-2025

In [17]:
# import zipfile

# with zipfile.ZipFile("obss-intern-competition-2025.zip", "r") as zip_ref:
#     zip_ref.extractall()

In [18]:
# !mv train/train/* train/
# !mv test/test/* test/

In [19]:
# from google.colab import drive
# drive.mount('/content/drive')

In [20]:
import os

# Dataset folders, relative to the notebook's working directory.
train_dir = "train"
test_dir = "test"

# Count the directory entries whose name ends in ".jpg" (case-sensitive match).
num_train_images = sum(1 for entry in os.listdir(train_dir) if entry.endswith(".jpg"))
num_test_images = sum(1 for entry in os.listdir(test_dir) if entry.endswith(".jpg"))

print(f"📸 train klasöründe {num_train_images} adet .jpg dosyası var.")
print(f"🧪 test klasöründe {num_test_images} adet .jpg dosyası var.")


📸 train klasöründe 21367 adet .jpg dosyası var.
🧪 test klasöründe 3771 adet .jpg dosyası var.


# ✅ Model bileşenlerini yükle


In [21]:
from sentence_transformers import SentenceTransformer
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm
import torch
import numpy as np
from numpy import cov, trace, iscomplexobj
from scipy.linalg import sqrtm

# Run on CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The same checkpoint id is used for the encoder-decoder model, the image
# processor and the tokenizer.
model_name = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(model_name)
model = model.to(device)
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

Evaluation Metric


In [22]:
def calculate_fgd(solution_embed: np.ndarray, submission_embed: np.ndarray) -> float:
    """Return the mean per-pair Fréchet distance between two embedding sets.

    Rows are paired positionally: row ``i`` of ``solution_embed`` is compared
    with row ``i`` of ``submission_embed``. For each pair, every row is stacked
    on top of itself to form a two-sample "distribution", and the Fréchet
    formula ``|mu1 - mu2|^2 + Tr(S1 + S2 - 2 * sqrt(S1 @ S2))`` is evaluated.
    Because both stacked samples are identical, the covariance matrices are
    zero and the per-pair score reduces to the squared Euclidean distance
    between the two rows.

    The embedding width is inferred from the data rather than fixed at 384,
    so embedding models of any dimensionality can be scored.

    Args:
        solution_embed: Reference embeddings, one row per sample.
        submission_embed: Predicted embeddings, one row per sample, in the
            same order and with the same width as ``solution_embed``.

    Returns:
        The arithmetic mean of the per-pair scores as a Python ``float``.
        Pairing uses ``zip``, so rows beyond the shorter input are ignored.
    """
    total = len(solution_embed)
    fgd_list = []
    for idx, (sol_emb_sample, sub_emb_sample) in enumerate(zip(solution_embed, submission_embed)):
        # Flatten each sample to a single row; -1 lets NumPy infer the width.
        sol_row = np.asarray(sol_emb_sample).reshape(1, -1)
        sub_row = np.asarray(sub_emb_sample).reshape(1, -1)

        # Duplicate the row so that cov() receives two observations.
        e1 = np.concatenate([sol_row, sol_row])
        e2 = np.concatenate([sub_row, sub_row])

        mu1, sigma1 = e1.mean(axis=0), cov(e1, rowvar=False)
        mu2, sigma2 = e2.mean(axis=0), cov(e2, rowvar=False)
        ssdiff = np.sum((mu1 - mu2) ** 2.0)
        covmean = sqrtm(sigma1.dot(sigma2))
        # sqrtm may return a complex array; only the real part is used.
        if iscomplexobj(covmean):
            covmean = covmean.real
        fgd_list.append(ssdiff + trace(sigma1 + sigma2 - 2.0 * covmean))

        # Lightweight progress indicator, overwritten in place via "\r".
        if idx % 25 == 0:
            print(f"Processed {idx}/{total} samples", end="\r")
    return float(np.mean(fgd_list))

In [12]:
# ✅ 500 örneklik veri al
# Evaluation sample size and the max_length passed to generate(). Raise
# N_SAMPLES for a more reliable score; the fixed random_state keeps the
# sample reproducible.
N_SAMPLES = 20
MAX_CAPTION_LENGTH = 12

# Draw a random subset of the training set.
train_df = pd.read_csv("train.csv").sample(n=N_SAMPLES, random_state=42).reset_index(drop=True)
ground_truth_captions = list(train_df["caption"])

# Generate one caption per sampled image with the captioning model.
generated_captions = []

for image_id in tqdm(train_df["image_id"]):
    image_path = os.path.join("train", str(image_id))
    if not image_path.endswith(".jpg"):
        image_path += ".jpg"
    # Close the file handle as soon as the pixels are decoded; convert()
    # returns a separate in-memory copy that outlives the `with` block.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

    output_ids = model.generate(pixel_values, max_length=MAX_CAPTION_LENGTH, num_beams=1)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    generated_captions.append(caption)

# Embed the reference and generated captions with the same sentence encoder.
embed_model = SentenceTransformer("thenlper/gte-small")

embed_truth = embed_model.encode(ground_truth_captions, convert_to_numpy=True, show_progress_bar=True)
embed_pred = embed_model.encode(generated_captions, convert_to_numpy=True, show_progress_bar=True)



100%|██████████| 20/20 [00:56<00:00,  2.83s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FGD Skoru hesapla

In [13]:
# Score the generated captions against the ground truth (lower is better).
score = calculate_fgd(solution_embed=embed_truth, submission_embed=embed_pred)
print(f"\n\n📉 FGD Skoru: {score:.4f} (daha düşük = daha iyi)")

Processed 0/20 samples

📉 FGD Skoru: 0.2567 (daha düşük = daha iyi)
