In [None]:
!pip install --upgrade transformers
!pip install --upgrade bitsandbytes

In [None]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from huggingface_hub import login

# Explicit Hugging Face login is disabled; uncomment and supply a token if the
# checkpoint cannot be downloaded with the credentials already in the environment.
#login(HF_TOKEN)

# Checkpoint used for both the model weights and the processor.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# 4-bit NF4 weight quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# device_map="auto" lets the loader decide where the quantized weights are placed.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)

# Processor belonging to the same checkpoint; used below to build prompts,
# prepare image + text inputs and decode generated tokens.
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
!wget "https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip"
!unzip flickr8k.zip

In [None]:
!pip install pycocoevalcap

In [None]:
import json
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge

def load_flickr8k_annotations(annotations_path, max_lines=100):
    """Load reference captions from a Flickr8k-style captions file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line must look like ``<image file name>,<caption>``;
    only the first comma separates the two fields, so captions may themselves
    contain commas.

    Args:
        annotations_path: Path to the captions file.
        max_lines: Maximum number of lines to read after the header (blank
            lines count towards the limit). Defaults to 100, which matches the
            evaluation subset used in this notebook. Pass ``None`` to read the
            whole file.

    Returns:
        dict mapping an image id (the file name up to its first ``.``) to the
        list of its captions, in file order.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    references = {}
    with open(annotations_path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row; tolerate an empty file
        for line_no, raw_line in enumerate(f):
            if max_lines is not None and line_no >= max_lines:
                break
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no caption.
                continue
            image_path, caption = line.split(',', 1)
            image_id = image_path.split('.', 1)[0]  # image id
            references.setdefault(image_id, []).append(caption)
    return references

# Specify the path to the annotations (captions) file, relative to the working directory.
annotations_path = 'captions.txt'
# Mapping from image id to its list of reference captions.
captions = load_flickr8k_annotations(annotations_path)

In [None]:
# Instruction sent to the model together with the image.
prompt="""
Briefly describe in one sentence...
"""
# Single user turn containing one image placeholder followed by the text prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": prompt}
    ]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

# Smoke test on a single image before running the full evaluation loop.
image_path = 'Images/1000268201_693b08cb0e.jpg'
image = Image.open(image_path).convert("RGB")

# The chat template output is passed with add_special_tokens=False so the
# processor does not add special tokens on top of it.
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
).to(model.device)

output = model.generate(**inputs, max_new_tokens=30, temperature=0.4)

# Keep only the newly generated tokens. Special tokens are removed by the
# decoder instead of slicing off the last position: when generation stops at
# max_new_tokens the last token is real text and must not be dropped.
prompt_length = inputs["input_ids"].shape[-1]
generated_text = processor.decode(
    output[0][prompt_length:],
    skip_special_tokens=True
).strip()
print(generated_text)

In [None]:
# Shallow copy of the reference captions: image id -> list of captions.
references = dict(captions)

# image id -> single-element list holding the generated caption.
generated_descriptions = {}

# Instruction sent to the model together with every image.
prompt="""
Briefly describe in one sentence...
"""
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": prompt}
    ]}
]
# The prompt is identical for every image, so the template is rendered once.
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

for key in references:
    image_path = f'Images/{key}.jpg'
    image = Image.open(image_path).convert("RGB")

    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=50, temperature=0.4, top_p=0.8)

    # Decode only the newly generated tokens. Special tokens are removed by the
    # decoder rather than by dropping the last position, which would lose a
    # real token whenever generation stops at max_new_tokens.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = processor.decode(
        output[0][prompt_length:],
        skip_special_tokens=True
    )
    # Collapse newlines and repeated spaces so every caption is a single line
    # before it is handed to the scorers.
    generated_text = " ".join(generated_text.split())
    generated_descriptions[key] = [generated_text]

# Shallow copy used as the candidate set for scoring.
candidates = dict(generated_descriptions)

In [None]:
# Display the generated captions: image id -> single-element list with the caption.
candidates

In [None]:
def calculate_metrics(references, candidates):
    """Score candidate captions against references with four caption metrics.

    Args:
        references: Passed as the first argument to every scorer's
            ``compute_score`` (in this notebook: image id -> list of captions).
        candidates: Passed as the second argument to every scorer's
            ``compute_score`` (in this notebook: image id -> list with one caption).

    Returns:
        dict with the keys ``"BLEU"``, ``"CIDEr"``, ``"METEOR"`` and ``"ROUGE"``,
        each holding the first element of the pair returned by the matching
        scorer's ``compute_score``; the second element is discarded.
    """
    # Factories are called lazily so that each scorer is created right before
    # it is used, in the order BLEU, CIDEr, METEOR, ROUGE.
    scorer_factories = (
        ("BLEU", lambda: Bleu(4)),  # BLEU-1, BLEU-2, BLEU-3, BLEU-4
        ("CIDEr", lambda: Cider()),
        ("METEOR", lambda: Meteor()),
        ("ROUGE", lambda: Rouge()),
    )

    results = {}
    for metric_name, make_scorer in scorer_factories:
        overall_score, _per_item = make_scorer().compute_score(references, candidates)
        results[metric_name] = overall_score
    return results


# Score the generated captions against the references.
scores = calculate_metrics(references, candidates)

# One line per metric; the labels are the notebook's original Turkish output strings.
print(
    f"BLEU Skorları: {scores['BLEU']}",
    f"CIDEr Skoru: {scores['CIDEr']}",
    f"METEOR Skoru: {scores['METEOR']}",
    f"ROUGE Skoru: {scores['ROUGE']}",
    sep="\n",
)


In [None]:
import os
from PIL import Image
from transformers import AutoTokenizer, ViTFeatureExtractor
from torch.utils.data import Dataset, DataLoader
import torch

# Text tokenizer loaded from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Image preprocessor, also loaded from `model_id`.
# NOTE(review): `model_id` names an Mllama (Llama 3.2 Vision) checkpoint while this is a
# ViT-specific feature-extractor class — confirm that it is still provided by the installed
# transformers version and that its output matches what the model expects; reusing the
# image processor that belongs to `processor` may be the intended choice. TODO confirm.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)

def load_flickr8k(annotations_path):
    """Load every caption from a Flickr8k-style captions file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line must look like ``<image file name>,<caption>``;
    only the first comma separates the two fields, so captions may themselves
    contain commas.

    Args:
        annotations_path: Path to the captions file.

    Returns:
        dict mapping an image id (the file name up to its first ``.``) to the
        list of its captions, in file order.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    references = {}
    with open(annotations_path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row; tolerate an empty file
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no caption.
                continue
            image_path, caption = line.split(',', 1)
            image_id = image_path.split('.', 1)[0]  # image id
            references.setdefault(image_id, []).append(caption)
    return references


# Locations of the extracted Flickr8k images and of the captions file.
image_dir = 'Images'
caption_file = 'captions.txt'
# Read from `caption_file` defined right above, so this cell does not depend on
# a path variable left over from an earlier cell.
captions_dict = load_flickr8k(caption_file)

In [None]:
import random
from sklearn.model_selection import train_test_split


# One entry per image id. Splitting on ids keeps all captions of an image
# in the same subset.
image_names = [*captions_dict]

# 80/20 hold-out split with a fixed seed.
train_images, test_images = train_test_split(
    image_names,
    random_state=42,
    test_size=0.2,
)

print(
    f"Train set size: {len(train_images)}",
    f"Test set size: {len(test_images)}",
    sep="\n",
)

In [None]:
class Flickr8kDataset(Dataset):
    """Dataset pairing each Flickr8k image with the tokenized captions of that image.

    Args:
        image_dir: Directory containing the ``<image id>.jpg`` files.
        image_names: Image ids (file names without the ``.jpg`` suffix) in this split.
        captions_dict: Mapping from image id to its list of caption strings.
        tokenizer: Callable used to tokenize the captions; called with
            ``padding='max_length'``, ``truncation=True``, ``max_length`` and
            ``return_tensors="pt"``.
        feature_extractor: Callable used to preprocess the image; its result
            must expose ``pixel_values``.
        max_length: Length every caption is padded or truncated to.

    Each item is a dict with the keys ``'pixel_values'``, ``'input_ids'`` and
    ``'attention_mask'``.

    NOTE(review): all captions of an image are tokenized in a single call, so
    the text tensors keep a leading "number of captions" dimension whenever an
    image has more than one caption. Batching with the default collate function
    presumably requires the same caption count for every image — verify.
    """

    def __init__(self, image_dir, image_names, captions_dict, tokenizer, feature_extractor, max_length=128):
        self.image_dir = image_dir
        self.captions_dict = captions_dict
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.image_names = image_names
        self.max_length = max_length

    def __len__(self):
        """Return the number of images in this split."""
        return len(self.image_names)

    def __getitem__(self, idx):
        """Load image ``idx`` and return its pixel values and tokenized captions."""
        img_name = self.image_names[idx]
        img_path = os.path.join(self.image_dir, img_name + ".jpg")

        image = Image.open(img_path).convert("RGB")
        pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values

        captions = self.captions_dict[img_name]
        inputs = self.tokenizer(
            captions,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Drop only the leading dimension, and only when it has size 1. A bare
        # .squeeze() would also remove any other size-1 dimension and make the
        # item shape depend on the data.
        return {
            'pixel_values': pixel_values.squeeze(0),
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }


In [None]:
# Arguments shared by both splits; only the list of image ids differs.
shared_dataset_args = dict(
    image_dir=image_dir,
    captions_dict=captions_dict,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

train_dataset = Flickr8kDataset(image_names=train_images, **shared_dataset_args)
test_dataset = Flickr8kDataset(image_names=test_images, **shared_dataset_args)

# Shuffle only the training batches; the test order stays fixed.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=False)
