# Import Libraries

In [1]:
import numpy as np
import json
import torch
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Model Architecture

In [3]:
import torch
import torch.nn as nn
from transformers import (
    GPT2LMHeadModel,
    AdamW,
    get_linear_schedule_with_warmup,
    set_seed,
)
from tqdm import tqdm
from typing import Tuple, Optional
from accelerate import Accelerator

class MLP(nn.Module):
    """Feed-forward stack of ``nn.Linear`` layers.

    Every linear layer except the last one is followed by an activation and
    a ``Dropout(p=0.2)``. The module is used elsewhere in this file to project
    a CLIP embedding to the embeddings of the first ``prefix_length`` tokens.
    """

    def __init__(self, sizes: Tuple[int, ...], bias=True, act=nn.Tanh):
        """Build the layer stack.

        Args:
            sizes: Layer widths; consecutive pairs become the in/out features
                of each linear layer.
            bias: Whether the linear layers carry a bias term.
            act: Activation class instantiated after each hidden layer.
        """
        super().__init__()
        final_index = len(sizes) - 2
        stack = []
        for index, (fan_in, fan_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            stack.append(nn.Linear(fan_in, fan_out, bias=bias))
            if index != final_index:
                # Hidden layers only: non-linearity plus dropout.
                stack.extend([act(), nn.Dropout(p=0.2)])
        self.model = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the layer stack to ``x``."""
        return self.model(x)


class ClipCaptionModel(nn.Module):
    """GPT-2 language model conditioned on a projected CLIP prefix.

    The CLIP embedding is mapped by ``clip_project`` to ``prefix_length``
    vectors of GPT-2 embedding size, which are placed in front of the caption
    token embeddings before calling the language model.
    """

    def __init__(self, prefix_length: int = 10, prefix_size: int = 512):
        """Load the pretrained GPT-2 and create the prefix projection.

        Args:
            prefix_length: Number of prefix positions fed to GPT-2.
            prefix_size: Dimensionality of the incoming CLIP embedding.
        """
        super().__init__()
        self.prefix_length = prefix_length
        self.gpt = GPT2LMHeadModel.from_pretrained("imthanhlv/gpt2news")
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        projected_size = self.gpt_embedding_size * prefix_length
        # Two-layer mapper: CLIP size -> half of the target size -> target size.
        self.clip_project = MLP((prefix_size, projected_size // 2, projected_size))

    def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
        """Return an int64 zero tensor of shape (batch_size, prefix_length)."""
        shape = (batch_size, self.prefix_length)
        return torch.zeros(shape, dtype=torch.int64, device=device)

    def forward(
        self,
        tokens: torch.Tensor,
        prefix: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
    ):
        """Run GPT-2 on the projected prefix followed by the token embeddings.

        Args:
            tokens: Caption token ids, embedded with GPT-2's ``wte``.
            prefix: CLIP embedding(s) fed to ``clip_project``.
            mask: Forwarded unchanged as ``attention_mask``; presumably it
                must already cover prefix + tokens -- TODO confirm.
            labels: Only tested against ``None``. When given, its value is
                discarded and the labels become zeros for the prefix
                positions followed by ``tokens``.

        Returns:
            The output object returned by ``self.gpt``.
        """
        token_embeddings = self.gpt.transformer.wte(tokens)
        projected = self.clip_project(prefix)
        prefix_embeddings = projected.view(
            -1, self.prefix_length, self.gpt_embedding_size
        )
        inputs = torch.cat((prefix_embeddings, token_embeddings), dim=1)

        if labels is not None:
            # NOTE(review): prefix positions get label 0 rather than an
            # ignore value -- confirm this is the intended training target.
            prefix_labels = self.get_dummy_token(tokens.shape[0], tokens.device)
            labels = torch.cat((prefix_labels, tokens), dim=1)

        return self.gpt(inputs_embeds=inputs, labels=labels, attention_mask=mask)


class ClipCaptionPrefix(ClipCaptionModel):
    """Variant that exposes only the prefix mapper as its parameters.

    ``parameters()`` yields the ``clip_project`` parameters only, and
    ``train()`` always leaves the wrapped GPT-2 in evaluation mode.
    """

    def parameters(self, recurse: bool = True):
        """Return the parameters of ``clip_project`` (``recurse`` is ignored)."""
        mapper = self.clip_project
        return mapper.parameters()

    def train(self, mode: bool = True):
        """Switch the training mode, then force ``self.gpt`` back to eval."""
        super().train(mode)
        self.gpt.eval()
        return self

# Dataset structure

In [4]:
class ClipDataset(torch.utils.data.Dataset):
    """Dataset over a ``.pt`` file holding caption tokens and CLIP embeddings.

    The file is read with ``torch.load`` and must contain the keys
    ``"target"`` (caption tokens) and ``"clip_embedding"`` (prefixes).
    """

    def __init__(self, data_path, prefix_length):
        """Load the file and build one all-ones mask per caption.

        Args:
            data_path: Path of the ``.pt`` file to load.
            prefix_length: Stored on the instance; not used by the dataset
                itself.
        """
        payload = torch.load(data_path)
        self.tokens = payload["target"]
        self.prefixes = payload["clip_embedding"]
        self.prefix_length = prefix_length
        # One mask per caption, as long as the caption, filled with ones.
        self.masks = []
        for caption in self.tokens:
            self.masks.append(torch.ones(len(caption)))

    def __len__(self):
        """Number of captions in the dataset."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """Return ``(token, mask, prefix)`` for ``idx``; prefix is cast to float."""
        prefix = self.prefixes[idx].float()  # cast to float32
        return self.tokens[idx], self.masks[idx], prefix

# Function

In [5]:
def generate_beam(model, tokenizer, beam_size: int = 5, prompt=None, embed=None,
                  entry_length=67, temperature=1., stop_token: str = '.'):
    """Generate text with length-normalised beam search.

    Decoding starts from ``embed`` when it is given, otherwise from the
    embedded ``prompt``. At every step the candidates are ranked by their
    average log-probability per generated token and the best ``beam_size``
    are kept. A beam is finished once it emits the stop token.

    Args:
        model: Object exposing ``eval()``, ``parameters()``, a callable
            ``gpt(inputs_embeds=...)`` whose result has ``.logits``, and the
            embedding layer ``gpt.transformer.wte``.
        tokenizer: Object providing ``encode`` and ``decode``.
        beam_size: Number of hypotheses kept per step.
        prompt: Text to start from; used only when ``embed`` is ``None``.
        embed: Input embeddings to start from. The first dimension is
            expanded to ``beam_size``, so it is expected to be 1.
        entry_length: Maximum number of decoding steps.
        temperature: Logits are divided by this value when it is positive.
        stop_token: Text whose first token id marks the end of a beam.

    Returns:
        Decoded strings, best average log-probability first, with
        ``'<pad>'`` removed and surrounding whitespace stripped. When a
        prompt is used, each string contains the prompt followed by the
        generated continuation.

    Raises:
        ValueError: If neither ``prompt`` nor ``embed`` is provided.
    """
    if embed is None and prompt is None:
        raise ValueError("generate_beam requires either `prompt` or `embed`")

    model.eval()
    stop_token_index = tokenizer.encode(stop_token)[0]

    tokens = None
    scores = None
    # Number of prompt tokens stored in front of the generated ones.
    prompt_length = 0
    device = next(model.parameters()).device
    # Generated length per beam (float so it can divide the scores).
    seq_lengths = torch.ones(beam_size, device=device)
    is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
    with torch.no_grad():
        if embed is not None:
            generated = embed
        else:
            tokens = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)
            prompt_length = tokens.shape[1]
            generated = model.gpt.transformer.wte(tokens)
        for _ in range(entry_length):
            outputs = model.gpt(inputs_embeds=generated)
            logits = outputs.logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
            # log_softmax avoids the underflow of softmax(...).log().
            logits = torch.log_softmax(logits, dim=-1)
            if scores is None:
                # First step: seed the beams with the top-k continuations.
                scores, next_tokens = logits.topk(beam_size, -1)
                generated = generated.expand(beam_size, *generated.shape[1:])
                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                if tokens is None:
                    tokens = next_tokens
                else:
                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
                    tokens = torch.cat((tokens, next_tokens), dim=1)
            else:
                # A finished beam may only continue with token id 0 at zero
                # cost, so its score and length stay frozen.
                # NOTE(review): id 0 is presumably the pad token -- confirm.
                logits[is_stopped] = float("-inf")
                logits[is_stopped, 0] = 0
                scores_sum = scores[:, None] + logits
                seq_lengths[~is_stopped] += 1
                scores_sum_average = scores_sum / seq_lengths[:, None]
                # Rank every (beam, token) pair by average log-probability.
                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                # Flat index -> originating beam and token id.
                next_tokens_source = next_tokens // scores_sum.shape[1]
                seq_lengths = seq_lengths[next_tokens_source]
                next_tokens = next_tokens % scores_sum.shape[1]
                next_tokens = next_tokens.unsqueeze(1)
                tokens = tokens[next_tokens_source]
                tokens = torch.cat((tokens, next_tokens), dim=1)
                generated = generated[next_tokens_source]
                # Back to summed log-probabilities for the next step.
                scores = scores_sum_average * seq_lengths
                is_stopped = is_stopped[next_tokens_source]
            # next_tokens is (beam_size, 1), so the embedding is (beam_size, 1, dim).
            next_token_embed = model.gpt.transformer.wte(next_tokens)
            generated = torch.cat((generated, next_token_embed), dim=1)
            is_stopped = is_stopped | next_tokens.eq(stop_token_index).squeeze(1)
            if is_stopped.all():
                break
    scores = scores / seq_lengths
    output_list = tokens.cpu().numpy()
    # seq_lengths counts generated tokens only, so the prompt tokens stored at
    # the front of each row must be added to the slice end.
    output_texts = [
        tokenizer.decode(output[:prompt_length + int(length)])
        for output, length in zip(output_list, seq_lengths)
    ]
    order = scores.argsort(descending=True).tolist()
    output_texts = [output_texts[i] for i in order]
    cleaned_texts = [text.replace('<pad>', '').strip() for text in output_texts]
    return cleaned_texts

In [6]:
def generate_references_and_hypotheses(model, tokenizer, test_loader, prefix_length, references_path, hypotheses_path):
    """Decode references and beam-search hypotheses, and dump both to JSON.

    For every sample yielded by ``test_loader`` the ground-truth tokens are
    decoded (special tokens skipped) and a caption is generated from the
    sample's projected prefix, keeping the top beam of ``generate_beam``.

    Args:
        model: Captioning model providing ``clip_project``.
        tokenizer: Tokenizer used for decoding and generation.
        test_loader: Iterable of ``(tokens, masks, prefixes)`` batches.
        prefix_length: Number of prefix positions the projection is reshaped to.
        references_path: Output JSON file for the reference sentences.
        hypotheses_path: Output JSON file for the generated sentences.

    Returns:
        ``(references, hypotheses)``, each a list of word lists.
    """
    model.eval()
    device = next(model.parameters()).device

    references, reference_sentences = [], []
    hypotheses, hypothesis_sentences = [], []

    with torch.no_grad():
        for batch_tokens, batch_masks, batch_prefixes in test_loader:
            batch_tokens = batch_tokens.to(device)
            batch_masks = batch_masks.to(device)
            batch_prefixes = batch_prefixes.to(device)

            for sample_tokens, sample_prefix in zip(batch_tokens, batch_prefixes):
                # Ground-truth caption as a list of whitespace-separated words.
                decoded = tokenizer.decode(sample_tokens.cpu().numpy(), skip_special_tokens=True)
                reference_words = decoded.split()
                references.append(reference_words)
                reference_sentences.append(" ".join(reference_words))

                # Project the single prefix and keep the best beam.
                projected = model.clip_project(sample_prefix.unsqueeze(0))
                prefix_embed = projected.reshape(1, prefix_length, -1)
                candidates = generate_beam(
                    model, tokenizer, embed=prefix_embed, temperature=1, beam_size=5
                )
                hypothesis_words = candidates[0].split()
                hypotheses.append(hypothesis_words)
                hypothesis_sentences.append(" ".join(hypothesis_words))

    # References are written first, then hypotheses.
    outputs = (
        (references_path, reference_sentences),
        (hypotheses_path, hypothesis_sentences),
    )
    for target_path, sentences in outputs:
        with open(target_path, 'w', encoding='utf-8') as f:
            json.dump(sentences, f, ensure_ascii=False, indent=4)

    return references, hypotheses

# Preparation

## Loading model

In [7]:
# Pick the inference device: the first CUDA device when available, else CPU.
CPU = torch.device('cpu')
device = "cuda:0" if torch.cuda.is_available() else CPU

# Build the prefix model (prefix_length=10) and restore the saved weights.
model_path = "/kaggle/input/cs420-final/checkpoints/best_model.pt"
model = ClipCaptionPrefix(10)
model.load_state_dict(torch.load(model_path, map_location=device))

# Inference only: evaluation mode, then move to the selected device.
model = model.eval().to(device)

config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

## Loading tokenizer

In [8]:
from transformers import AutoTokenizer
# Tokenizer loaded from the same checkpoint name that ClipCaptionModel uses
# for its GPT-2 weights.
tokenizer = AutoTokenizer.from_pretrained("imthanhlv/gpt2news")

tokenizer.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

## Loading test_loader

In [9]:
# Test split: precomputed embeddings, iterated in file order.
batch_size = 16
test_dataset = ClipDataset(
    data_path="/kaggle/input/cs420-final/embedding_images/ViT_B_16/test_img.pt",
    prefix_length=10,
)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Predict

In [10]:
# Run generation over the whole test loader and persist both sentence lists.
references, hypotheses = generate_references_and_hypotheses(
    model,
    tokenizer,
    test_loader,
    prefix_length=10,
    references_path="/kaggle/working/references.json",
    hypotheses_path="/kaggle/working/hypotheses.json",
)

# Evaluation

In [11]:
import json

In [12]:
def read_json_file(filepath):
    """Load a UTF-8 JSON file and return the parsed data.

    Args:
        filepath: The path to the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file, or ``None`` (after
        printing a message) when the file is missing or is not valid JSON.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {filepath}")
    return None

In [13]:
def transform_data(hypotheses_data, references_data, group_size=5):
    """Regroup flat sentence lists into one hypothesis and many references.

    Args:
        hypotheses_data: Flat list of hypotheses; the first entry of every
            ``group_size`` consecutive entries is kept.
        references_data: Flat list of references; consecutive entries are
            chunked into sub-lists of ``group_size`` (the last may be shorter).
        group_size: Number of consecutive entries forming one group.

    Returns:
        ``(hypotheses, references)`` with one item per group.
    """
    # Keep only the first sentence of each group.
    hypotheses = [
        hypotheses_data[start]
        for start in range(0, len(hypotheses_data), group_size)
    ]
    # Chunk the references into per-group sub-lists.
    references = [
        references_data[start:start + group_size]
        for start in range(0, len(references_data), group_size)
    ]
    return hypotheses, references

In [14]:
# Files written by the prediction step.
hypotheses_filepath = "/kaggle/working/hypotheses.json"
references_filepath = "/kaggle/working/references.json"

# Reload the saved sentences and regroup them for scoring.
hypotheses_data = read_json_file(hypotheses_filepath)
references_data = read_json_file(references_filepath)
hypotheses, references = transform_data(
    hypotheses_data=hypotheses_data,
    references_data=references_data,
)

In [15]:
!pip install nltk rouge-score pycocoevalcap

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=78e8c0b26ee0ea055e65afc2db225e9f0ecfd7caa8774921585c613d3ddcca5b
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, pycocoevalcap
Successfully installed pycocoevalcap-1.2 rouge-score-0.1.2


In [16]:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.spice.spice import Spice

In [17]:
def format_inputs_for_eval(hypotheses, references):
    """Reshape the inputs into the dictionaries passed to the scorers.

    Args:
        hypotheses: List of predicted sentences.
        references: List of lists of ground-truth sentences.

    Returns:
        ``(gts_dict, res_dict)``: both keyed by the sample position;
        ``gts_dict`` maps to the reference list and ``res_dict`` to a
        one-element list holding the hypothesis.
    """
    gts_dict = dict(enumerate(references))
    res_dict = {}
    for sample_id, sentence in enumerate(hypotheses):
        res_dict[sample_id] = [sentence]
    return gts_dict, res_dict

def compute_metrics_with_pycocoevalcap(hypotheses, references):
    """Compute BLEU, ROUGE, CIDEr, METEOR and SPICE with pycocoevalcap.

    Args:
        hypotheses: List of predicted sentences.
        references: List of lists of ground-truth sentences.

    Returns:
        Dict with the keys ``BLEU-1`` .. ``BLEU-4``, ``ROUGE-L``, ``CIDEr``,
        ``METEOR`` and ``SPICE``, each holding the first element returned by
        the corresponding scorer's ``compute_score``.
    """
    gts_dict, res_dict = format_inputs_for_eval(hypotheses, references)

    # All scorers are created up front, then run in this order.
    scorers = [
        ("BLEU", Bleu(n=4)),
        ("ROUGE-L", Rouge()),
        ("CIDEr", Cider()),
        ("METEOR", Meteor()),
        ("SPICE", Spice()),
    ]

    overall = {}
    for label, scorer in scorers:
        score, _ = scorer.compute_score(gts=gts_dict, res=res_dict)
        overall[label] = score

    # The BLEU scorer yields one value per n-gram order (1 to 4).
    bleu = overall["BLEU"]
    return {
        "BLEU-1": bleu[0],
        "BLEU-2": bleu[1],
        "BLEU-3": bleu[2],
        "BLEU-4": bleu[3],
        "ROUGE-L": overall["ROUGE-L"],
        "CIDEr": overall["CIDEr"],
        "METEOR": overall["METEOR"],
        "SPICE": overall["SPICE"],
    }

In [18]:
# Score the regrouped hypotheses against their reference groups; the bare
# name on the last line makes the notebook display the resulting dict.
metrics = compute_metrics_with_pycocoevalcap(hypotheses, references)
metrics

Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
{'testlen': 6342, 'reflen': 6388, 'guess': [6342, 5784, 5226, 4668], 'correct': [4300, 2559, 1431, 811]}
ratio: 0.9927989981213223


{'BLEU-1': 0.6731195108466851,
 'BLEU-2': 0.5437409866639468,
 'BLEU-3': 0.43155363310131817,
 'BLEU-4': 0.343131884038921,
 'ROUGE-L': 0.5204369228167688,
 'CIDEr': 0.8127252747517392,
 'METEOR': 0.31941977403138855,
 'SPICE': 0.08287945439731345}