# Libraries

In [None]:
%pip install "git+https://github.com/salaniz/pycocoevalcap.git"
%reload_ext autoreload
%autoreload 2
# I often use the line magics - Stackoverflow user
import sys
import os
IN_COLAB = 'google.colab' in sys.modules
repo_token = None
if IN_COLAB:
  from google.colab import userdata
  repo_token = userdata.get('GITHUB_TOKEN')
repo_url = None
if repo_token is None: #use ssh, for local development
    repo_url = f'git+ssh://git@github.com/Dantsz/aiimgdetect.git'
else:
    repo_url = f'git+https://Dantsz:{repo_token}@github.com/Dantsz/aiimgdetect.git'

%pip install --upgrade {repo_url}

In [None]:
import torch
import torch.nn as nn
import os
%pip install loguru
from loguru import logger
import sys
import torch
# allow all messages
logger.remove()
logger_id = logger.add(sys.stderr, level="TRACE", colorize=True, format="<level>{level}</level>: {message} | {name}:{function}:{line} | {time:HH:mm:ss DD-MM-YYYY}")
IN_COLAB = 'google.colab' in sys.modules
logger.info("Colab? : {}", IN_COLAB)
if IN_COLAB:
  logger.info("Mounitng Google drive")
  from google.colab import drive
  drive.mount('/content/drive')
logger.info("Python version: {}", sys.version)
logger.info("Torch version: {}", torch.__version__)
logger.info("Cuda available? : {}", torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info("Running on {}", device)
%env KAGGLEHUB_CACHE=datasets
import kagglehub
import os
logger.info("Importing dataset to {}", os.environ["KAGGLEHUB_CACHE"])
path = kagglehub.dataset_download("clkmuhammed/microsoft-coco-2017-common-objects-in-context")
print("Path to dataset files:", path)
import sys
logger.remove(logger_id)
logger_id = logger.add(sys.stderr, level="WARNING", colorize=True, format="<level>{level}</level>: {message} | {name}:{function}:{line} | {time:HH:mm:ss DD-MM-YYYY}")

# Set up model

In [None]:
# Build the captioning model: a P3 encoder-decoder whose decoder wraps a
# GPT-2 block initialised from the pretrained 'gpt2' checkpoint.
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from adic_components.prototype2 import P2GPTBlock
from adic_components.prototype3 import P3ECDEC, P3Decoder
from adic_components.DyT import DyT
from adic_components.CaptionsDataset import add_bos_eos

# Reuse the EOS token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
gpt2_model_pretrained = GPT2Model.from_pretrained('gpt2')

# Get model config to know vocab size and hidden size
config = GPT2Config.from_pretrained('gpt2')
vocab_size = config.vocab_size
hidden_size = config.n_embd

gpt2 = P2GPTBlock(config)
# strict=False lets the load succeed even when the key sets differ, so report
# any mismatch instead of discarding it silently.
gpt2_load_result = gpt2.load_state_dict(gpt2_model_pretrained.state_dict(), strict=False)
if gpt2_load_result.missing_keys or gpt2_load_result.unexpected_keys:
    logger.warning(
        "GPT-2 weights loaded non-strictly: {} missing keys, {} unexpected keys",
        len(gpt2_load_result.missing_keys),
        len(gpt2_load_result.unexpected_keys),
    )

decoder = P3Decoder(config)
decoder.gpt2 = gpt2
# NOTE(review): 3, 224, 224 are presumably image channels, height and width — confirm against P3ECDEC.
encodeco = P3ECDEC(3, 224, 224, hidden_size, decoder)
COLAB_WEIGHTS_FILE = '/content/drive/MyDrive/prototype3_release12.pth'
LOCAL_WIEIGHTS_FILE = 'prototype3_release12.pth'
model = encodeco.to(device)

# Freeze the GPT-2 block inside the decoder; parameters outside it are left as they are.
for param in model.decoder.gpt2.parameters():
    param.requires_grad = False

In [None]:
# Optionally restore trained weights from a checkpoint. This is best-effort:
# if the checkpoint cannot be loaded, the failure is logged with its traceback
# and the model keeps the weights it was built with.
READ_WEIGHTS_FROM_FILE = True
if READ_WEIGHTS_FROM_FILE:
    # The checkpoint sits on the mounted drive on Colab, next to the notebook otherwise.
    weights_file = COLAB_WEIGHTS_FILE if IN_COLAB else LOCAL_WIEIGHTS_FILE
    logger.info("Loading model from file")
    try:
        # NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
        model.load_state_dict(torch.load(weights_file, map_location=device))
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still interrupts the cell;
        # logger.exception records the actual cause alongside the message.
        logger.exception("Loading model from file failed, going with default weights")
model = model.to(device)

# Set up dataset

In [None]:
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from torch.utils.data import DataLoader, Subset
from adic_components.CaptionsDataset import CaptionDatasetPyCOCO, augmentation_train_transform, default_tokenizer, augmentation_test_transform
import tqdm
import os
import json
# Caption annotations of the validation split, relative to the downloaded dataset root.
val_annotations_rel = 'annotations_trainval2017/captions_val2017.json'
val_path = os.path.join(path, val_annotations_rel)
print("Path to validation split: ", val_path)

In [None]:
# Validation dataset: images from the 'val2017' folder paired with the
# annotations at val_path, using the test-time transform and default tokenizer.
val_images_dir = os.path.join(path, 'val2017')
dataset = CaptionDatasetPyCOCO(
    images_dir=val_images_dir,
    json_path=val_path,
    transform=augmentation_test_transform,
    tokenizer=default_tokenizer,
)

# Generate predictions.json

In [None]:
# Generate one caption per dataset entry and write them to predictions.json
# as a list of {'image_id', 'caption'} records.
predictions = []
model.eval()

with torch.no_grad():
    for i in tqdm.tqdm(range(len(dataset))):
        # Only the image is needed here; the second element of the item is not used.
        img, _caption = dataset[i]
        # Named image_id (not id) so the builtin id() is not shadowed for later cells.
        image_id = dataset.get_image_id_by_index(i)
        img_pixel_values = img.to(device)
        # generate() is called on a batch of one image.
        decoder_output = model.generate(img_pixel_values.unsqueeze(0))
        generated = default_tokenizer.batch_decode(decoder_output.cpu().tolist(), skip_special_tokens=True)
        predictions.append({
            'image_id': image_id,
            'caption': generated[0]
        })

pred_file = 'predictions.json'
with open(pred_file, 'w') as f:
    json.dump(predictions, f)

# Benchmark

In [None]:
# Score the generated captions against the ground-truth annotations.
coco = COCO(val_path)  # ground-truth JSON file
cocoRes = coco.loadRes(pred_file)

cocoEval = COCOEvalCap(coco, cocoRes)
# Evaluate only the images that actually have a prediction, so a predictions
# file covering a subset of the split is scored instead of failing.
cocoEval.params['image_id'] = cocoRes.getImgIds()
cocoEval.evaluate()