In [1]:
%matplotlib inline
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
import matplotlib.pyplot as plt
import skimage.io as io
import pylab
# Default figure size (width, height) used by matplotlib for plots in this notebook.
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

import json
from json import encoder
# Override the json encoder module's FLOAT_REPR attribute with a formatter that
# renders floats with three decimal places.
# NOTE(review): on Python 3 the json encoder reportedly no longer consults this
# attribute, so this override may have no effect — confirm before relying on it.
encoder.FLOAT_REPR = lambda o: format(o, '.3f')

In [2]:
# Dataset root, dataset split, and the name of the captioning algorithm whose
# output files are addressed below.
dataDir = '.'
dataType = 'val2014'
algName = 'fakecap'

# Caption annotation file for the selected split.
annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)

# One JSON path under <dataDir>/results per entry in `subtypes`, unpacked in
# the same order as the list.
subtypes = ['results', 'evalImgs', 'eval']
resFile, evalImgsFile, evalFile = (
    '%s/results/captions_%s_%s_%s.json' % (dataDir, dataType, algName, suffix)
    for suffix in subtypes
)

# Run the helper script that fetches the Stanford models. The script is run
# through bash from the notebook's working directory; when the models are
# already present the cell output reports "Found Stanford CoreNLP.".
! bash get_stanford_models.sh

# Run the helper script that fetches the Google word2vec model. When it is
# already present the cell output reports "Found Google news word2vec model.".
! bash get_google_word2vec_model.sh

Found Stanford CoreNLP.
Found Google news word2vec model.


In [3]:
# Load the ground-truth annotations, then load each model's caption results
# file against them. The result objects are unpacked in the same order as the
# file list: base, flan-t5, vicuna, vit-gpt2, clip.
coco = COCO(annFile)

(cocoRes_base,
 cocoRes_tflan,
 cocoRes_vicuna,
 cocoRes_vit_gpt2,
 cocoRes_clip) = [
    coco.loadRes(resultsPath)
    for resultsPath in (
        resFile,
        'captions_instructblip_flan-t5_results.json',
        'captions_instructblip_vicuna_results.json',
        'captions_vit-gpt2-image-captioning_untrained.json',
        'captions_instructblip_clip_results.json',
    )
]

loading annotations into memory...
0:00:00.339247
creating index...
index created!
Loading and preparing results...     
DONE (t=0.03s)
creating index...
index created!
Loading and preparing results...     
DONE (t=0.02s)
creating index...
index created!
Loading and preparing results...     
DONE (t=0.02s)
creating index...
index created!
Loading and preparing results...     
DONE (t=0.02s)
creating index...
index created!
Loading and preparing results...     
DONE (t=0.02s)
creating index...
index created!


In [4]:
# Build the evaluator from the ground truth and the CLIP caption results.
cocoEval_clip = COCOEvalCap(coco, cocoRes_clip)

# Evaluate on a subset of images: only the image ids present in the CLIP
# results. Remove this override when evaluating the full validation set.
clip_image_ids = cocoRes_clip.getImgIds()
cocoEval_clip.params['image_id'] = clip_image_ids

# Run the evaluation.
# SPICE will take a few minutes the first time, but speeds up due to caching.
cocoEval_clip.evaluate()

tokenization...


setting up scorers...
computing Bleu score...
{'testlen': 2385, 'reflen': 2366, 'guess': [2385, 2135, 1885, 1635], 'correct': [1839, 1037, 497, 224]}
ratio: 1.0080304311069281
Bleu_1: 0.771
Bleu_2: 0.612
Bleu_3: 0.462
Bleu_4: 0.341
computing METEOR score...
METEOR: 0.278
computing Rouge score...
ROUGE_L: 0.571
computing CIDEr score...
CIDEr: 0.868
computing CIDEr-R score...
CIDEr-R: 0.873
computing SPICE score...
SPICE: 0.199


In [5]:
# Build the evaluator from the ground truth and the baseline caption results.
cocoEval_base = COCOEvalCap(coco, cocoRes_base)

# Evaluate on a subset of images: only the image ids present in the baseline
# results. Remove this override when evaluating the full validation set.
base_image_ids = cocoRes_base.getImgIds()
cocoEval_base.params['image_id'] = base_image_ids

# Run the evaluation.
# SPICE will take a few minutes the first time, but speeds up due to caching.
cocoEval_base.evaluate()

tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9893, 'reflen': 9855, 'guess': [9893, 8893, 7893, 6893], 'correct': [5732, 2510, 1043, 423]}
ratio: 1.003855910705124
Bleu_1: 0.579
Bleu_2: 0.404
Bleu_3: 0.279
Bleu_4: 0.191
computing METEOR score...
METEOR: 0.195
computing Rouge score...
ROUGE_L: 0.396
computing CIDEr score...
CIDEr: 0.505
computing CIDEr-R score...
CIDEr-R: 0.523
computing SPICE score...
SPICE: 0.133


In [6]:
# Build the evaluator from the ground truth and the flan-t5 caption results.
cocoEval_tflan = COCOEvalCap(coco, cocoRes_tflan)

# Evaluate on a subset of images: only the image ids present in the flan-t5
# results. Remove this override when evaluating the full validation set.
tflan_image_ids = cocoRes_tflan.getImgIds()
cocoEval_tflan.params['image_id'] = tflan_image_ids

# Run the evaluation.
# SPICE will take a few minutes the first time, but speeds up due to caching.
cocoEval_tflan.evaluate()

tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 2459, 'reflen': 2437, 'guess': [2459, 2209, 1959, 1709], 'correct': [2020, 1164, 609, 284]}
ratio: 1.0090274928186258
Bleu_1: 0.821
Bleu_2: 0.658
Bleu_3: 0.512
Bleu_4: 0.387
computing METEOR score...
METEOR: 0.297
computing Rouge score...
ROUGE_L: 0.588
computing CIDEr score...
CIDEr: 0.977
computing CIDEr-R score...
CIDEr-R: 0.993
computing SPICE score...
SPICE: 0.233


In [7]:
# Build the evaluator from the ground truth and the Vicuna caption results.
cocoEval_vicuna = COCOEvalCap(coco, cocoRes_vicuna)

# Evaluate on a subset of images: only the image ids present in the Vicuna
# results. Remove this override when evaluating the full validation set.
vicuna_image_ids = cocoRes_vicuna.getImgIds()
cocoEval_vicuna.params['image_id'] = vicuna_image_ids

# Run the evaluation.
# SPICE will take a few minutes the first time, but speeds up due to caching.
cocoEval_vicuna.evaluate()

tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 2854, 'reflen': 2719, 'guess': [2854, 2604, 2354, 2104], 'correct': [2131, 1277, 701, 344]}
ratio: 1.0496506068403642
Bleu_1: 0.747
Bleu_2: 0.605
Bleu_3: 0.478
Bleu_4: 0.365
computing METEOR score...
METEOR: 0.305
computing Rouge score...
ROUGE_L: 0.570
computing CIDEr score...
CIDEr: 0.952
computing CIDEr-R score...
CIDEr-R: 0.969
computing SPICE score...
SPICE: 0.241


In [8]:
# Build the evaluator from the ground truth and the ViT-gpt2 caption results.
cocoEval_vit_gpt2 = COCOEvalCap(coco, cocoRes_vit_gpt2)

# Evaluate on a subset of images: only the image ids present in the ViT-gpt2
# results. Remove this override when evaluating the full validation set.
vit_gpt2_image_ids = cocoRes_vit_gpt2.getImgIds()
cocoEval_vit_gpt2.params['image_id'] = vit_gpt2_image_ids

# Run the evaluation.
# SPICE will take a few minutes the first time, but speeds up due to caching.
cocoEval_vit_gpt2.evaluate()

tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 2874, 'reflen': 2706, 'guess': [2874, 2624, 2374, 2124], 'correct': [1971, 980, 459, 217]}
ratio: 1.062084257205816
Bleu_1: 0.686
Bleu_2: 0.506
Bleu_3: 0.367
Bleu_4: 0.267
computing METEOR score...
METEOR: 0.244
computing Rouge score...
ROUGE_L: 0.507
computing CIDEr score...
CIDEr: 0.732
computing CIDEr-R score...
CIDEr-R: 0.765
computing SPICE score...
SPICE: 0.165


In [9]:
# Print the evaluation scores of every model, one labelled section per model.
# Each entry pairs the section heading with the evaluator whose `eval` mapping
# (metric name -> score) is printed under it; the list order is the print order.
labeled_evals = [
    ("Base results", cocoEval_base),
    ("flan-t5-XL results", cocoEval_tflan),
    ("Vicuna results", cocoEval_vicuna),
    ("ViT-gpt2 results", cocoEval_vit_gpt2),
    ("CLIP results", cocoEval_clip),
]

for position, (label, evaluator) in enumerate(labeled_evals):
    # The separator goes between sections only, never before the first one.
    if position > 0:
        print("**************************************************************************")
    print(label)
    for metric, score in evaluator.eval.items():
        print('%s: %.3f'%(metric, score))

Base results
Bleu_1: 0.579
Bleu_2: 0.404
Bleu_3: 0.279
Bleu_4: 0.191
METEOR: 0.195
ROUGE_L: 0.396
CIDEr: 0.505
CIDEr-R: 0.523
SPICE: 0.133
**************************************************************************
flan-t5-XL results
Bleu_1: 0.821
Bleu_2: 0.658
Bleu_3: 0.512
Bleu_4: 0.387
METEOR: 0.297
ROUGE_L: 0.588
CIDEr: 0.977
CIDEr-R: 0.993
SPICE: 0.233
**************************************************************************
Vicuna results
Bleu_1: 0.747
Bleu_2: 0.605
Bleu_3: 0.478
Bleu_4: 0.365
METEOR: 0.305
ROUGE_L: 0.570
CIDEr: 0.952
CIDEr-R: 0.969
SPICE: 0.241
**************************************************************************
ViT-gpt2 results
Bleu_1: 0.686
Bleu_2: 0.506
Bleu_3: 0.367
Bleu_4: 0.267
METEOR: 0.244
ROUGE_L: 0.507
CIDEr: 0.732
CIDEr-R: 0.765
SPICE: 0.165
**************************************************************************
CLIP results
Bleu_1: 0.771
Bleu_2: 0.612
Bleu_3: 0.462
Bleu_4: 0.341
METEOR: 0.278
ROUGE_L: 0.571
CIDEr: 0.868
CIDEr-R: 0.873
SPICE: