In [7]:
import json
from eval_datasets import CaptionDataset
from aac_metrics.functional import cider_d, bleu_4, rouge_l
from aac_metrics.utils.tokenization import preprocess_mono_sents, preprocess_mult_sents

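# Earlier version, kept commented out for reference. It expects data["outputs"] to be a
# dict keyed by dataset id and strips the leading data["query"] prefix from each output.
# def get_metrics_captioning(res_file, annotations_path, image_dir_path):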
#     with open(res_file) as f:
#         data = json.load(f)
    
#     model_outputs = {}
#     for k,item in data["outputs"].items():
#         model_outputs[k] = item[len(data["query"]):]
    
#     dataset = CaptionDataset(image_dir_path, annotations_path)
#     hyps, refs = [], []
#     for k,v in model_outputs.items():
#         hyps.append(v.replace('\n',' '))
#         refs.append(dataset.data_dict[int(k)]["captions"])

#     candidates = preprocess_mono_sents(hyps)
#     mult_references = preprocess_mult_sents(refs)
    
#     cider_scores, _ = cider_d(candidates, mult_references)
#     bleu_scores, _  = bleu_4(candidates, mult_references)
#     rouge_scores, _  = rouge_l(candidates, mult_references)
#     print("Corpus BLEU Score:", bleu_scores)
#     print("Corpus ROUGE Scores:", rouge_scores)
#     print("Corpus Cider Scores:", cider_scores)

def get_metrics_captioning(res_file, annotations_path, image_dir_path):
    """Print corpus-level BLEU-4, ROUGE-L and CIDEr-D for a captioning result file.

    Args:
        res_file: Path to a JSON file whose "outputs" entry is iterated in
            order; every element is treated as one generated caption string.
        annotations_path: Forwarded to ``CaptionDataset`` as its second argument.
        image_dir_path: Forwarded to ``CaptionDataset`` as its first argument.

    The i-th output is scored against the "captions" of
    ``get_item_with_idx(i)``, so outputs are assumed to be stored in dataset
    index order -- TODO confirm against the script that wrote the file.

    Returns:
        None. The three corpus-score dicts are printed.
    """
    with open(res_file) as handle:
        results = json.load(handle)

    predictions = list(results["outputs"])

    caption_data = CaptionDataset(image_dir_path, annotations_path)

    hyps = []
    refs = []
    for idx in range(len(predictions)):
        # Newlines are flattened to spaces in the hypothesis and in every reference.
        hyps.append(predictions[idx].replace('\n', ' '))
        gold_captions = caption_data.get_item_with_idx(idx)["captions"]
        refs.append([caption.replace('\n', ' ') for caption in gold_captions])

    candidates = preprocess_mono_sents(hyps)
    mult_references = preprocess_mult_sents(refs)

    # Each metric returns (corpus_scores, sentence_scores); only the corpus part is used.
    cider_scores, _cider_per_sent = cider_d(candidates, mult_references)
    bleu_scores, _bleu_per_sent = bleu_4(candidates, mult_references)
    rouge_scores, _rouge_per_sent = rouge_l(candidates, mult_references)

    print("Corpus BLEU Score:", bleu_scores)
    print("Corpus ROUGE Scores:", rouge_scores)
    print("Corpus Cider Scores:", cider_scores)

### Random In-context demonstrations

In [8]:
# 0-shot result file from the "random-examples" set (see the file name).
res_file = "/home/asureddy_umass_edu/cs682/flamingo/results/captioning/OpenFlamingo-3B-vitl-mpt1b_0-shot_random-examples.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
# Prints corpus BLEU-4, ROUGE-L and CIDEr-D for this result file.
get_metrics_captioning(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: {'bleu_4': tensor(0.0324, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.2225, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.1288, dtype=torch.float64)}


In [10]:
# 1-shot result file from the "random-examples" set (see the file name).
res_file = "/home/asureddy_umass_edu/cs682/flamingo/results/captioning/OpenFlamingo-3B-vitl-mpt1b_1-shot_random-examples.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
# Prints corpus BLEU-4, ROUGE-L and CIDEr-D for this result file.
get_metrics_captioning(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: {'bleu_4': tensor(0.0945, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.3238, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.2331, dtype=torch.float64)}


In [11]:
# 2-shot result file from the "random-examples" set (see the file name).
res_file = "/home/asureddy_umass_edu/cs682/flamingo/results/captioning/OpenFlamingo-3B-vitl-mpt1b_2-shot_random-examples.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
# Prints corpus BLEU-4, ROUGE-L and CIDEr-D for this result file.
get_metrics_captioning(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: {'bleu_4': tensor(0.1307, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.3844, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.3318, dtype=torch.float64)}


In [9]:
# 4-shot result file from the "random-examples" set (see the file name).
# NOTE(review): this cell's execution count (9) is lower than the 1- and 2-shot cells
# above it, so the notebook was not run top to bottom.
res_file = "/home/asureddy_umass_edu/cs682/flamingo/results/captioning/OpenFlamingo-3B-vitl-mpt1b_4-shot_random-examples.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
# Prints corpus BLEU-4, ROUGE-L and CIDEr-D for this result file.
get_metrics_captioning(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: {'bleu_4': tensor(0.1440, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.3961, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.3558, dtype=torch.float64)}


In [12]:
# 8-shot result file from the "random-examples" set (see the file name).
res_file = "/home/asureddy_umass_edu/cs682/flamingo/results/captioning/OpenFlamingo-3B-vitl-mpt1b_8-shot_random-examples.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
# Prints corpus BLEU-4, ROUGE-L and CIDEr-D for this result file.
get_metrics_captioning(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: {'bleu_4': tensor(0.1539, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.4065, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.3749, dtype=torch.float64)}


### RICE based in-context examples

In [13]:
# Annotation file and image directory reused by the RICE evaluation loop in the next cell.
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"

In [14]:
# Score the 1-, 2-, 4- and 8-shot result files whose names carry no "random-examples"
# suffix; the printed banner labels them as the RICE runs.
# Relies on annotations_path / image_dir_path set in an earlier cell.
for i in [1,2,4,8]:
    res_file = f"/home/asureddy_umass_edu/cs682/flamingo/results/captioning/OpenFlamingo-3B-vitl-mpt1b_{i}-shot.json"
    print(f"\n{i}-shots ICL with RICE\n")
    get_metrics_captioning(res_file, annotations_path, image_dir_path)


1-shots ICL with RICE

Corpus BLEU Score: {'bleu_4': tensor(0.1042, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.3500, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.2251, dtype=torch.float64)}

2-shots ICL with RICE

Corpus BLEU Score: {'bleu_4': tensor(0.1437, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.4019, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.3358, dtype=torch.float64)}

4-shots ICL with RICE

Corpus BLEU Score: {'bleu_4': tensor(0.1713, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.4275, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.4084, dtype=torch.float64)}

8-shots ICL with RICE

Corpus BLEU Score: {'bleu_4': tensor(0.1892, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.4402, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.4414, dtype=torch.float64)}


In [None]:
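# rice based (the 0-shot row is copied from the 0-shot "random-examples" run above;
# the RICE loop itself only covers 1, 2, 4 and 8 shots)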
# n-shots,bleu,rouge,cider
# 0,0.0324,0.2225,0.1288
# 1,0.1042,0.3500,0.2251
# 2,0.1437,0.4019,0.3358
# 4,0.1713,0.4275,0.4084
# 8,0.1892,0.4402,0.4414


In [None]:
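# random
# n-shots,bleu,rouge,cider
# 0,0.0324,0.2225,0.1288
# 1,0.0945,0.3238,0.2331
# 2,0.1307,0.3844,0.3318
# 4,0.1440,0.3961,0.3558
# 8,0.1539,0.4065,0.3749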

In [44]:
# NOTE(review): get_metrics is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel. It presumably came from a since-deleted cell --
# confirm which implementation produced the output below before re-running.
res_file = "/home/asureddy_umass_edu/cs682/flamingo/flamingo-3b-coco-one-shot.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
get_metrics(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: {'bleu_4': tensor(0.1012, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.3440, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.4058, dtype=torch.float64)}


In [45]:
# NOTE(review): get_metrics is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel. It presumably came from a since-deleted cell --
# confirm which implementation produced the output below before re-running.
res_file = "/home/asureddy_umass_edu/cs682/flamingo/flamingo-3b-coco-ZS.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
get_metrics(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: {'bleu_4': tensor(0.0532, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.2362, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.2325, dtype=torch.float64)}


In [46]:
# NOTE(review): get_metrics is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel. It presumably came from a since-deleted cell --
# confirm which implementation produced the output below before re-running.
res_file = "/home/asureddy_umass_edu/cs682/flamingo/flamingo-3b-coco-two-shot.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
get_metrics(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: {'bleu_4': tensor(0.0899, dtype=torch.float64)}
Corpus ROUGE Scores: {'rouge_l': tensor(0.3341, dtype=torch.float64)}
Corpus Cider Scores: {'cider_d': tensor(0.4238, dtype=torch.float64)}


In [24]:
# old
# NOTE(review): get_metrics is not defined in any visible cell. The output recorded for
# this cell (a plain float BLEU and a rouge1/rouge2/rougeL dict) has a different shape
# from the tensor-valued outputs of the get_metrics cells above, which suggests an
# earlier implementation was in scope when this ran -- verify.
res_file = "/home/asureddy_umass_edu/cs682/flamingo/flamingo-3b-coco-ZS.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
get_metrics(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: 0.059729001580259705
Corpus ROUGE Scores: {'rouge1': 0.010141944522348787, 'rouge2': 0.0002675585284280936, 'rougeL': 0.010141944522348787}


In [25]:
# NOTE(review): get_metrics is not defined in any visible cell. The output recorded for
# this cell (a plain float BLEU and a rouge1/rouge2/rougeL dict) has a different shape
# from the tensor-valued outputs of the get_metrics cells above, which suggests an
# earlier implementation was in scope when this ran -- verify.
res_file = "/home/asureddy_umass_edu/cs682/flamingo/flamingo-3b-coco-two-shot.json"
annotations_path = "../dataset/annotations/captions_val2017.json"
image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
get_metrics(res_file, annotations_path, image_dir_path)

Corpus BLEU Score: 0.08197111291929228
Corpus ROUGE Scores: {'rouge1': 0.014435707489431725, 'rouge2': 9.020386072523904e-05, 'rougeL': 0.014435707489431725}


In [27]:
# Sanity check of the metric helpers: the hypothesis for every item is that item's own
# first reference caption, not the model output.
# Relies on res_file, image_dir_path and annotations_path left over from an earlier cell.
# NOTE(review): compute_corpus_bleu and compute_corpus_rouge are not defined in any
# visible cell; this cell fails on a fresh kernel.
with open(res_file) as f:
    data = json.load(f)

# Here "outputs" is read as a dict keyed by dataset id; each value has the leading
# data["query"] prefix sliced off by length.
model_outputs = {}
for k,item in data["outputs"].items():
    model_outputs[k] = item[len(data["query"]):]

dataset = CaptionDataset(image_dir_path, annotations_path)
hyps, refs = [], []
for k,v in model_outputs.items():
    # v (the model output) is deliberately unused: only the key selects the item.
    hyps.append(dataset.data_dict[int(k)]["captions"][0])
    refs.append(dataset.data_dict[int(k)]["captions"])

bleu_score = compute_corpus_bleu(refs, hyps)
rouge_scores = compute_corpus_rouge(refs, hyps)

print("Corpus BLEU Score:", bleu_score)
print("Corpus ROUGE Scores:", rouge_scores)

Corpus BLEU Score: 1.0
Corpus ROUGE Scores: {'rouge1': 0.014656737642366828, 'rouge2': 0.0, 'rougeL': 0.014656737642366828}


In [33]:
# Toy example: run aac_metrics.evaluate on two candidates with raw (unpreprocessed) strings.
from aac_metrics import evaluate

candidates: list[str] = ["a man is speaking", "rain falls"]
mult_references: list[list[str]] = [["a man speaks.", "someone speaks.", "a man is speaking while a bird is chirping in the background"], ["rain is falling hard on a surface"]]

# evaluate returns a pair; only the first element (corpus-level scores) is printed.
corpus_scores, _ = evaluate(candidates, mult_references)
print(corpus_scores)
# dict containing the score of each metric: "bleu_1", "bleu_2", "bleu_3", "bleu_4", "rouge_l", "meteor", "cider_d", "spice", "spider"
# {"bleu_1": tensor(0.4278), "bleu_2": ..., ...}

{'bleu_1': tensor(0.4278, dtype=torch.float64), 'bleu_2': tensor(0.4059, dtype=torch.float64), 'bleu_3': tensor(0.4390, dtype=torch.float64), 'bleu_4': tensor(0.4565, dtype=torch.float64), 'meteor': tensor(0.2634, dtype=torch.float64), 'rouge_l': tensor(0.4871, dtype=torch.float64), 'cider_d': tensor(0.9614, dtype=torch.float64), 'spice': tensor(0.4222, dtype=torch.float64), 'spider': tensor(0.6918, dtype=torch.float64)}


In [38]:
# Toy example: preprocess the sentences explicitly, then compute CIDEr-D alone.
# These imports repeat the ones in the first cell.
from aac_metrics.functional import cider_d, bleu_4, rouge_l
from aac_metrics.utils.tokenization import preprocess_mono_sents, preprocess_mult_sents

candidates: list[str] = ["a man is speaking", "rain falls"]
mult_references: list[list[str]] = [["a man speaks.", "someone speaks.", "a man is speaking while a bird is chirping in the background"], ["rain is falling hard on a surface"]]

# Rebinds both names to their preprocessed form; later cells see the preprocessed values.
candidates = preprocess_mono_sents(candidates)
mult_references = preprocess_mult_sents(mult_references)

corpus_scores, sents_scores = cider_d(candidates, mult_references)
print(corpus_scores)
# {"cider_d": tensor(0.9614)}
print(sents_scores)
# {"cider_d": tensor([1.3641, 0.5587])}

{'cider_d': tensor(0.9614, dtype=torch.float64)}
{'cider_d': tensor([1.3641, 0.5587], dtype=torch.float64)}


In [37]:
# NOTE(review): compute_corpus_bleu is not defined in any visible cell. This cell's
# execution count (37) is lower than the preprocessing cell above it (38), so the
# inputs it saw may not be the preprocessed ones -- verify before comparing with bleu_4.
compute_corpus_bleu(mult_references, candidates)

0.34686464535871

In [39]:
# BLEU-4 on the same toy candidates/references; relies on the names set in an earlier cell.
corpus_scores, sents_scores = bleu_4(candidates, mult_references)
print(corpus_scores)
# {"bleu_4": tensor(0.4565)}
print(sents_scores)
# {"bleu_4": tensor([1.0000e+00, 1.2275e-08])}

{'bleu_4': tensor(0.4565, dtype=torch.float64)}
{'bleu_4': tensor([1.0000e+00, 1.2275e-08], dtype=torch.float64)}
