### Evaluate the answers from LLaVA-NeXT to understand the following

- Baseline: Single vs 2-image accuracy
- Perturbed images + labels unchanged: Single vs 2-image accuracy
- Perturbed images + new labels: Single vs 2-image accuracy

In [1]:
import os
import json
import jsonlines
import os.path as osp

# Directory that holds every input file read by this notebook.
disk_root = "/mnt/disks/data/webqa"

# LLaVA-NeXT prediction files (train and val).
llavan_train_color_path = osp.join(disk_root, "llavanext_webqa_ptb_train_color.jsonl")
llavan_val_color_path = osp.join(disk_root, "llavanext_webqa_ptb_val_color.jsonl")

# Candidate questions to evaluate.
candidate_path = osp.join(disk_root, "ptb_color_gpt_validated.json")

with open(candidate_path, 'r') as candidate_file:
    eval_data = json.load(candidate_file)

# Merge both prediction files into one mapping. Only the first key/value
# pair of each JSONL record is kept; a key seen again overwrites the earlier one.
predictions = {}
for prediction_path in (llavan_train_color_path, llavan_val_color_path):
    with jsonlines.open(prediction_path, 'r') as reader:
        for record in reader:
            first_key, first_value = list(record.items())[0]
            predictions[first_key] = first_value

In [2]:
# Sanity check: collect every key of eval_data that has no entry in predictions.
remaining = {qid for qid in eval_data.keys() if qid not in predictions}

print(len(remaining))

0


In [6]:
from eval_1022 import compute_vqa_metrics, color_set, shape_set, yesno_set
import numpy as np

def webqa_accuracy(answer, label, Qcate):
    """Score one answer with ``compute_vqa_metrics`` for a given question category.

    Args:
        answer: The predicted answer; it is passed to the metric wrapped in a
            one-element list.
        label: Indexable collection of reference answers. Only ``label[0]``
            is scored.
        Qcate: Question category. The recognised values are ``'color'``,
            ``'shape'``, ``'yesno'`` and ``'number'`` (exact match, so callers
            lower-case it first).

    Returns:
        The tuple ``(F1_avg, F1_max, EM, RE_avg, PR_avg)`` unpacked from
        ``compute_vqa_metrics``, or ``None`` when ``Qcate`` is not one of the
        recognised categories.
    """
    # Guard clause: anything outside the four known categories is not scored.
    if Qcate not in ('color', 'shape', 'yesno', 'number'):
        return None

    # Pick the keyword domain handed to the metric for this category.
    if Qcate == 'number':
        domain = {"NUMBER"}
    else:
        domain = {'color': color_set, 'shape': shape_set, 'yesno': yesno_set}[Qcate]

    f1_avg, f1_max, em, re_avg, pr_avg = compute_vqa_metrics([answer], label[0], "", domain)
    return (f1_avg, f1_max, em, re_avg, pr_avg)

def accuracy_agg_results(qa_results, data=None):
    """Aggregate per-question scores into single-image, two-image and overall means.

    The value that is averaged is the last element of each score tuple, i.e.
    the one ``webqa_accuracy`` returns under the name ``PR_avg``.

    Args:
        qa_results: Mapping from question key to a 5-tuple
            ``(F1_avg, F1_max, EM, RE_avg, PR_avg)``, or to ``None`` for a
            question that was not scored (``webqa_accuracy`` returns ``None``
            for unsupported categories). ``None`` entries are left out of
            every mean instead of raising while unpacking.
        data: Optional mapping from question key to a record that has an
            ``'img_posFacts'`` sequence; its length decides whether the
            question counts as single-image (1) or two-image (2). Defaults to
            the notebook-level ``eval_data``.

    Returns:
        ``(single_acc, two_image_acc, avr_acc)``. A group that contains no
        scored question yields ``nan``.

    Raises:
        KeyError: If a scored key of ``qa_results`` is missing from ``data``.
    """
    if data is None:
        data = eval_data

    def _mean_or_nan(scores):
        # np.mean([]) also gives nan but emits a RuntimeWarning; be explicit.
        return np.mean(scores) if scores else float("nan")

    # Single pass: bucket each score by image count instead of building key
    # lists and testing membership in them for every entry.
    single_scores, two_image_scores, all_scores = [], [], []
    for key, metrics in qa_results.items():
        if metrics is None:
            continue
        _, _, _, _, pr_avg = metrics
        all_scores.append(pr_avg)

        n_images = len(data[key]['img_posFacts'])
        if n_images == 1:
            single_scores.append(pr_avg)
        elif n_images == 2:
            two_image_scores.append(pr_avg)

    return (_mean_or_nan(single_scores), _mean_or_nan(two_image_scores), _mean_or_nan(all_scores))

##### LLaVA-NeXT baseline on the original images of the questions that have perturbations

In [8]:
# Baseline: score the model's answer on the original question against the
# original label, one result tuple per question key.
llavan_results_baseline = {}
for k, entry in eval_data.items():
    answer = predictions[k]['llava_A']
    # Keep the model answer next to the question record.
    entry['A_llavanext'] = answer
    llavan_results_baseline[k] = webqa_accuracy(answer, entry['A'], entry['Qcate'].lower())

print(accuracy_agg_results(llavan_results_baseline))


(0.8095760233918129, 0.8744252873563219, 0.8227272727272728)


#### Correct despite perturbation

In [17]:
# For every perturbed variant of a question, score the model's answer twice:
# once against the original label and once against the label stored for that
# variant in A_perturbed.
llavan_results_perturbed_original_label = {}
llavan_results_perturbed_generated_label = {}

for k, entry in eval_data.items():
    # These do not change across the variants of one question, so read them once.
    original_label = entry['A']
    Qcate = entry['Qcate'].lower()

    entry['A_perturbed_llavanext'] = {}
    llavan_results_perturbed_original_label[k] = {}
    llavan_results_perturbed_generated_label[k] = {}
    for idx, label in entry['A_perturbed'].items():
        # The A_perturbed key, converted to int, indexes the prediction list.
        answer = predictions[k]['llava_A_perturbed'][int(idx)]
        entry['A_perturbed_llavanext'][idx] = answer
        llavan_results_perturbed_original_label[k][idx] = webqa_accuracy(answer, original_label, Qcate)
        # Wrapped in a list because webqa_accuracy scores label[0].
        llavan_results_perturbed_generated_label[k][idx] = webqa_accuracy(answer, [label], Qcate)

     

In [18]:
def accuracy_agg_generated_results(qa_results, data=None):
    """Aggregate per-variant scores into single-image, two-image and overall means.

    Every scored variant contributes one value (the last element of its score
    tuple, ``PR_avg``), so a question with several variants weighs more than a
    question with one.

    Args:
        qa_results: Mapping from question key to a mapping of variant index
            to a 5-tuple ``(F1_avg, F1_max, EM, RE_avg, PR_avg)``, or to
            ``None`` for a variant that was not scored (``webqa_accuracy``
            returns ``None`` for unsupported categories). ``None`` entries
            are left out of every mean instead of raising while unpacking.
        data: Optional mapping from question key to a record that has an
            ``'img_posFacts'`` sequence; its length decides whether the
            question counts as single-image (1) or two-image (2). Defaults to
            the notebook-level ``eval_data``.

    Returns:
        ``(single_acc, two_image_acc, avr_acc)``. A group that contains no
        scored variant yields ``nan``.

    Raises:
        KeyError: If a key of ``qa_results`` is missing from ``data``.
    """
    if data is None:
        data = eval_data

    def _mean_or_nan(scores):
        # np.mean([]) also gives nan but emits a RuntimeWarning; be explicit.
        return np.mean(scores) if scores else float("nan")

    # Single pass that buckets scores by image count; the loop variable no
    # longer shadows the builtin `dict`.
    single_scores, two_image_scores, all_scores = [], [], []
    for key, per_variant in qa_results.items():
        n_images = len(data[key]['img_posFacts'])
        for metrics in per_variant.values():
            if metrics is None:
                continue
            _, _, _, _, pr_avg = metrics
            all_scores.append(pr_avg)
            if n_images == 1:
                single_scores.append(pr_avg)
            elif n_images == 2:
                two_image_scores.append(pr_avg)

    return (_mean_or_nan(single_scores), _mean_or_nan(two_image_scores), _mean_or_nan(all_scores))

# First line: perturbed answers scored against the original labels.
# Second line: perturbed answers scored against the per-variant labels.
for perturbed_results in (llavan_results_perturbed_original_label,
                          llavan_results_perturbed_generated_label):
    print(accuracy_agg_generated_results(perturbed_results))

(0.13804713804713803, 0.5819277108433735, 0.22382615444315093)
(0.7188552188552187, 0.34508032128514055, 0.6466239813736904)


#### TODO: Blank predictions

In [20]:
import pandas as pd
exp_name = "llavanext"
baseline_accs = accuracy_agg_results(llavan_results_baseline)
perturbed_original_label_accs = accuracy_agg_generated_results(llavan_results_perturbed_original_label)
perturbed_generated_label_acc = accuracy_agg_generated_results(llavan_results_perturbed_generated_label)

# One row per experiment; each accuracy tuple is (single_image, two_image, average).
columns = ['experiment name', 'single_image', 'two_image', 'average']
experiment_rows = [
    ('baseline', *baseline_accs),
    ('perturbed_original_label', *perturbed_original_label_accs),
    ('perturbed_generated_label', *perturbed_generated_label_acc),
]
accuracy_agg_df = pd.DataFrame(experiment_rows, columns=columns)

# The path must be an f-string: without the prefix the file was written under
# the literal name "{exp_name}.csv" instead of "llavanext.csv".
os.makedirs("results", exist_ok=True)  # to_csv does not create missing directories
accuracy_agg_df.to_csv(f"results/{exp_name}.csv", index=False)
accuracy_agg_df

Unnamed: 0,experiment name,single_image,two_image,average
0,baseline,0.809576,0.874425,0.822727
1,perturbed_original_label,0.138047,0.581928,0.223826
2,perturbed_generated_label,0.718855,0.34508,0.646624
