In [8]:
import argparse
import json
import collections
import random
import pandas as pd    
from nltk.translate.bleu_score import sentence_bleu
from eval_metrics.evaluate_metrics import calculate_exactmatch, calculate_f1score, bleu, calculate_appearance_with_normalization
from tabulate import tabulate
from eval_metrics.glossary import *

def evaluate(gt, pred, return_pred=False):
    """Score open-ended predictions against ground truth and print a summary table.

    Items of ``gt`` and ``pred`` are paired positionally with ``zip``. Every
    pair is normalized, but only pairs whose ground-truth ``answer_type`` is
    ``'OPEN'`` are scored.

    Args:
        gt: Iterable of ground-truth dicts. Each needs an ``'answer_type'`` key
            and a ``'conversations'`` list (the misspelled key
            ``'conversatons'`` is accepted as a fallback) whose second entry's
            ``'value'`` is the reference answer.
        pred: Iterable of prediction dicts with a ``'text'`` key, plus a
            ``'question_id'`` key for OPEN questions. Mutated in place: a
            ``'gt'`` key holding the raw reference answer is added to every
            item that does not already have one.
        return_pred: When true, also return ``pred`` (including the added
            ``'gt'`` keys).

    Returns:
        A list with one dict per OPEN question (keys ``question_id``,
        ``exact_match``, ``f1``, ``precision``, ``recall``, ``bleu``,
        ``bleu_1``, ``bleu_2``, ``bleu_3``), sorted by ascending F1. The whole
        list is returned, not a truncated "top N". When ``return_pred`` is
        true the result is the tuple ``(that_list, pred)``.

    Note:
        ``sentence_bleu`` is called without a smoothing function, so the
        default-weight ``'bleu'`` value is close to zero for short answers
        that have no higher-order n-grams, even on an exact match.
    """
    question_analysis = []  # One detailed score record per OPEN question.

    for gt_item, pred_item in zip(gt, pred):
        try:
            gt_results = gt_item['conversations']
        except KeyError:
            # Fall back to the misspelled key when 'conversations' is absent.
            gt_results = gt_item['conversatons']
        reference_answer = gt_results[1]['value']

        # Later notebook cells read pred_item['gt'], so attach it when missing.
        if 'gt' not in pred_item:
            pred_item['gt'] = reference_answer

        gt_value = reference_answer.lower()
        pred_value = pred_item['text'].lower()
        if pred_value.startswith('assistant:'):
            pred_value = pred_value[len('assistant:'):].strip()

        gt_value = normalize_word(gt_value)
        pred_value = normalize_word(pred_value)

        if gt_item['answer_type'] != 'OPEN':
            continue

        question_id = pred_item['question_id']
        exact_match = calculate_exactmatch(pred_value, gt_value)
        f1, precision, recall = calculate_f1score(pred_value, gt_value)

        # Tokenize once and reuse for all four BLEU variants.
        references = [str(gt_value).split()]
        hypothesis = str(pred_value).split()

        question_analysis.append({
            'question_id': question_id,
            'exact_match': exact_match,
            'f1': f1,
            'precision': precision,
            'recall': recall,
            # Default weights for 'bleu'; the numbered variants put the whole
            # weight on a single n-gram order (1, 2 or 3).
            'bleu': sentence_bleu(references=references, hypothesis=hypothesis),
            'bleu_1': sentence_bleu(references=references, hypothesis=hypothesis, weights=(1, 0, 0, 0)),
            'bleu_2': sentence_bleu(references=references, hypothesis=hypothesis, weights=(0, 1, 0, 0)),
            'bleu_3': sentence_bleu(references=references, hypothesis=hypothesis, weights=(0, 0, 1, 0)),
        })

    num_open = len(question_analysis)

    def _mean(metric):
        # Average one metric over all scored questions; 0 when none were scored.
        if not num_open:
            return 0
        return sum(record[metric] for record in question_analysis) / num_open

    # (table label, per-question record key) in display order.
    summary_layout = [
        ('exact match score', 'exact_match'),
        ('f1 score', 'f1'),
        ('precision', 'precision'),
        ('recall', 'recall'),
        ('bleu_score', 'bleu'),
        ('bleu_score_1', 'bleu_1'),
        ('bleu_score_2', 'bleu_2'),
        ('bleu_score_3', 'bleu_3'),
    ]

    # Print summary metrics as percentages.
    print(f'num_open {num_open}')
    print(tabulate(
        [[label, _mean(metric) * 100] for label, metric in summary_layout],
        headers=['Metric', 'Performance']
    ))

    # Worst-scoring questions first; the full list is returned.
    low_performance_questions = sorted(question_analysis, key=lambda x: x['f1'])
    if return_pred:
        return low_performance_questions, pred
    return low_performance_questions

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as reader:
        # Blank or whitespace-only lines (e.g. an extra newline at the end of
        # the file) hold no record; skip them so json.loads does not fail.
        return [json.loads(line) for line in reader if line.strip()]

## 1. LoRA tuning results

In [27]:
# Baseline run: score the LoRA-tuned model's predictions against the
# organ/test_lung.json ground-truth file.
visual_enhance_ratio = 0.08
bbox_ratio = 0.03
epoch_num = 6
ROOT_PATH = "/data/aofei"
dataset = "Slake"

# NOTE: `dir` shadows the builtin of the same name; within this cell it is only
# referenced by the commented-out pred_file template below.
dir = f"llava_med/moe_img_dense_all_query/all_expert_8_16_rank16/lora_{visual_enhance_ratio}_bbox_{bbox_ratio}/epoch{epoch_num}"
# gt_file = f"{ROOT_PATH}/hallucination/{dataset}/data/test.json"
gt_file = f"{ROOT_PATH}/hallucination/{dataset}/data/organ/test_lung.json"
# pred_file = f"{ROOT_PATH}/hallucination/mitigation/{dataset}/{dir}/inference/pred.jsonl"

pred_file = "/data/aofei/hallucination/mitigation/Slake/llava_med/organ_lung/lora/epoch9_seed4/inference/pred.jsonl"
# bv_pred_path = "/data/aofei/hallucination/mitigation/Slake/llava_med/organ_lung/bbox_0.1/epoch9_seed4/inference/pred_beam.jsonl"

# From here on `dataset` holds the ground-truth file's parent directory name.
dataset = gt_file.split("/")[-2]
print(f"\n========\n {dataset}")

# Use a context manager so the ground-truth file handle is closed promptly.
with open(gt_file, 'r') as gt_reader:
    gt = json.load(gt_reader)
pred = load_jsonl(pred_file)

gt_ids = [item['id'] for item in gt]
pred_ids = [item['question_id'] for item in pred]
num_gt_ids, num_pred_ids = len(gt_ids), len(pred_ids)
print(f'num_gt_ids: {num_gt_ids} || num_pred_ids: {num_pred_ids}')
# evaluate() pairs items by position, so both id sequences must line up exactly.
assert gt_ids == pred_ids, "please make sure pred and gt are exactly matched"

# perform evaluation
results, pred = evaluate(gt, pred, return_pred=True)


 organ
num_gt_ids: 419 || num_pred_ids: 419
num_open 300
Metric               Performance
-----------------  -------------
exact match score      76.7574
f1 score               76.6147
precision              76.9241
recall                 76.9606
bleu_score              0.333333
bleu_score_1           75.8185
bleu_score_2           14.7748
bleu_score_3            3.44444


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [28]:
# Number of per-question records returned by evaluate() for the baseline run.
len(results)

300

In [11]:
# Inspect one per-question record (index 100 of the F1-sorted list).
# NOTE(review): this cell's execution count (In [11]) is out of sequence with
# its neighbours, so the shown output may predate the current `results`.
results[100]

{'question_id': 11984,
 'exact_match': 1.0,
 'f1': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'bleu': 1.821831989445342e-231,
 'bleu_1': 1.0,
 'bleu_2': 2.2250738585072626e-308,
 'bleu_3': 2.2250738585072626e-308}

In [29]:
# Index the baseline predictions by question id for direct lookup.
pred_dict = {entry['question_id']: entry for entry in pred}

In [30]:
# Baseline failures: every scored question whose F1 is below 0.1.
# This is a score threshold, not a fixed "top 100" cut.
top_low_performance = [record for record in results if record['f1'] < 0.1]
top_low_performance_ids = [record['question_id'] for record in top_low_performance]

In [31]:
# Show the last record of the low-F1 selection together with the selection's size.
top_low_performance[-1], len(top_low_performance)

({'question_id': 12968,
  'exact_match': 0.0,
  'f1': 0,
  'precision': 0,
  'recall': 0,
  'bleu': 0,
  'bleu_1': 0,
  'bleu_2': 0,
  'bleu_3': 0},
 52)

## 2. Our method

In [32]:
# "Ours" run: score the bbox_0.1 beam-search predictions against the same
# organ/test_lung.json ground-truth file.
visual_enhance_ratio = 0.08
bbox_ratio = 0.03
epoch_num = 6
ROOT_PATH = "/data/aofei"
dataset = "Slake"

gt_file = f"{ROOT_PATH}/hallucination/{dataset}/data/organ/test_lung.json"

# From here on `dataset` holds the ground-truth file's parent directory name.
dataset = gt_file.split("/")[-2]
print(f"\n========\n {dataset}")

pred_file = "/data/aofei/hallucination/mitigation/Slake/llava_med/organ_lung/bbox_0.1/epoch9_seed4/inference/pred_beam.jsonl"

# Use a context manager so the ground-truth file handle is closed promptly.
with open(gt_file, 'r') as gt_reader:
    gt = json.load(gt_reader)
pred = load_jsonl(pred_file)

gt_ids = [item['id'] for item in gt]
pred_ids = [item['question_id'] for item in pred]
num_gt_ids, num_pred_ids = len(gt_ids), len(pred_ids)
print(f'num_gt_ids: {num_gt_ids} || num_pred_ids: {num_pred_ids}')
# evaluate() pairs items by position, so both id sequences must line up exactly.
assert gt_ids == pred_ids, "please make sure pred and gt are exactly matched"

# perform evaluation
results_ours, pred_ours = evaluate(gt, pred, return_pred=True)

# Index this run's predictions by question id for direct lookup.
pred_dict_ours = {entry['question_id']: entry for entry in pred_ours}


 organ
num_gt_ids: 419 || num_pred_ids: 419
num_open 300
Metric               Performance
-----------------  -------------
exact match score      82.0046
f1 score               81.8196
precision              82.213
recall                 82.2096
bleu_score              0.333333
bleu_score_1           80.744
bleu_score_2           16.9418
bleu_score_3            4.51138


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [33]:
# Questions this run answers well: every scored question with F1 above 0.5.
top_high_performance = [record for record in results_ours if record['f1'] > 0.5]
top_high_performance_ids = [record['question_id'] for record in top_high_performance]

In [34]:
# Show the last record of the high-F1 selection together with the selection's size.
top_high_performance[-1], len(top_high_performance)

({'question_id': 12988,
  'exact_match': 1.0,
  'f1': 1.0,
  'precision': 1.0,
  'recall': 1.0,
  'bleu': 1.821831989445342e-231,
  'bleu_1': 1.0,
  'bleu_2': 2.2250738585072626e-308,
  'bleu_3': 2.2250738585072626e-308},
 242)

### Find intersections

In [35]:
# Question ids the baseline scored below 0.1 F1 but this run scored above 0.5.
our_better_ids = set(top_low_performance_ids).intersection(set(top_high_performance_ids))
len(our_better_ids)

20

In [38]:
# Side-by-side dump for each improved question: reference answer, the full
# baseline prediction record, and this run's answer text, then a blank line.
for _id in our_better_ids:
    print(
        f"Question ID: {_id}",
        f"GT: {pred_dict[_id]['gt']}",
        f"LoRA: {pred_dict[_id]}",
        f"Ours (beam): {pred_dict_ours[_id]['text']}",
        "",
        sep="\n",
    )

Question ID: 12038
GT: Lung
LoRA: {'question_id': 12038, 'prompt': '<image>\nWhich organ is abnormal, heart or lung?', 'text': 'Assistant: Heart', 'gt': 'Lung', 'answer_id': 'eEyVA6xFagLx8dJ4T2YTeY', 'model_id': '/data/aofei/LLM/llava_med', 'metadata': {}}
Ours (beam): Assistant: Lung

Question ID: 12050
GT: Pneumonia
LoRA: {'question_id': 12050, 'prompt': '<image>\nWhat diseases are included in the picture?', 'text': 'Assistant: Cardiomegaly', 'gt': 'Pneumonia', 'answer_id': 'T23NyNEc7WFPbYn54ymRs2', 'model_id': '/data/aofei/LLM/llava_med', 'metadata': {}}
Ours (beam): Assistant: Pneumonia

Question ID: 12054
GT: Lung
LoRA: {'question_id': 12054, 'prompt': '<image>\nWhich organ is abnormal, heart or lung?', 'text': 'Assistant: Heart', 'gt': 'Lung', 'answer_id': 'BXtm7UHfSHiyFjCEmVaYnD', 'model_id': '/data/aofei/LLM/llava_med', 'metadata': {}}
Ours (beam): Assistant: Lung

Question ID: 12070
GT: Pneumothorax
LoRA: {'question_id': 12070, 'prompt': '<image>\nWhat diseases are included in