In [1]:
import os
#set the openai env variable here

In [2]:
api_version = "2024-05-01-preview"
from langchain_openai import AzureChatOpenAI

# Connection settings for the Azure OpenAI deployment used as the judge model.
# The endpoint and key are read from the environment so real credentials never
# have to be written into the notebook; the literal placeholders remain only as
# a fallback when the variables are not set.
model_kwargs = dict(
    model="gpt-4-128k",
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "endpoint here"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "key here"),
    api_version=api_version,
    temperature=0.0,  # zero temperature to keep the judge's scoring as stable as possible
)

# cache=False: every evaluation request is sent to the service rather than
# being answered from an LLM cache.
gpt_model = AzureChatOpenAI(**model_kwargs, cache=False)

In [11]:
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from pydantic import BaseModel, Field
import json
from langchain_core.messages import HumanMessage, SystemMessage

In [132]:
# System prompt for the judge model. It is passed to SystemMessage as plain
# content (not as a template), so the braces below are literal text.
# The input-format line mirrors the payload actually sent by the evaluation
# loop: one object with "structured_ground_truth" and "models" keys.
system_msg_eval = """
You are provided with a JSON object of model answers from eleven models and a structured ground truth. The structured ground truth contains lists from three aspects: anatomy, symptom, and measurement. Your task is to evaluate each model's answer based on these aspects. Specifically, for each aspect, you need to count: The number of correctly recalled components (recall_number). The number of incorrect components that do not exist in the chest X-ray image and ground truth (wrong_number).

The input format, a json object:
{"structured_ground_truth": a json object, "models": {"model_1": model_1_answer, ...}}
Your output format:
{
"model_1": {"anatomy": {"recall_number":int, "wrong_number":int}, "symptom":{...}, "measurement":{...}},
"model_2": {"anatomy": {"recall_number":int, "wrong_number":int}, "symptom":{...}, "measurement":{...}},
...
}

Instructions:
- The recall should consider similar mentions, not just exact matches. Ensure you capture all possible correct components.
- Do not count common anatomical structures in chest X-rays that are absent from the ground truth as incorrect components.
"""

In [133]:
# Judge-assigned counts for one aspect (anatomy, symptom or measurement) of a
# single model's answer to a single question.
class Eval_single(BaseModel):
    # Number of components the judge counted as correctly recalled.
    recall_number: int = Field(description="The number of correct components in the model responses.")
    # Number of components the judge counted as incorrect.
    wrong_number: int = Field(description="The number of wrong components in the model responses.")
    

# Evaluation of one model's answer to one question, with one Eval_single
# tally for each of the three aspects named in the system prompt.
class Eval_Res(BaseModel):
    # Tally for the anatomy aspect.
    anatomy: Eval_single = Field(description="The evaluation results of anatomy")
    # Tally for the measurement aspect.
    measurement: Eval_single = Field(description="The evaluation results of measurement")
    # Tally for the symptom aspect.
    symptom: Eval_single = Field(description="The evaluation results of symptom")
    
# Judge output for one question: one Eval_Res per evaluated model. The field
# names are the keys used for the "models" payload sent to the judge.
class Eval_ResList(BaseModel):
    # GPT4o is the only model that may be missing for a question, so it is the
    # only optional field. Annotating it Optional makes both an omitted key and
    # an explicit null validate to None, which the aggregation code checks for.
    GPT4o: Optional[Eval_Res] = Field(description="The evaluation results of GPT4o.", default=None)
    CheXagent: Eval_Res = Field(description="The evaluation results of CheXagent.")
    LLaVA_med: Eval_Res = Field(description="The evaluation results of LLaVA-med.")
    LLaVA_med_15: Eval_Res = Field(description="The evaluation results of LLaVA-med_15.")
    LLaVA: Eval_Res = Field(description="The evaluation results of LLaVA.")
    LLaVA_13b: Eval_Res = Field(description="The evaluation results of LLaVA_13b.")
    LLM_CXR: Eval_Res = Field(description="The evaluation results of LLM-CXR.")
    Med_flamingo: Eval_Res = Field(description="The evaluation results of Med-flamingo.")
    MiniGPT4: Eval_Res = Field(description="The evaluation results of MiniGPT4.")
    XrayGPT: Eval_Res = Field(description="The evaluation results of XrayGPT.")
    RadFM: Eval_Res = Field(description="The evaluation results of RadFM.")

# LangChain's PydanticOutputParser turns the judge model's reply into an
# Eval_ResList instance, so replies are validated against that schema.
eval_parser = PydanticOutputParser(pydantic_object=Eval_ResList)

# Chat prompt for the judge: the fixed system instructions followed by a human
# message carrying the per-question payload.
# {input_pairs} is filled at invoke time. {format_instructions} is pre-filled
# from the parser so the model is told the exact schema its reply is validated
# against (the partial variable was previously supplied but never referenced,
# so the schema never reached the model).
open_eval_prompt = ChatPromptTemplate.from_messages([
    SystemMessage(content=system_msg_eval),
    HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template=(
                "The structured ground_truth and models' model_response: {input_pairs}. "
                "Ensure that you read the model answers carefully and find the similar mentioned components as the recalled components.\n"
                "{format_instructions}\n"
                "Your evaluation output in a JSON object format, without extra explanation:"
            ),
            input_variables=["input_pairs"],
            partial_variables={"format_instructions": eval_parser.get_format_instructions()},
        )
    ),
])

# Evaluation pipeline: render the prompt, send it to the judge model, then
# parse the reply with eval_parser.
open_eval_chain = open_eval_prompt | gpt_model | eval_parser

In [144]:
# load model inference results
def load_model_ans(model_name, ans_root="/path/to/model/answer"):
    """Load one model's open-ended answers from its JSONL result file.

    The file is read from ``{ans_root}/{model_name}/mimic_open_res.jsonl``.
    Each non-blank line must be a JSON object with the keys ``question_id``,
    ``structured_answer`` and ``model_answer``.

    Args:
        model_name: Name of the model's sub-directory under ``ans_root``.
        ans_root: Directory that contains one sub-directory per model.
            Defaults to the notebook's original hard-coded location.

    Returns:
        A tuple ``(formatted_results, id_results_dict)``:
        ``formatted_results`` is a list, in file order, of dicts with the keys
        ``structured_ground_truth`` and ``model_answer``;
        ``id_results_dict`` maps each ``question_id`` to the same dict object.
        If a ``question_id`` occurs more than once, the last occurrence wins
        in ``id_results_dict``.

    Raises:
        FileNotFoundError: If the result file does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    ans_path = f"{ans_root}/{model_name}/mimic_open_res.jsonl"
    formatted_results = []
    id_results_dict = dict()
    # Explicit encoding so the result does not depend on the platform default.
    with open(ans_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a stray empty line at the end of the
            # file), which would otherwise make json.loads fail.
            if not line.strip():
                continue
            record = json.loads(line)
            entry = {
                "structured_ground_truth": record["structured_answer"],
                "model_answer": record["model_answer"],
            }
            id_results_dict[record["question_id"]] = entry
            formatted_results.append(entry)
    return formatted_results, id_results_dict

In [145]:
# formatted_results[:5]
from tqdm import tqdm

In [148]:
# Names of the model result directories to load; each one is passed to
# load_model_ans.
model_names = ["LLM-CXR", "GPT4o", "llava-v1.6", "llava-v1.6-13b", "RadFM", "XrayGPT", "MiniGPT4", "Med-flamingo", "LLaVA-med-1.5", "LLaVA-med", "CheXagent"]

# Keep only the question_id -> answer mapping (second element returned by
# load_model_ans) for every model.
model_name_res_dict = {
    name: load_model_ans(name)[1]
    for name in model_names
}

In [111]:
# Collects the parsed judge output for each evaluated question, in evaluation
# order. Re-run this cell before re-running the evaluation loop, otherwise the
# loop appends to the results of the previous run.
final_evals = []

In [1]:
# Score every question with the judge chain and collect the parsed results.

# Eval_ResList field name -> key in model_name_res_dict, for the ten models
# that must all have an answer for a question to be scored.
required_model_keys = {
    "CheXagent": "CheXagent",
    "LLaVA_med": "LLaVA-med",
    "LLaVA_med_15": "LLaVA-med-1.5",
    "LLaVA": "llava-v1.6",
    "LLaVA_13b": "llava-v1.6-13b",
    "LLM_CXR": "LLM-CXR",
    "Med_flamingo": "Med-flamingo",
    "MiniGPT4": "MiniGPT4",
    "XrayGPT": "XrayGPT",
    "RadFM": "RadFM",
}

# LLM-CXR's results define the set of questions and supply the ground truth.
question_ids = list(model_name_res_dict["LLM-CXR"].keys())
for _id in tqdm(question_ids):
    gt = model_name_res_dict["LLM-CXR"][_id]["structured_ground_truth"]
    try:
        models_res = {
            field_name: model_name_res_dict[result_key][_id]["model_answer"]
            for field_name, result_key in required_model_keys.items()
        }
    except KeyError as e:
        # One of the ten required models has no answer for this question:
        # report the missing key and skip the question (no entry is appended
        # to final_evals for it).
        print(e)
        continue
    # GPT4o is optional: it may have no result for some questions (the original
    # note attributes this to image privacy restrictions), in which case it is
    # simply left out of the payload.
    if _id in model_name_res_dict["GPT4o"]:
        models_res["GPT4o"] = model_name_res_dict["GPT4o"][_id]["model_answer"]
    for_eval = {"structured_ground_truth": gt, "models": models_res}
    # Serialize explicitly: the system prompt promises a JSON object, whereas
    # formatting the dict directly would send its Python repr instead.
    example_ans = open_eval_chain.invoke(
        dict(input_pairs=json.dumps(for_eval, ensure_ascii=False))
    )

    final_evals.append(example_ans)

In [163]:
final_evals[-1].dict()  # sample: the most recently appended evaluation, shown as a plain dict

{'GPT4o': {'anatomy': {'recall_number': 0, 'wrong_number': 0},
  'measurement': {'recall_number': 0, 'wrong_number': 0},
  'symptom': {'recall_number': 0, 'wrong_number': 3}},
 'CheXagent': {'anatomy': {'recall_number': 0, 'wrong_number': 0},
  'measurement': {'recall_number': 0, 'wrong_number': 0},
  'symptom': {'recall_number': 0, 'wrong_number': 0}},
 'LLaVA_med': {'anatomy': {'recall_number': 0, 'wrong_number': 0},
  'measurement': {'recall_number': 0, 'wrong_number': 0},
  'symptom': {'recall_number': 0, 'wrong_number': 2}},
 'LLaVA_med_15': {'anatomy': {'recall_number': 1, 'wrong_number': 2},
  'measurement': {'recall_number': 0, 'wrong_number': 0},
  'symptom': {'recall_number': 0, 'wrong_number': 3}},
 'LLaVA': {'anatomy': {'recall_number': 0, 'wrong_number': 0},
  'measurement': {'recall_number': 0, 'wrong_number': 0},
  'symptom': {'recall_number': 0, 'wrong_number': 0}},
 'LLaVA_13b': {'anatomy': {'recall_number': 0, 'wrong_number': 0},
  'measurement': {'recall_number': 0, 

In [166]:
# Aggregate the per-question judge outputs into per-model totals.
#
# The evaluation loop skips any question that is missing from one of the
# always-required models (every loaded model except GPT4o), so final_evals holds
# one entry per *scored* question rather than one per question_ids entry.
# Rebuild that list of scored ids here so each evaluation is paired with the
# ground truth of the question it belongs to, instead of indexing final_evals
# by position in question_ids (which misaligns, or raises IndexError, as soon
# as one question was skipped).
required_models = [name for name in model_name_res_dict if name != "GPT4o"]
evaluated_ids = [
    _id for _id in question_ids
    if all(_id in model_name_res_dict[name] for name in required_models)
]
if len(evaluated_ids) != len(final_evals):
    # Happens e.g. when the evaluation loop was interrupted or run twice.
    print(
        f"Warning: {len(evaluated_ids)} scorable questions but "
        f"{len(final_evals)} evaluations; aggregating the first "
        f"{min(len(evaluated_ids), len(final_evals))} pairs only."
    )

res_dict = dict()
# Ground-truth component totals over the questions that were aggregated.
gt_anatomy_num, gt_symptom_num, gt_measure_num = 0, 0, 0
for _id, evaluation in zip(evaluated_ids, final_evals):
    gt = model_name_res_dict["LLM-CXR"][_id]["structured_ground_truth"]
    gt_anatomy_num += len(gt["anatomy"])
    gt_symptom_num += len(gt["symptom"])
    gt_measure_num += len(gt["measurement"])

    # Convert the pydantic object once per question rather than once per lookup.
    for model, per_aspect in evaluation.dict().items():
        totals = res_dict.setdefault(
            model,
            {"anatomy_recall": 0, "symptom_recall": 0, "measurement_recall": 0,
             "anatomy_error": 0, "symptom_error": 0, "measurement_error": 0},
        )
        if per_aspect is None:
            # Model not evaluated for this question (only GPT4o can be None).
            continue
        for aspect in ("anatomy", "symptom", "measurement"):
            totals[f"{aspect}_recall"] += per_aspect[aspect]["recall_number"]
            totals[f"{aspect}_error"] += per_aspect[aspect]["wrong_number"]

In [2]:
# get the final evaluation results
def _error_share(wrong, recalled):
    """Return wrong / (recalled + wrong), or NaN when both counts are zero."""
    total = recalled + wrong
    # A model with no counted components for an aspect has no defined rate;
    # report NaN instead of raising ZeroDivisionError.
    return wrong / total if total else float("nan")


# For every model, print the share of wrong components among all counted
# components (recalled + wrong), in the order anatomy, symptom, measurement.
for k, totals in res_dict.items():
    rate_ana = _error_share(totals['anatomy_error'], totals['anatomy_recall'])
    rate_sym = _error_share(totals['symptom_error'], totals['symptom_recall'])
    rate_mea = _error_share(totals['measurement_error'], totals['measurement_recall'])
    print(k, rate_ana, rate_sym, rate_mea)