In [1]:
import os
#set the openai env variable here

In [2]:
from langchain_openai import AzureChatOpenAI

# Azure OpenAI REST API version used for every request made by the judge model.
api_version = "2024-05-01-preview"

# Keyword arguments for the judge model; temperature 0.0 keeps sampling as
# deterministic as the service allows. Endpoint and key are placeholders.
model_kwargs = {
    "model": "gpt-4-128k",
    "azure_endpoint": "endpoint here",
    "api_key": "key here",
    "api_version": api_version,
    "temperature": 0.0,
}

# Response caching is switched off explicitly for this client.
gpt_model = AzureChatOpenAI(cache=False, **model_kwargs)

In [3]:
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from pydantic import BaseModel, Field
import json
from langchain_core.messages import HumanMessage, SystemMessage

In [4]:
# System prompt for the LLM judge. It tells the model to grade every model's
# answer to one question as 1 (correct) or 0 (incorrect) and to reply with a
# JSON object keyed by model name.
# NOTE(review): the "input format" line below describes two sibling objects,
# while the evaluation loop sends a single dict with the keys "question",
# "ground_truth" and "models" — confirm the description is what is intended.
system_msg_eval = """
You are provided with a JSON object that includes a question-ground_truth pair and the corresponding answers from eleven different models. The question pertains to medical knowledge derived from an image. Your task is to evaluate the correctness of each model's answer, returning a binary output: 1 for correct and 0 for incorrect.

The input format, a json object:
{{"question":str, "ground_truth": str}, {"models":{"model_1": model_1_answer}, ...}}
Your output format:
{
"model_1": int, 1 or 0,
"model_2": int, 1 or 0,
...
}

Instructions:
- Consider an answer correct if it expresses the correct medical knowledge, even if the wording differs from the ground truth.
- Aim to capture all possible correct expressions, not just those that match the ground truth verbatim.
"""

In [32]:
# Schema of the judge's reply for ONE question: one integer verdict per model
# (the system prompt defines 1 = correct, 0 = incorrect).
# Documented with `#` comments on purpose: a class docstring would be picked up
# by Pydantic as the schema description and change the generated schema.
class Eval_ResList(BaseModel):
    # GPT4o is the only field with a default (0); every other field is required.
    GPT4o: int = Field(description="The evaluation results of GPT4o.", default=0)
    CheXagent: int = Field(description="The evaluation results of CheXagent.")
    LLaVA_med: int = Field(description="The evaluation results of LLaVA-med.")
    LLaVA_med_15: int = Field(description="The evaluation results of LLaVA-med_15.")
    LLaVA: int = Field(description="The evaluation results of LLaVA.")
    LLaVA_13b: int = Field(description="The evaluation results of LLaVA_13b.")
    LLM_CXR: int = Field(description="The evaluation results of LLM-CXR.")
    Med_flamingo: int = Field(description="The evaluation results of Med-flamingo.")
    MiniGPT4: int = Field(description="The evaluation results of MiniGPT4.")
    XrayGPT: int = Field(description="The evaluation results of XrayGPT.")
    RadFM: int = Field(description="The evaluation results of RadFM.")

#Here we use the classic parser in LangChain: Pydantic, to ensure a strict parsing process
eval_parser = PydanticOutputParser(pydantic_object=Eval_ResList)

# Judge prompt = fixed system message + one human turn.
# The human turn takes a single runtime variable, `input_pairs` (the question,
# ground truth and all model answers). `format_instructions` is pre-filled from
# the parser; it must be referenced in the template, otherwise the schema
# description is silently dropped and never reaches the model.
open_eval_prompt = ChatPromptTemplate.from_messages([
    # Plain SystemMessage (not a template), so it is passed as literal content.
    SystemMessage(content=system_msg_eval),
    HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template=(
                "The question, ground_truth and models' model_response: {input_pairs}. "
                "Ensure that you read the model answers carefully and evaluate its correctness.\n"
                "{format_instructions}\n"
                "Your evaluation output in a JSON object format, without extra explanation:"
            ),
            input_variables=["input_pairs"],
            partial_variables={"format_instructions": eval_parser.get_format_instructions()},
        )
    ),
])

# prompt -> judge LLM -> strict parsing into an Eval_ResList instance.
open_eval_chain = (
    open_eval_prompt
    | gpt_model
    | eval_parser
)

In [24]:
#get model inference results
def load_model_ans(model_name, base_dir="/path/to/model/answer", question_type="type4_Knowledge"):
    """Load one model's answers and keep only the records of one question type.

    Reads ``{base_dir}/{model_name}/mimic_open_res.jsonl``, which holds one JSON
    object per line.

    Args:
        model_name: Name of the model's sub-directory under ``base_dir``.
        base_dir: Directory containing one sub-directory per model.
        question_type: Only records whose ``"question_type"`` equals this value
            are kept.

    Returns:
        A tuple ``(formatted_results, id_results_dict)``:
        ``formatted_results`` is a list, in file order, of dicts with the keys
        ``"ground_truth"``, ``"question"`` (taken from the record's ``"prompt"``)
        and ``"model_answer"``; ``id_results_dict`` maps each record's
        ``"question_id"`` to that same dict object (a repeated id keeps the
        last record).

    Raises:
        OSError: If the answers file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    ans_path = f"{base_dir}/{model_name}/mimic_open_res.jsonl"
    formatted_results = []
    id_results_dict = {}
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(ans_path, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            if record["question_type"] != question_type:
                continue
            entry = {
                "ground_truth": record["ground_truth"],
                "question": record["prompt"],
                "model_answer": record["model_answer"],
            }
            id_results_dict[record["question_id"]] = entry
            formatted_results.append(entry)
    return formatted_results, id_results_dict

In [25]:
# formatted_results[:5]
from tqdm import tqdm

In [26]:
# Models whose answer files are loaded; each name is the model's sub-directory.
model_names = ["LLM-CXR", "GPT4o", "llava-v1.6", "llava-v1.6-13b", "RadFM", "XrayGPT", "MiniGPT4", "Med-flamingo", "LLaVA-med-1.5", "LLaVA-med", "CheXagent"]

# model name -> {question_id: answer record}; only the id-keyed half of the
# loader's return value is kept.
model_name_res_dict = {}
for model_name in model_names:
    loaded = load_model_ans(model_name)
    model_name_res_dict[model_name] = loaded[1]

In [27]:
# Accumulator for the judge's parsed verdicts, one entry per evaluated question.
final_evals = []

In [30]:
# Report how many answers were loaded for each model.
for name, answers_by_id in model_name_res_dict.items():
    print(name, len(answers_by_id))

LLM-CXR 2318
GPT4o 2252
llava-v1.6 2318
llava-v1.6-13b 2318
RadFM 2318
XrayGPT 2318
MiniGPT4 2318
Med-flamingo 2318
LLaVA-med-1.5 2318
LLaVA-med 2318
CheXagent 2318


In [16]:
# Start from an empty list of verdicts; re-running this cell discards any
# results collected so far.
final_evals = []

In [1]:
# Judge every question once: gather each model's answer for the question, send
# the bundle to the judge chain, and append the parsed verdicts to final_evals.
# LLM-CXR's records supply the question ids, question text and ground truth.
question_ids = list(model_name_res_dict["LLM-CXR"].keys())

# Verdict field name (as used in the judge's output schema) -> key of that
# model in model_name_res_dict.
field_to_model = {
    "CheXagent": "CheXagent",
    "LLaVA_med": "LLaVA-med",
    "LLaVA_med_15": "LLaVA-med-1.5",
    "LLaVA": "llava-v1.6",
    "LLaVA_13b": "llava-v1.6-13b",
    "LLM_CXR": "LLM-CXR",
    "Med_flamingo": "Med-flamingo",
    "MiniGPT4": "MiniGPT4",
    "XrayGPT": "XrayGPT",
    "RadFM": "RadFM",
}

# Resume after the questions already present in final_evals instead of a
# hard-coded offset: a fresh run starts at 0, an interrupted run continues
# where it stopped (results are appended in question_ids order).
for _id in tqdm(question_ids[len(final_evals):]):
    reference = model_name_res_dict["LLM-CXR"][_id]
    gt = reference["ground_truth"]
    q = reference["question"]
    models_res = {
        field: model_name_res_dict[model_key][_id]["model_answer"]
        for field, model_key in field_to_model.items()
    }
    # GPT4o has no answer for some question ids; it is only included when present.
    if _id in model_name_res_dict["GPT4o"]:
        models_res["GPT4o"] = model_name_res_dict["GPT4o"][_id]["model_answer"]
    for_eval = {"question":q, "ground_truth":gt, "models": models_res}
    # The key must match the prompt's declared input variable, `input_pairs`.
    example_ans = open_eval_chain.invoke(dict(input_pairs=for_eval))

    final_evals.append(example_ans)

In [45]:
# Example: the parsed verdicts for one question, shown as a plain dict.
final_evals[-5].dict() #example

{'GPT4o': 1,
 'CheXagent': 1,
 'LLaVA_med': 1,
 'LLaVA_med_15': 1,
 'LLaVA': 0,
 'LLaVA_13b': 1,
 'LLM_CXR': 1,
 'Med_flamingo': 1,
 'MiniGPT4': 0,
 'XrayGPT': 0,
 'RadFM': 1}

In [47]:
# Sum the per-question verdicts into one running total per model, seeded with
# the first question's verdicts.
all_models = final_evals[0].dict()
for verdict in final_evals[1:]:
    for field_name, score in verdict.dict().items():
        all_models[field_name] += score

In [2]:
#the evaluation results
# For each model, print the share of questions NOT counted in its verdict total.
total_questions = 2318  #2318 is the number of all questions
for model_field, verdict_sum in all_models.items():
    print(model_field, (total_questions - verdict_sum) / total_questions)

In [50]:
# GPT-4o could not answer some of the questions, so its rate uses the number of
# answers it did give (2252) as the denominator.
# NOTE(review): 2137 is presumably GPT-4o's verdict total — confirm.
(2252 - 2137) / 2252 # For GPT-4o, it can not answer some of the questions, so the total number is less

0.05106571936056838