In [None]:
## Imports

In [None]:
!pip install openai
!pip install tiktoken
!pip install langchain
!pip install -U sentence-transformers
!pip install accelerate
!pip install einops
!pip install transformers
!pip install xformers
!pip install huggingface_hub

Collecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m71.7/77.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.1
Collecting tiktoken
  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.5.1
Collecting langchain
  Downloading langchain-0.0.314-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.9 

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import pickle
from langchain import PromptTemplate, LLMChain
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain import HuggingFacePipeline, HuggingFaceHub
import os
import openai

Mounted at /content/drive


In [None]:
# API credentials. Never paste real keys into the notebook: export them in the
# environment before starting the kernel. Every value falls back to '' so the
# names are always defined as strings, exactly as before.
GPT_3_KEY = os.environ.get("GPT_3_KEY", '')
GPT_4_KEY = os.environ.get("GPT_4_KEY", '')
# Prefer the GPT-4 key; otherwise keep whatever OPENAI_API_KEY is already
# exported so a later re-export does not replace a valid key with ''.
OPENAI_KEY = GPT_4_KEY or os.environ.get("OPENAI_API_KEY", '')
HF_API_KEY = os.environ.get("HUGGINGFACEHUB_API_TOKEN", '')

In [None]:
# Publish the configured keys as process environment variables in one call.
os.environ.update(
    {
        "OPENAI_API_KEY": OPENAI_KEY,
        "HUGGINGFACEHUB_API_TOKEN": HF_API_KEY,
    }
)

In [None]:
# Set openai.api_key from the OPENAI_API_KEY environment variable.
# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key = os.environ["OPENAI_API_KEY"]

### API Check

## Environment Variables

In [None]:
# Model identifiers
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
GPT_3 = "gpt-3.5-turbo"
GPT_4 = "gpt-4"
LOCAL_MODEL = "DrugGPT"
# Use a local embedding model?
USELOCALEMBD = True
# Use a local model?
USELOCALMODEL = False
# Use the LangChain-wrapped model, or build your own pipeline from the transformers library?
USELANGCHAINMODEL = True
# Use the Hugging Face hosted API? Recommended; otherwise it is too slow.
USEHFAPI = False
# Use the LangChain OpenAI model, or call the OpenAI API directly?
USEOPENAI = True

In [None]:
# Slice size: passed to load_data as the maximum number of rows kept per dataset
SLICE_THRESHOLD = 5000
# Size of the evaluation set (upper bound on samples drawn by the dataloader)
EVALUATION_SIZE = 1000
# Smaller evaluation-set size
EVALUATION_SIZE_SMALL = 500

#### Data Paths

In [None]:
def _dataset_entry(task_type, data='', answer='', GPT_3_answer='', GPT_4_answer=''):
    """Build the path record for one dataset.

    Args:
        task_type: Answer format of the dataset: 'binary', 'text' or 'mc'.
        data: Path to the dataset CSV (blank until filled in).
        answer: Path for the stored model answers (blank until filled in).
        GPT_3_answer: Path for the stored GPT-3.5 answers (blank until filled in).
        GPT_4_answer: Path for the stored GPT-4 answers (blank until filled in).

    Returns:
        A new dict with the keys 'type', 'data', 'answer', 'GPT_3_answer'
        and 'GPT_4_answer', in that order.
    """
    return {
        'type': task_type,
        'data': data,
        'answer': answer,
        'GPT_3_answer': GPT_3_answer,
        'GPT_4_answer': GPT_4_answer,
    }


# One record per evaluation dataset; fill in the paths via the keyword arguments.
data_paths_dict = {
    'pubmedqa': _dataset_entry('binary'),
    'ade': _dataset_entry('text'),
    'chatDoctor': _dataset_entry('text'),
    'DDI_binary': _dataset_entry('binary'),
    'drug_usage': _dataset_entry('text'),
    'medmcqa': _dataset_entry('mc'),
    'mmlu_mc': _dataset_entry('mc'),
    'usmle_mc': _dataset_entry('mc'),
    'moderna_interactions': _dataset_entry('binary'),
}


In [None]:
# Use the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cpu


## Local Model

1. model: Specifies the model used for text generation. Here we use 'DrugGPT' for the DrugGPT evaluation.

2. tokenizer = AutoTokenizer.from_pretrained(model): Loads the tokenizer associated with the model. A tokenizer converts input text into a format (such as a list of tokens) that the model can understand.

3. torch_dtype=torch.bfloat16: This is the data type of the torch tensors used in the pipeline. Using bfloat16 instead of the default float32 can reduce memory usage and computation time, at the expense of some numerical precision.

4. trust_remote_code=True: This allows execution of custom code shipped with the remote model repository, which is necessary when the model defines its own modeling or processing code.

5. device_map="auto": This allows the model to run on the best available device (either a GPU or CPU).

6. max_length=200: This specifies the maximum length of the generated text.

7. do_sample=True: This means that the generated text will be sampled from the model's output distribution rather than just taking the most probable token at each step. This makes the output more diverse and creative.

8. top_k=10: This parameter is used for the "top-k sampling" strategy, which only considers the top k most probable tokens at each step.

9. num_return_sequences=1: This means that the model will return one generated sequence.

10. eos_token_id=tokenizer.eos_token_id: The id of the end-of-sequence (EOS) token. The generation process will stop if the model generates this token.

#### Save Local Model (We use Langchain instead)

In [None]:
if not USELANGCHAINMODEL:
    print("We are using our own local model")
    # NOTE(review): `local_model_path` is not defined in the cells shown here;
    # it must be set before this branch is enabled.
    # Reuse a previously pickled pipeline when one exists; otherwise build the
    # pipeline from the pretrained checkpoint and cache it for the next run.
    if os.path.isfile(local_model_path):
        # SECURITY: pickle.load can execute arbitrary code; only load a file
        # that this notebook wrote itself.
        with open(local_model_path, 'rb') as f:
            pipeline = pickle.load(f)
    else:
        tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL)
        model = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL, trust_remote_code=True)
        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto",
        )

        with open(local_model_path, 'wb') as f:
            pickle.dump(pipeline, f)

    # Smoke test: generate one short sample.
    sequences = pipeline(
        "Tell me a joke",
        max_length=200,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        # Read the tokenizer off the pipeline: the local `tokenizer` name only
        # exists when the pipeline was built in this run, not when it was
        # restored from the pickle cache.
        eos_token_id=pipeline.tokenizer.eos_token_id,
    )
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

### Langchain LLM Local Model or HF API

In [None]:
# Build the LangChain LLM: either the hosted Hugging Face Hub endpoint or a
# locally executed transformers pipeline, depending on USEHFAPI.
if USEHFAPI:
    llm = HuggingFaceHub(repo_id=LOCAL_MODEL, task="text-generation", model_kwargs={"temperature":0.5, "max_length":2000, "trust_remote_code": True})
    print(f"We are using HF API {llm}")
else:
    # HuggingFacePipeline takes an integer device index: 0 is the first CUDA
    # GPU and -1 selects the CPU. Hard-coding 0 fails on CPU-only hosts.
    pipeline_device = 0 if torch.cuda.is_available() else -1
    llm = HuggingFacePipeline.from_model_id(model_id=LOCAL_MODEL, task="text-generation", device=pipeline_device, model_kwargs={"temperature":0.5, "max_length":2000, "trust_remote_code": True})
    print("We are using local mod")

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

We are using local mod


## Chat Agent Prompts

### System Prompt

In [None]:
# System-prompt building blocks, keyed by dataset name. Each entry supplies the
# three fields that sys_template expects: "task", "answer_format" and
# "not_to_dos". The strings are sent to the model verbatim, so any edit here
# changes the prompts.
SYS_PROMPT_INSTRUCTIONS = {
    "pubmedqa": {
        "task": "You are tasked with answering question based on the provided content related to pubmed articles. You should thoroughly understand the question, extract relevant information from the content, analyze this information and finally provide an answer. BE CONCISE AND ACCURATE",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant content.
Final Answer: Provide the final answer, Must be a 'yes', 'no'.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Stick to summarizing and interpreting the content as objectively and accurately as possible. Remember that you are providing an analysis based on the content and not diagnosing or treating medical conditions."
    },
    "ade": {
        "task": "Your task is to identify the adverse drug effect mentioned in the question based on the provided content . Thoroughly understand the question, extract relevant information, analyze it, and provide a concise and accurate answer.",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant content.
Final Answer: Provide the final answer, which should identify the adverse effect.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the information as objectively and accurately as possible. You are providing an analysis, not diagnosing or treating medical conditions."
    },

    "chatDoctor": {
        "task": "Your task is to list the medications based on the provided content related to the symptom or disease mentioned in the question. Understand the question, extract relevant information, analyze it, and provide a concise and accurate answer.",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant content.
Final Answer: Provide the final answer, which should be a list of medications related to the symptom or disease.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the information as objectively and accurately as possible. You are providing an analysis, not diagnosing or treating medical conditions."
    },
    "DDI_binary": {
        "task": "You are tasked to answer yes/no questions based on the provided content about whether there is a reaction between the drugs mentioned. Understand the question, extract relevant information, analyze it, and provide a concise and accurate answer.",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant content.
Final Answer: Provide the final answer. The answer should be 'yes' or 'no'.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the information as objectively and accurately as possible. You are providing an analysis, not diagnosing or treating medical conditions."
    },

    "drug_usage": {
        "task": "Your task is to answer two yes/no questions based on the provided content related to the usage of the drug provided. Understand the question, extract relevant information, analyze it, and provide a concise and accurate answer.",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant content.
Final Answer: Provide the final answer, The answer should be two 'yes' or 'no' separated by comma.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the information as objectively and accurately as possible. You are providing an analysis, not diagnosing or treating medical conditions."
    },

    "medmcqa": {
        "task": "Your task is to answer multiple choice questions based on the provided content about medication and pharmacology. Understand the question, extract relevant information, analyze it, and provide a concise and accurate answer.",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant content.
Final Answer: Provide the final answer, which should be a single letter in the alphabet representing the best option among the multiple choices provided in the question.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the information as objectively and accurately as possible. You are providing an analysis, not diagnosing or treating medical conditions."
    },

    "mmlu_mc": {
        "task": "Your task is to answer multiple choice questions based on the provided content about drug recommendation. Understand the question, extract relevant information, analyze it, and provide a concise and accurate answer.",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant content.
Final Answer: Provide the final answer, which should be a single letter in the alphabet representing the best option among the multiple choices provided in the question.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the information as objectively and accurately as possible. You are providing an analysis, not diagnosing or treating medical conditions."
    },

    "usmle_mc": {
        "task": "Your task is to answer multiple choice questions based on the provided content about drug recommendation. Understand the question, analyze it, and provide a concise and accurate answer.",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant content.
Final Answer: Provide the final answer, which should be a single letter in the alphabet representing the best option among the multiple choices provided in the question.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the content as objectively and accurately as possible."
    },
    "moderna_interactions": {
        "task": "You are tasked to answer yes/no questions based on the provided content about whether the therapeutic efficacy of Moderna COVID-19 Vaccine decrease when used with another drug. Understand the question, extract relevant information, analyze it, and provide a concise and accurate answer.",
        "answer_format": """
Analysis: Provide an analysis that logically leads to the answer based on the relevant information.
Final Answer: Provide the final answer. The answer should be 'yes' or 'no'.
        """,
        "not_to_dos": "Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the information as objectively and accurately as possible. You are providing an analysis, not diagnosing or treating medical conditions."
    },
}


In [None]:
# Skeleton of the system prompt: task description, required answer format and
# prohibitions, followed by the banner that introduces the few-shot examples.
sys_template = PromptTemplate(
    template=(
        "Task: {task}\n"
        "Answer Format: {answer_format}\n"
        "Not to dos: {not_to_dos}\n"
        "----Below are some examples-----\n"
    ),
    input_variables=["task", "answer_format", "not_to_dos"],
)

In [None]:
def generate_sys_prompt(knowledge_set):
    """Render the system prompt for one dataset.

    Args:
        knowledge_set: Dataset name; must be a key of SYS_PROMPT_INSTRUCTIONS.

    Returns:
        The text of sys_template filled with that dataset's "task",
        "answer_format" and "not_to_dos" fields.

    Raises:
        KeyError: If `knowledge_set` has no entry in SYS_PROMPT_INSTRUCTIONS.
    """
    return sys_template.format(**SYS_PROMPT_INSTRUCTIONS[knowledge_set])

### Few Shots

In [None]:
# Few-shot demonstrations, keyed by dataset name. Every example carries the four
# fields consumed by fs_template: "question", "content", "analysis" and
# "final_answer". The strings are sent to the model verbatim.
# The degree signs in the usmle_mc / mmlu_mc questions were previously stored as
# the mojibake sequence "¬∞" (a mis-decoded "°") and are repaired here.
FS_EXAMPLES = {
    "pubmedqa": [
        {
            "question": "Does intermittent warm blood cardioplegia provide adequate myocardial resuscitation after global ischaemia?",
            "content" : "",
            "analysis": "Based on the provided information from the study, intermittent warm blood cardioplegia does not seem to provide adequate myocardial resuscitation after global ischaemia.",
            "final_answer": "no"
        },
        ],
    "ade": [
        {
            "question": "Identify the adverse drug reaction related to azithromycin in this context: Intravenous azithromycin-induced ototoxicity.",
            "content": "",
            "analysis": "It can be concluded that the adverse drug reaction related to azithromycin is ototoxicity.",
            "final_answer": "ototoxicity"
        },
    ],
    "chatDoctor": [
        {
            "question": "What are the recommended medications for Panic disorder?",
            "content": "",
            "analysis": "The medications listed in the content are the recommended ones for treating Panic disorder.",
            "final_answer": "lorazepam, alprazolam, clonazepam, paroxetine, venlafaxine, mirtazapine, buspirone, fluvoxamine, imipramine, desvenlafaxine, clomipramine, acamprosate"
        },
    ],
    "DDI_binary": [
        {
            "question": "Is there a reaction between Sibutramine and Icatibant?",
            "content": "",
            "analysis": "From the given context, it can be concluded that there is a reaction effect identified between Sibutramine and Icatibant",
            "final_answer": "yes"
        },
    ],
    "drug_usage": [
        {
            "question": "Answer the following two questions about acetaminophen:\n Have studies shown adverse effects on preganancy?\n Have studies shown an interaction with alchohol?",
            "content": "",
            "analysis": "The content shows that acetaminophen has shwon adverse effects on preganancy demonstrated in animal studies. The content has shown that acetaminophen interacts with alchohol",
            "final_answer": "yes, yes"
        },
    ],
    "usmle_mc": [
        {
            "question": "A 62-year-old woman presents for a regular check-up. She complains of lightheadedness and palpitations which occur episodically. Past medical history is significant for a myocardial infarction 6 months ago and NYHA class II chronic heart failure. She also was diagnosed with grade I arterial hypertension 4 years ago. Current medications are aspirin 81 mg, atorvastatin 10 mg, enalapril 10 mg, and metoprolol 200 mg daily. Her vital signs are a blood pressure of 135/90 mm Hg, a heart rate of 125/min, a respiratory rate of 14/min, and a temperature of 36.5°C (97.7°F). Cardiopulmonary examination is significant for irregular heart rhythm and decreased S1 intensity. ECG is obtained and is shown in the picture (see image). Echocardiography shows a left ventricular ejection fraction of 39%. Which of the following drugs is the best choice for rate control in this patient? A:Atenolol, B:Diltiazem, C:Propafenone, D:Digoxin",
            "content": "",
            "analysis": "The 62-year-old woman's presentation of lightheadedness and palpitations, an irregular heart rhythm on cardiopulmonary examination, and her history of myocardial infarction and NYHA class II chronic heart failure suggest she may be suffering from a rate control issue related to her heart. The patient is already on a regimen that includes metoprolol, a beta-blocker used for rate control, and the dose is relatively high. This makes adding another beta-blocker like atenolol less effective. The use of calcium channel blockers, like diltiazem, might worsen her heart failure symptoms. Propafenone, a class IC antiarrhythmic, is mainly used to treat conditions that cause a fast heart rate, such as atrial fibrillation and atrial flutter, and ventricular arrhythmias, but may not be suitable for a patient with heart failure. On the other hand, digoxin has been traditionally used for rate control in patients with heart failure and atrial fibrillation, as it increases the strength of the heart's contractions and slows down the electrical impulses in the AV node.",
            "final_answer": "D",
        },
    ],
    "mmlu_mc": [
        {
            "question": "A 55-year-old man is brought to the emergency department by his wife because he told her he did not want to live anymore. During the past 6 weeks, he has experienced fatigue, loss of interest in usual activities, a 7-kg (15-lb) weight loss, and insomnia. He has no history of serious medical or psychiatric illness. Vital signs are temperature 37.0°C (98.6°F), pulse 80/min, respirations 16/min, and blood pressure 140/82 mm Hg. Physical examination discloses no abnormalities. Beck Depression Inventory score is 35 (severely depressed). He says he feels guilty that his investments have not done well and that he has ruined his family finances. He reports he hears voices at night telling him he has sinned. In addition to olanzapine, which of the following is the best treatment option for this patient? A:Divalproex, B:Fluoxetine, C:Lamotrigine, D:Lithium carbonate",
            "content": "",
            "analysis": "The patient presents with symptoms indicative of a major depressive episode with possible psychotic features, as indicated by his hearing voices. The severity of his depression is confirmed by his high Beck Depression Inventory score. As such, his treatment should address both the depressive symptoms and the psychotic symptoms.",
            "final_answer": "B",
        },
    ],
    "medmcqa": [
        {
            "question": "A 42year old female presents with diazepam and alcohol overdose. She is comatose. Temperature is 34.5degC. BP is 100/80 mmHg. Creatinine is 2.4mg/dL, AST -500, GGT- 35 IU. Urine dipstick showed 3+ for blood but urine analysis was normal. USG abdomen was normal. What is the most likely diagnosis? A:Hypothermia, B: Alcoholic hallucinosis, C: Rhabdomyolysis, D: Acute interstitial nephritis",
            "content": "",
            "analysis": "The patient has presented following an overdose of diazepam and alcohol. Given the symptoms and clinical findings, we are considering several potential diagnoses: Hypothermia, Alcoholic hallucinosis, Rhabdomyolysis, and Acute interstitial nephritis.Hypothermia (option A) could be suggested by the low body temperature, but it doesn't account for all of the patient's symptoms and test results.Alcoholic hallucinosis (option B) could be a possibility given the alcohol overdose, but the patient is comatose and not displaying signs of hallucinations.Acute interstitial nephritis (option D) might cause an increase in creatinine, but it generally does not result in blood in the urine without abnormality in urine analysis.Rhabdomyolysis (option C), on the other hand, can result from a drug overdose. This condition causes muscle breakdown, which releases myoglobin into the bloodstream. Myoglobin can cause renal failure, which could explain the increased creatinine. It also can show up as blood on a urine dipstick test while other aspects of a urine analysis remain normal, due to the fact that common urine dipstick tests cannot differentiate between myoglobin and hemoglobin.",
            "final_answer": "C",
        },
    ],
    "moderna_interactions": [
        {
            "question": "Does the therapeutic efficacy of Moderna COVID-19 Vaccine decrease when used in combination with abatacept?",
            "content": "",
            "analysis": "From the given context, it can be concluded that the therapeutic efficacy of Moderna COVID-19 Vaccine does decrease when used with abatacept",
            "final_answer": "yes"
        },
    ],

}

In [None]:
# Layout of a single few-shot demonstration: the question and its supporting
# content, then the worked analysis and the final answer.
fs_template = PromptTemplate(
    template=(
        "Question: {question}\n"
        "Content: {content}\n"
        "Analysis: {analysis}\n"
        "Final Answer: {final_answer}"
    ),
    input_variables=["question", "content", "analysis", "final_answer"],
)

In [None]:
def fs_prompt_temp(knowledge_set):
    """Build the few-shot prompt template for one dataset.

    Args:
        knowledge_set: Dataset name; must be a key of FS_EXAMPLES.

    Returns:
        A FewShotPromptTemplate that renders the dataset's examples with
        fs_template and ends with the unanswered question/content pair, which
        is filled through "input_question" and "input_content".

    Raises:
        KeyError: If `knowledge_set` has no entry in FS_EXAMPLES.
    """
    return FewShotPromptTemplate(
        input_variables=["input_question", "input_content"],
        suffix="Question: {input_question}\nContent: {input_content}",
        example_prompt=fs_template,
        examples=FS_EXAMPLES[knowledge_set],
    )

#### Few-shot generator for chat conversations

In [None]:
def generate_fs_prompt_chat(knowledge_set):
    """Turn a dataset's few-shot examples into chat-style messages.

    Args:
        knowledge_set: Dataset name; must be a key of FS_EXAMPLES.

    Returns:
        A flat list of {'role', 'content'} dicts. Each example contributes a
        'user' message (question and content) immediately followed by an
        'assistant' message (analysis and final answer).

    Raises:
        KeyError: If `knowledge_set` has no entry in FS_EXAMPLES.
    """
    fs_messages = []
    for shot in FS_EXAMPLES[knowledge_set]:
        fs_messages.append(
            {'role': 'user', 'content': f"Question: {shot['question']}\nContent: {shot['content']}"}
        )
        fs_messages.append(
            {'role': 'assistant', 'content': f"\nAnalysis: {shot['analysis']}\nFinal Answer: {shot['final_answer']}"}
        )
    return fs_messages

### Final Prompt

In [None]:
def generate_final_prompt(knowledge_set, input_question, input_content):
    """Render both halves of the prompt for one question.

    Args:
        knowledge_set: Dataset name used to select the instructions and examples.
        input_question: The question to be answered.
        input_content: Supporting content shown alongside the question.

    Returns:
        A two-element list: [system prompt text, few-shot prompt text].
    """
    fs_prompt_input = fs_prompt_temp(knowledge_set).format(
        input_question=input_question,
        input_content=input_content,
    )
    return [generate_sys_prompt(knowledge_set), fs_prompt_input]

In [None]:
# Final prompt: the system prompt followed by the few-shot prompt.
prompt_template = PromptTemplate(
    template="{sys_prompt}\n{fs_prompt}",
    input_variables=["sys_prompt", "fs_prompt"],
)
# `prompt_input` is a second name bound to the same template object.
prompt_input = prompt_template

# Quick visual check of the rendered prompt with placeholder inputs.
sys_prompt, fs_prompt = generate_final_prompt(
    knowledge_set='moderna_interactions',
    input_question='hi',
    input_content='yo',
)
print(prompt_template.format(sys_prompt=sys_prompt, fs_prompt=fs_prompt))

Task: You are tasked to answer yes/no questions based on the provided content about whether the therapeutic efficacy of Moderna COVID-19 Vaccine decrease when used with another drug. Understand the question, extract relevant information, analyze it, and provide a concise and accurate answer.
Answer Format: 
Analysis: Provide an analysis that logically leads to the answer based on the relevant information.
Final Answer: Provide the final answer. The answer should be 'yes' or 'no'.
        
Not to dos: Do not make assumptions not supported by the content. Avoid providing personal opinions or interpretations. Summarize and interpret the information as objectively and accurately as possible. You are providing an analysis, not diagnosing or treating medical conditions.
----Below are some examples-----

Question: Does the therapeutic efficacy of Moderna COVID-19 Vaccine decrease when used in combination with abatacept?
Content: The therapeutic efficacy of Moderna COVID-19 Vaccine decrease wh

## Parsers


#### binary_parser

In [None]:
def binary_parser(output):
    """Extract the analysis and a yes/no verdict from a model response.

    The response is scanned line by line (case-insensitively, ignoring
    surrounding whitespace and Windows line endings) for an "Analysis:" line
    and a "Final Answer:" line. If either label occurs more than once, the
    last occurrence wins.

    Args:
        output: Raw text returned by the model.

    Returns:
        Tuple (analysis, final_answer). `analysis` is the lower-cased text
        after "Analysis:" ('' if absent). `final_answer` is 'yes', 'no', or
        '' when neither appears as a whole word in the final-answer line.
    """
    analysis_prefix = "analysis:"
    answer_prefix = "final answer:"
    analysis = ""
    final_answer = ""
    for raw_line in output.splitlines():
        line = raw_line.strip().lower()
        # Slice off only the leading label; str.replace would also delete any
        # later occurrence of the label inside the text itself.
        if line.startswith(analysis_prefix):
            analysis = line[len(analysis_prefix):].strip()
        elif line.startswith(answer_prefix):
            final_answer = line[len(answer_prefix):].strip()

    # Match whole words only: a substring test would read "cannot" or
    # "unknown" as 'no'. 'yes' keeps priority over 'no' when both appear.
    words = "".join(ch if ch.isalpha() else " " for ch in final_answer).split()
    if 'yes' in words:
        final_answer = 'yes'
    elif 'no' in words:
        final_answer = 'no'
    else:
        final_answer = ""
    return analysis, final_answer


#### mc_parser

In [None]:
def mc_parser(output):
    """Extract the analysis and the chosen option letter from a model response.

    The response is scanned line by line (case-insensitively, ignoring
    surrounding whitespace and Windows line endings) for an "Analysis:" line
    and a "Final Answer:" line. If either label occurs more than once, the
    last occurrence wins.

    Args:
        output: Raw text returned by the model.

    Returns:
        Tuple (analysis, final_answer). `analysis` is the lower-cased text
        after "Analysis:" ('' if absent). `final_answer` is the first
        alphanumeric character of the final-answer text when that character
        is a letter (lower-cased), otherwise ''.
    """
    analysis_prefix = "analysis:"
    answer_prefix = "final answer:"
    analysis = ""
    final_answer = ""
    for raw_line in output.splitlines():
        line = raw_line.strip().lower()
        # Slice off only the leading label; str.replace would also delete any
        # later occurrence of the label inside the text itself.
        if line.startswith(analysis_prefix):
            analysis = line[len(analysis_prefix):].strip()
        elif line.startswith(answer_prefix):
            final_answer = line[len(answer_prefix):].strip()

    # Skip decoration such as "(b)" or "**d**" before reading the option
    # letter; an answer that starts with a digit is still rejected.
    first_alnum = next((ch for ch in final_answer if ch.isalnum()), "")
    final_answer = first_alnum if first_alnum.isalpha() else ""
    return analysis, final_answer


#### text_parser

In [None]:
def text_parser(output):
    """Extract the analysis and the free-text final answer from a model response.

    The response is scanned line by line (case-insensitively, ignoring
    surrounding whitespace and Windows line endings) for an "Analysis:" line
    and a "Final Answer:" line. If either label occurs more than once, the
    last occurrence wins.

    Args:
        output: Raw text returned by the model.

    Returns:
        Tuple (analysis, final_answer), both lower-cased and stripped; each is
        '' when its line is absent.
    """
    analysis_prefix = "analysis:"
    answer_prefix = "final answer:"
    analysis = ""
    final_answer = ""
    for raw_line in output.splitlines():
        line = raw_line.strip().lower()
        # Slice off only the leading label; str.replace would also delete any
        # later occurrence of the label inside the text itself.
        if line.startswith(analysis_prefix):
            analysis = line[len(analysis_prefix):].strip()
        elif line.startswith(answer_prefix):
            final_answer = line[len(answer_prefix):].strip()
    return analysis, final_answer


#### parser_dict

In [None]:
# Response parser for each dataset, chosen by the dataset's answer format:
# yes/no -> binary_parser, option letter -> mc_parser, free text -> text_parser.
parser_dict = dict(
    pubmedqa=binary_parser,
    ade=text_parser,
    chatDoctor=text_parser,
    DDI_binary=binary_parser,
    drug_usage=text_parser,
    medmcqa=mc_parser,
    mmlu_mc=mc_parser,
    usmle_mc=mc_parser,
    moderna_interactions=binary_parser,
)

## LLM Chain

In [None]:
# Chain the combined prompt template with the LLM.
# Requires `llm` to exist: run the cell that builds it first, otherwise this
# line raises NameError.
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

NameError: ignored

#### Sanity Check

In [None]:
# End-to-end smoke test: build a pubmedqa prompt, run it through the chain and
# parse the response.
test_knowledge_set = 'pubmedqa'
test_question = 'Is methotrexate an alternative to azathioprine in neuromyelitis optica spectrum disorders with aquaporin-4 antibodies?'
test_content = ''
test_sys_prompt, test_fs_prompt = generate_final_prompt(knowledge_set=test_knowledge_set, input_question=test_question, input_content=test_content)
result = llm_chain.run({"sys_prompt": test_sys_prompt, "fs_prompt": test_fs_prompt})
# No `parser` name is defined in this notebook; use the parser registered for
# the dataset being tested.
analysis, final_answer = parser_dict[test_knowledge_set](result)
print("Analysis:\n", analysis)
print("Final Answer:\n", final_answer)

## Data Pre-processing

### Load Data

In [None]:
def load_data(key, data_paths_dict, threshold):
    """Read one dataset CSV and return its columns as plain lists.

    Args:
        key: Dataset name; selects the entry of `data_paths_dict`.
        data_paths_dict: Mapping of dataset name to a record whose 'data'
            field is the CSV path.
        threshold: Maximum number of rows to keep; longer files are cut to
            their first `threshold` rows.

    Returns:
        Dict with the keys 'id', 'question', 'context' and 'answer', each
        mapping to the list of values in that CSV column.
    """
    frame = pd.read_csv(data_paths_dict[key]['data'])

    # Only the leading rows are kept when the file exceeds the limit.
    if len(frame) > threshold:
        frame = frame[:threshold]

    data_dict = {
        column: frame[column].tolist()
        for column in ('id', 'question', 'context', 'answer')
    }

    n_samples = len(data_dict['question'])
    print(f"We have {n_samples} data samples in {key} dataset")

    return data_dict


In [None]:
data_df_dict = {}
knowledge_df_dict = {'text': {}, 'embedding': {}}

# Datasets for which a knowledge-block CSV is loaded.
# NOTE(review): `knowledge_blocks_dict` is not defined in the cells shown here.
_KNOWLEDGE_KEYS = ('medmcqa', 'mmlu_mc', 'usmle_mc', 'DDI_binary', 'drug_usage', 'chatDoctor', 'moderna_interactions')
# Datasets for which only the question/answer columns are read, with no row limit.
_QA_ONLY_KEYS = ('DDI_binary', 'drug_usage', 'moderna_interactions')

for key in data_paths_dict:
    if key in _KNOWLEDGE_KEYS:
        knowledge_df_dict['text'][key] = pd.read_csv(knowledge_blocks_dict[key])
    if key in _QA_ONLY_KEYS:
        data_path = data_paths_dict[key]['data']
        df = pd.read_csv(data_path)
        data_dict = {column: df[column].tolist() for column in ('question', 'answer')}
        n_samples = len(data_dict['question'])
        data_df_dict[key] = data_dict
        print(f"We have {n_samples} data samples in {key} dataset")
    else:
        # Full schema (id/question/context/answer), capped at SLICE_THRESHOLD rows.
        data_df_dict[key] = load_data(key, data_paths_dict, SLICE_THRESHOLD)

We have 5000 data samples in pubmedqa dataset
We have 5000 data samples in ade dataset
We have 796 data samples in chatDoctor dataset
We have 600 data samples in DDI_binary dataset
We have 2888 data samples in drug_usage dataset
We have 2000 data samples in medmcqa dataset
We have 956 data samples in mmlu_mc dataset
We have 1499 data samples in usmle_mc dataset
We have 200 data samples in moderna_interactions dataset


### Dataloader

In [None]:
def dataloader(key, data_df_dict, random_state=None):
    """Draw an evaluation sample for one dataset, class-balanced where applicable.

    The dataset type is read from the notebook-level ``data_paths_dict[key]['type']``
    and the sample size is capped by the notebook-level ``EVALUATION_SIZE``:

    * ``'binary'``: the same number of ``'yes'`` and ``'no'`` rows.
    * ``'mc'``: the same number of ``'A'``, ``'B'``, ``'C'`` and ``'D'`` rows.
    * anything else: a plain random sample of the rows.

    Only the labels listed above are counted when sizing the per-label quota,
    so any other answer value (for example a third class in a binary dataset)
    neither shrinks the sample nor ends up in it. An expected label that has
    no rows at all contributes zero rows instead of raising.

    Args:
        key: Dataset name; must be a key of ``data_df_dict`` and ``data_paths_dict``.
        data_df_dict: Mapping of dataset name to a dict of equal-length lists
            that contains at least ``'question'`` and ``'answer'``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) keeps it random.

    Returns:
        dict with ``'sample'`` (questions) and ``'label'`` (answers), index-aligned.
        For balanced types the rows are grouped by label in the order listed above.
    """
    df = pd.DataFrame(data_df_dict[key])

    # Never ask for more rows than the dataset holds.
    sample_size = min(EVALUATION_SIZE, len(df))

    labels_by_type = {'binary': ['yes', 'no'], 'mc': ['A', 'B', 'C', 'D']}
    balanced_labels = labels_by_type.get(data_paths_dict[key]['type'])

    if balanced_labels is None:
        # Unbalanced dataset type: plain random sample.
        sample_df = df.sample(n=sample_size, random_state=random_state)
        print(f"we have {len(sample_df)} of samples")
    else:
        groups = [df[df['answer'] == label] for label in balanced_labels]

        # Quota per label: an equal share of sample_size, limited by the
        # rarest label that is actually present.
        present_sizes = [len(rows) for rows in groups if len(rows) > 0]
        quota = min(sample_size // len(balanced_labels), min(present_sizes)) if present_sizes else 0

        parts = [
            rows.sample(n=min(quota, len(rows)), replace=False, random_state=random_state)
            for rows in groups
        ]
        summary = " and ".join(f"{len(part)} of {label}" for label, part in zip(balanced_labels, parts))
        print(f"we have {summary} in the loaded {key} evaluation set")

        # Skip empty parts when concatenating; fall back to an empty frame
        # (which still carries the columns) when nothing was drawn.
        non_empty = [part for part in parts if len(part) > 0]
        sample_df = pd.concat(non_empty) if non_empty else parts[0]

    sample_data = {
        'sample': sample_df['question'].tolist(),
        'label': sample_df['answer'].tolist(),
    }

    return sample_data


In [None]:
# Draw one evaluation sample per dataset, in the order the names are listed.
pubmed_eval, medmcqa_eval, moderna_eval = (
    dataloader(name, data_df_dict)
    for name in ('pubmedqa', 'medmcqa', 'moderna_interactions')
)

we have 500 of yes and 500 of no in the loaded pubmedqa evaluation set
250
we have 250 of A and 250 of B and 250 of C and 250 of D in the loaded medmcqa evaluation set
we have 100 of yes and 100 of no in the loaded moderna_interactions evaluation set


In [None]:
# Show the first question of the PubMedQA sample followed by its label.
print(pubmed_eval['sample'][0], pubmed_eval['label'][0], sep='\n')

Does vasopressin improve survival compared with epinephrine in a neonatal piglet model of asphyxial cardiac arrest?
yes


## OpenAI API Handlers

In [None]:
# Function to create a chat message and get a response
def chat_handler(query, prompt, few_shots, model, max_retries=None, retry_delay=1.0):
    """Send one chat request to the OpenAI API and return the reply text.

    The request is retried whenever the API call (or reading its response)
    raises. Retries are spaced with exponential backoff so that a persistent
    failure does not turn into a tight loop of requests.

    Args:
        query: User message to send.
        prompt: System message placed first in the conversation.
        few_shots: List of message dicts inserted between the system message
            and the user message.
        model: Model name forwarded to ``openai.ChatCompletion.create``.
        max_retries: Maximum number of retries after the first failure.
            ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to wait before the first retry; the wait doubles
            on each further retry, capped at 60 seconds. ``0`` disables waiting.

    Returns:
        The content of the first choice's message, or ``None`` when the
        notebook-level ``USEOPENAI`` flag is false.

    Raises:
        Exception: the last API error, once ``max_retries`` is exhausted.
    """
    import time  # local import: `time` is not among the notebook's top-level imports

    if not USEOPENAI:
        return None

    # Built outside the retry loop: a malformed `few_shots` is a caller error
    # that retrying cannot fix, so it should surface immediately.
    messages = [
        {'role': 'system', 'content': prompt},
        *few_shots,
        {'role': 'user', 'content': query},
    ]

    attempt = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
                messages=messages,
                model=model,
                temperature=0.1,
            )
            return response['choices'][0]['message']['content']
        except Exception as e:
            attempt += 1
            print(f"Chat API error: {e}")
            if max_retries is not None and attempt > max_retries:
                raise
            # Exponential backoff; the exponent is bounded so the delay
            # computation stays cheap however long the loop runs.
            delay = min(retry_delay * 2 ** min(attempt - 1, 10), 60.0)
            if delay > 0:
                time.sleep(delay)


## Chat Implementation (set `local` to True to evaluate other models; currently evaluating OpenAI models)

In [None]:
def ask(query, model, knowledge_key, embedding_process, useknowledge, local):
    """Answer one question with either a local model chain or the OpenAI chat API.

    Args:
        query: The question text.
        model: When ``local`` is true, an object whose ``run`` method accepts a
            dict with ``"sys_prompt"`` and ``"fs_prompt"``; otherwise the model
            name passed on to ``chat_handler``.
        knowledge_key: Dataset key used to build the system and few-shot
            prompts on the OpenAI path.
        embedding_process: Not used by this function; kept so existing calls
            keep working.
        useknowledge: Not used by this function; kept so existing calls keep
            working.
        local: Selects the local-model path when true, the OpenAI path otherwise.

    Returns:
        ``[chat_response, CoT, response]``. The raw model response is returned
        unparsed as both ``chat_response`` and ``response``. ``CoT`` is always
        the empty string: no analysis is extracted here. (The function used to
        read a name ``analysis`` that it never assigned, which raised
        ``NameError`` or picked up whatever another cell had left behind.)
    """
    CoT = ''
    # No domain-specific knowledge is attached to the question on either path.
    content = ''

    if local:
        # Build the system and few-shot prompts around the user question.
        sys_prompt, fs_prompt = generate_final_prompt(input_question=query, input_content=content)
        response = model.run({"sys_prompt": sys_prompt, "fs_prompt": fs_prompt})
    else:
        sys_prompt = generate_sys_prompt(knowledge_key)
        fs_prompt = generate_fs_prompt_chat(knowledge_key)
        # Wrap the question in the Question/Content layout before sending it.
        query = f"\nQuestion: {query} \nContent: {content}"
        response = chat_handler(query, sys_prompt, fs_prompt, model)

    chat_response = response
    return [chat_response, CoT, response]

## Evaluation

In [None]:
# Dataset used by the evaluation cells below; it must be a key of both
# data_df_dict and data_paths_dict, because dataloader indexes both with it.
# NOTE(review): the recorded output of the "With CoT" cell further down shows
# 200 yes/no items, which does not match 'mmlu_mc' — it was presumably produced
# with a different dataset_name; confirm before comparing numbers.
dataset_name = 'mmlu_mc'
# Draw the evaluation sample for that dataset.
evaluation_set = dataloader(dataset_name, data_df_dict)

190
we have 190 of A and 190 of B and 190 of C and 190 of D in the loaded mmlu_mc evaluation set


In [None]:
# Show the first question of the evaluation sample followed by its label.
print(evaluation_set['sample'][0], evaluation_set['label'][0], sep='\n')

A 35-year-old man comes to the office because of 1-week history of mid low back pain that radiates down his right leg. The pain began after the patient lifted a heavy box onto his truck. He rates his current pain as an 8 on a 10-point scale. He has been unable to find a comfortable position and has been sleeping in a recliner. Medical history is unremarkable and he takes no medications. He has smoked one pack of cigarettes daily for the past 25 years, and he drinks a six-pack of beer on Friday and Saturday nights. BMI is 27 kg/m2 . He appears uncomfortable and stands during the physical examination. Vital signs are normal. Straight-leg raise test is positive on the right, with loss of right ankle reflex. The remainder of the physical examination discloses no abnormalities. Which of the following is the most likely explanation for this patient’s symptoms? A:Displacement of the nucleus pulposus, B:Hypertrophy of the facet joints, C:Osteophyte formation, D:Spondylolisthesis
A


### Metrics

In [None]:
def check_text_accuracy(prediction, actual):
    """Decide whether a free-text prediction matches the expected answer.

    Trailing periods are stripped from ``prediction`` first. The prediction
    then counts as a match when either

    * every whitespace-separated word of ``actual`` occurs in it as a
      substring, or
    * the first characters of its words, joined together, equal ``actual``.

    Args:
        prediction: Text produced by the model.
        actual: Expected answer.

    Returns:
        True on a match, False otherwise.
    """
    cleaned = prediction.rstrip('.')

    # Substring containment of every expected word.
    for expected_word in actual.split():
        if expected_word not in cleaned:
            break
    else:
        return True

    # Otherwise fall back to comparing the initials against the expected answer.
    initials = ''.join(token[0] for token in cleaned.split())
    return initials == actual

In [None]:
def calculate_f1_metrics(prediction, label):
    """Score a ``', '``-separated prediction against a ``', '``-separated label.

    Both strings are split on ``', '`` and compared as sets; the two sets are
    printed before scoring.

    Args:
        prediction: Predicted items joined by ``', '``.
        label: Expected items joined by ``', '``.

    Returns:
        Tuple ``(precision, recall, f1_score)``; a ratio whose denominator is
        zero is reported as ``0``.
    """
    predicted_items = set(prediction.split(', '))
    expected_items = set(label.split(', '))
    print(predicted_items)
    print(expected_items)

    hits = len(predicted_items.intersection(expected_items))
    spurious = len(predicted_items.difference(expected_items))
    missed = len(expected_items.difference(predicted_items))

    def ratio(numerator, denominator):
        # Division that yields 0 instead of failing on an empty denominator.
        return numerator / denominator if denominator > 0 else 0

    precision = ratio(hits, hits + spurious)
    recall = ratio(hits, hits + missed)
    f1_score = ratio(2 * precision * recall, precision + recall)

    return precision, recall, f1_score


### GPT-3.5 Evaluation

#### Without CoT

In [None]:
# Direct-answer (no chain-of-thought) evaluation of GPT_3 on `evaluation_set`.
accurate_predictions = 0
print_process = True
basic_mc_prompt = 'Provide the final answer, which should be a single letter in the alphabet representing the best option among the multiple choices provided in the question. Follow this format:\nFinal Answer: A'
basic_binary_prompt = 'Provide the final answer, which should be a yes or no. Never say you dont know, always provide a yes or no answer regardless. If you dont have enough information, just guess. Follow this format:\nFinal Answer: yes'

# Ask for an option letter on multiple-choice datasets and for yes/no otherwise,
# using the same dataset type that dataloader reads.
system_prompt = basic_mc_prompt if data_paths_dict[dataset_name]['type'] == 'mc' else basic_binary_prompt

for i in tqdm(range(len(evaluation_set['sample'])), desc="Processing"):
    chat_response = chat_handler(evaluation_set['sample'][i], system_prompt, [], GPT_3)
    analysis, prediction = binary_parser(chat_response)
    if dataset_name == 'ade':
        if check_text_accuracy(prediction, evaluation_set['label'][i].lower()):
            accurate_predictions += 1
        else:
            print(f"\nWrong Answer Index: {i} Question: {evaluation_set['sample'][i]} Chat Response: {prediction}, Answer: {evaluation_set['label'][i]}")
    else:
        # The prompt asks for "Final Answer: <x>", so drop that prefix before
        # comparing; comparing the raw reply could never equal the bare label.
        # Replies without the prefix are handled as before (text up to the
        # first period, lower-cased).
        prediction = chat_response.split("Final Answer:")[-1].strip().split(".")[0].lower()
        if prediction == evaluation_set['label'][i].lower():
            accurate_predictions += 1
        else:
            print(f"\nWrong Answer Index: {i} Chat Response: {chat_response}, Answer: {evaluation_set['label'][i]}")
print(f"\nAccuracy: {accurate_predictions}/{len(evaluation_set['sample']) }")

#### With CoT

In [None]:
# Chain-of-thought evaluation of GPT_3 on `evaluation_set`; the raw and parsed
# answers are written to the CSV path registered for the dataset.
accurate_predictions = 0
print_process = True
basic_binary_prompt = 'Analyze the question first, Provide the final answer, which should be a yes or no. Never say you dont know, always provide a yes or no answer regardless. If you dont have enough information, just guess. You must Follow this format:\nAnalysis: your analysis\nFinal Answer: no'
basic_mc_prompt = 'Analyze the question first, then provide the final answer, which should be a single letter in the alphabet representing the best option among the multiple choices provided in the question. Follow this format:\nAnalysis: your analysis\nFinal Answer: A'
basic_text_prompt = 'Analyze the question first, then provide the final answer, which should be a single term consisting of english words. Follow this format:\nAnalysis: your analysis\nFinal Answer: the adverse drug reaction you identified'
basic_double_binary_prompt = 'Your task is to answer to questions. Analyze the questions first, then provide the final answer, which should consist of two terms, each being either "yes" or "no" corresponding to the questions. Never say you dont know, always provide a yes or no answer regardless. If you dont have enough information, just guess. Follow this format:\n\nAnalysis: your analysis\nFinal Answer: no, no'
basic_text_list_prompt = '''Your task is to identify a list of medications. Analyze the question first, then provide the final answer, which should consist of drug names, each separated by a comma. Never say you don't know, always provide an answer regardless. If you don't have enough information, just guess. Follow this format:
Analysis: your analysis
Final Answer: DrugA, DrugB, DrugC, ...
'''

# Pick the prompt whose answer format matches how the dataset is scored below.
# NOTE(review): every reply is still parsed with binary_parser, as before —
# confirm it extracts the "Final Answer:" text for the non-binary formats too.
if dataset_name == 'ade':
    system_prompt = basic_text_prompt
elif dataset_name == 'drug_usage':
    system_prompt = basic_double_binary_prompt
elif dataset_name == 'chatDoctor':
    system_prompt = basic_text_list_prompt
elif data_paths_dict[dataset_name]['type'] == 'mc':
    system_prompt = basic_mc_prompt
else:
    system_prompt = basic_binary_prompt

CoT_list = []
QA_list = []
precision_list = []
recall_list = []
f1_list = []
for i in tqdm(range(len(evaluation_set['sample'])), desc="Processing"):
    chat_response = chat_handler(evaluation_set['sample'][i], system_prompt, [], GPT_3)
    analysis, prediction = binary_parser(chat_response)
    CoT_list.append(chat_response)
    QA_list.append(prediction)
    if dataset_name == 'ade':
        if check_text_accuracy(prediction, evaluation_set['label'][i].lower()):
            accurate_predictions += 1
        else:
            print(f"\nWrong Answer Index: {i} Question: {evaluation_set['sample'][i]} Chat Response: {prediction}, Answer: {evaluation_set['label'][i]}")
    elif dataset_name == 'drug_usage':
        # Two yes/no answers are expected. Compare them as lists so that a reply
        # with the wrong number of parts counts as wrong instead of raising
        # during tuple unpacking.
        answers = [part.strip().lower() for part in prediction.split(",")]
        labels = [part.strip() for part in evaluation_set['label'][i].lower().split(",")]
        if answers == labels:
            accurate_predictions += 1
        else:
            print(f"\nWrong Answer Index: {i} Question: {evaluation_set['sample'][i]} Chat Response: {chat_response}, Answer: {evaluation_set['label'][i]}")
    elif dataset_name == 'chatDoctor':
        # Set-based precision/recall/F1 between the predicted and expected lists.
        precision, recall, f1 = calculate_f1_metrics(prediction, evaluation_set['label'][i].lower())
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
    else:
        if prediction == evaluation_set['label'][i].lower():
            accurate_predictions += 1
        else:
            print(f"\nWrong Answer Index: {i} YO Chat Response: {prediction}, Answer: {evaluation_set['label'][i]}")

data = {
    'Questions': evaluation_set['sample'],
    'Long Answer': CoT_list,
    'Short Answer': QA_list,
    'Official Answer': evaluation_set['label']
}
# Convert the dictionary to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a csv file
df.to_csv(data_paths_dict[dataset_name]['GPT_3_answer'], index=False)

if dataset_name == 'chatDoctor':
    # chatDoctor is scored with precision/recall/F1, so report their means
    # rather than an accuracy count that is never incremented for it.
    if f1_list:
        print(f"\nMean Precision: {sum(precision_list) / len(precision_list):.3f} "
              f"Mean Recall: {sum(recall_list) / len(recall_list):.3f} "
              f"Mean F1: {sum(f1_list) / len(f1_list):.3f}")
else:
    print(f"\nAccuracy: {accurate_predictions}/{len(evaluation_set['sample']) }")

Processing:   1%|          | 2/200 [00:05<09:05,  2.75s/it]


Wrong Answer Index: 1 YO Chat Response: no, Answer: yes


Processing:   2%|▎         | 5/200 [00:15<09:41,  2.98s/it]


Wrong Answer Index: 4 YO Chat Response: no, Answer: yes


Processing:   3%|▎         | 6/200 [00:18<09:42,  3.00s/it]


Wrong Answer Index: 5 YO Chat Response: no, Answer: yes


Processing:   4%|▍         | 9/200 [00:25<08:01,  2.52s/it]


Wrong Answer Index: 8 YO Chat Response: no, Answer: yes


Processing:   6%|▌         | 11/200 [00:30<07:52,  2.50s/it]


Wrong Answer Index: 10 YO Chat Response: no, Answer: yes


Processing:   6%|▌         | 12/200 [00:32<07:21,  2.35s/it]


Wrong Answer Index: 11 YO Chat Response: no, Answer: yes


Processing:   7%|▋         | 14/200 [00:38<08:34,  2.77s/it]


Wrong Answer Index: 13 YO Chat Response: no, Answer: yes


Processing:   8%|▊         | 15/200 [00:40<07:56,  2.58s/it]


Wrong Answer Index: 14 YO Chat Response: no, Answer: yes


Processing:   8%|▊         | 17/200 [00:46<08:16,  2.71s/it]


Wrong Answer Index: 16 YO Chat Response: no, Answer: yes


Processing:  10%|█         | 21/200 [00:56<08:22,  2.81s/it]


Wrong Answer Index: 20 YO Chat Response: no, Answer: yes


Processing:  12%|█▏        | 24/200 [01:06<08:49,  3.01s/it]


Wrong Answer Index: 23 YO Chat Response: no, Answer: yes


Processing:  14%|█▍        | 29/200 [01:20<08:33,  3.00s/it]


Wrong Answer Index: 28 YO Chat Response: no, Answer: yes


Processing:  17%|█▋        | 34/200 [01:32<07:01,  2.54s/it]


Wrong Answer Index: 33 YO Chat Response: no, Answer: yes


Processing:  18%|█▊        | 36/200 [01:37<07:05,  2.59s/it]


Wrong Answer Index: 35 YO Chat Response: no, Answer: yes


Processing:  18%|█▊        | 37/200 [01:40<07:03,  2.60s/it]


Wrong Answer Index: 36 YO Chat Response: no, Answer: yes


Processing:  20%|██        | 40/200 [01:47<06:34,  2.46s/it]


Wrong Answer Index: 39 YO Chat Response: no, Answer: yes


Processing:  22%|██▏       | 44/200 [01:58<06:44,  2.60s/it]


Wrong Answer Index: 43 YO Chat Response: no, Answer: yes


Processing:  24%|██▍       | 48/200 [02:10<07:21,  2.91s/it]


Wrong Answer Index: 47 YO Chat Response: no, Answer: yes


Processing:  24%|██▍       | 49/200 [02:13<07:32,  3.00s/it]


Wrong Answer Index: 48 YO Chat Response: no, Answer: yes


Processing:  26%|██▌       | 51/200 [02:18<07:01,  2.83s/it]


Wrong Answer Index: 50 YO Chat Response: no, Answer: yes


Processing:  26%|██▌       | 52/200 [02:21<06:38,  2.69s/it]


Wrong Answer Index: 51 YO Chat Response: no, Answer: yes


Processing:  28%|██▊       | 56/200 [02:32<06:59,  2.91s/it]


Wrong Answer Index: 55 YO Chat Response: no, Answer: yes


Processing:  30%|███       | 60/200 [02:41<05:32,  2.38s/it]


Wrong Answer Index: 59 YO Chat Response: no, Answer: yes


Processing:  31%|███       | 62/200 [02:46<05:46,  2.51s/it]


Wrong Answer Index: 61 YO Chat Response: no, Answer: yes


Processing:  32%|███▏      | 63/200 [02:48<05:22,  2.35s/it]


Wrong Answer Index: 62 YO Chat Response: no, Answer: yes


Processing:  32%|███▏      | 64/200 [02:50<05:14,  2.31s/it]


Wrong Answer Index: 63 YO Chat Response: no, Answer: yes


Processing:  34%|███▎      | 67/200 [02:59<06:08,  2.77s/it]


Wrong Answer Index: 66 YO Chat Response: no, Answer: yes


Processing:  36%|███▌      | 71/200 [03:09<05:19,  2.48s/it]


Wrong Answer Index: 70 YO Chat Response: no, Answer: yes


Processing:  36%|███▌      | 72/200 [03:11<05:26,  2.55s/it]


Wrong Answer Index: 71 YO Chat Response: no, Answer: yes


Processing:  38%|███▊      | 77/200 [03:24<05:17,  2.58s/it]


Wrong Answer Index: 76 YO Chat Response: no, Answer: yes


Processing:  39%|███▉      | 78/200 [03:27<05:19,  2.62s/it]

Chat API error: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 23 Jun 2023 03:02:25 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7db976589a924aa5-TPE', 'alt-svc': 'h3=":443"; ma=86400'}


Processing:  42%|████▏     | 83/200 [08:54<49:13, 25.24s/it]  


Wrong Answer Index: 82 YO Chat Response: no, Answer: yes


Processing:  42%|████▎     | 85/200 [08:58<25:59, 13.56s/it]


Wrong Answer Index: 84 YO Chat Response: no, Answer: yes


Processing:  44%|████▎     | 87/200 [09:04<15:02,  7.99s/it]


Wrong Answer Index: 86 YO Chat Response: no, Answer: yes


Processing:  45%|████▌     | 90/200 [09:13<08:54,  4.86s/it]


Wrong Answer Index: 89 YO Chat Response: no, Answer: yes


Processing:  46%|████▋     | 93/200 [09:21<06:09,  3.46s/it]


Wrong Answer Index: 92 YO Chat Response: no, Answer: yes


Processing:  49%|████▉     | 98/200 [09:35<04:33,  2.68s/it]


Wrong Answer Index: 97 YO Chat Response: no, Answer: yes


Processing:  50%|█████     | 101/200 [09:43<04:22,  2.65s/it]


Wrong Answer Index: 100 YO Chat Response: yes, Answer: no


Processing:  52%|█████▎    | 105/200 [09:52<03:34,  2.26s/it]


Wrong Answer Index: 104 YO Chat Response: yes, Answer: no
Chat API error: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 23 Jun 2023 03:08:50 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7db97fbd3a3e4a6d-TPE', 'alt-svc': 'h3=":443"; ma=86400'}


Processing:  55%|█████▍    | 109/200 [15:15<52:41, 34.74s/it]  


Wrong Answer Index: 108 YO Chat Response: yes, Answer: no


Processing:  56%|█████▋    | 113/200 [15:26<15:15, 10.53s/it]


Wrong Answer Index: 112 YO Chat Response: yes, Answer: no


Processing:  57%|█████▋    | 114/200 [15:29<11:36,  8.10s/it]


Wrong Answer Index: 113 YO Chat Response: yes, Answer: no


Processing:  58%|█████▊    | 117/200 [15:36<06:09,  4.45s/it]


Wrong Answer Index: 116 YO Chat Response: yes, Answer: no


Processing:  60%|██████    | 120/200 [15:45<04:32,  3.40s/it]

Chat API error: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 23 Jun 2023 03:14:43 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7db9885df9566a81-TPE', 'alt-svc': 'h3=":443"; ma=86400'}


Processing:  61%|██████    | 122/200 [21:02<1:29:02, 68.50s/it]


Wrong Answer Index: 121 YO Chat Response: yes, Answer: no


Processing:  62%|██████▏   | 124/200 [21:08<44:15, 34.94s/it]  


Wrong Answer Index: 123 YO Chat Response: yes, Answer: no


Processing:  63%|██████▎   | 126/200 [21:15<23:13, 18.83s/it]


Wrong Answer Index: 125 YO Chat Response: yes, Answer: no


Processing:  64%|██████▍   | 128/200 [21:21<13:00, 10.85s/it]

Chat API error: The server is overloaded or not ready yet.


Processing:  66%|██████▌   | 131/200 [22:00<11:30, 10.00s/it]


Wrong Answer Index: 130 YO Chat Response: yes, Answer: no


Processing:  69%|██████▉   | 138/200 [22:18<03:31,  3.41s/it]


Wrong Answer Index: 137 YO Chat Response: yes, Answer: no


Processing:  70%|██████▉   | 139/200 [22:21<03:18,  3.26s/it]


Wrong Answer Index: 138 YO Chat Response: yes, Answer: no


Processing:  70%|███████   | 141/200 [22:25<02:37,  2.66s/it]


Wrong Answer Index: 140 YO Chat Response: yes, Answer: no


Processing:  71%|███████   | 142/200 [22:27<02:23,  2.48s/it]


Wrong Answer Index: 141 YO Chat Response: yes, Answer: no


Processing:  72%|███████▏  | 143/200 [22:31<02:42,  2.84s/it]


Wrong Answer Index: 142 YO Chat Response: yes, Answer: no


Processing:  73%|███████▎  | 146/200 [22:39<02:22,  2.63s/it]


Wrong Answer Index: 145 YO Chat Response: yes, Answer: no


Processing:  74%|███████▍  | 148/200 [22:44<02:22,  2.73s/it]


Wrong Answer Index: 147 YO Chat Response: yes, Answer: no


Processing:  76%|███████▌  | 151/200 [22:51<01:53,  2.32s/it]


Wrong Answer Index: 150 YO Chat Response: yes, Answer: no


Processing:  78%|███████▊  | 157/200 [23:07<01:55,  2.70s/it]


Wrong Answer Index: 156 YO Chat Response: yes, Answer: no


Processing:  80%|████████  | 160/200 [23:15<01:55,  2.88s/it]


Wrong Answer Index: 159 YO Chat Response: yes, Answer: no


Processing:  80%|████████  | 161/200 [23:18<01:47,  2.76s/it]


Wrong Answer Index: 160 YO Chat Response: yes, Answer: no


Processing:  82%|████████▎ | 165/200 [23:28<01:35,  2.73s/it]


Wrong Answer Index: 164 YO Chat Response: yes, Answer: no


Processing:  83%|████████▎ | 166/200 [23:31<01:29,  2.62s/it]


Wrong Answer Index: 165 YO Chat Response: yes, Answer: no


Processing:  84%|████████▎ | 167/200 [23:34<01:30,  2.74s/it]


Wrong Answer Index: 166 YO Chat Response: yes, Answer: no


Processing:  84%|████████▍ | 168/200 [23:37<01:33,  2.91s/it]


Wrong Answer Index: 167 YO Chat Response: yes, Answer: no


Processing:  86%|████████▌ | 172/200 [23:47<01:08,  2.43s/it]


Wrong Answer Index: 171 YO Chat Response: yes, Answer: no


Processing:  86%|████████▋ | 173/200 [23:49<01:03,  2.35s/it]

Chat API error: The server is overloaded or not ready yet.


Processing:  88%|████████▊ | 175/200 [24:24<03:39,  8.78s/it]


Wrong Answer Index: 174 YO Chat Response: yes, Answer: no


Processing:  94%|█████████▎| 187/200 [24:55<00:37,  2.89s/it]


Wrong Answer Index: 186 YO Chat Response: yes, Answer: no


Processing:  94%|█████████▍| 188/200 [24:58<00:33,  2.78s/it]


Wrong Answer Index: 187 YO Chat Response: yes, Answer: no


Processing:  94%|█████████▍| 189/200 [25:01<00:32,  2.91s/it]


Wrong Answer Index: 188 YO Chat Response: yes, Answer: no


Processing:  98%|█████████▊| 197/200 [25:22<00:08,  2.69s/it]


Wrong Answer Index: 196 YO Chat Response: yes, Answer: no


Processing: 100%|█████████▉| 199/200 [25:28<00:02,  2.95s/it]


Wrong Answer Index: 198 YO Chat Response: yes, Answer: no


Processing: 100%|██████████| 200/200 [25:31<00:00,  7.66s/it]


Accuracy: 132/200





### GPT-4 Evaluation

#### Without CoT

In [None]:
# Direct-answer (no chain-of-thought) evaluation of GPT_4 on `evaluation_set`.
accurate_predictions = 0
print_process = True
basic_mc_prompt = 'Provide the final answer, which should be a single letter in the alphabet representing the best option among the multiple choices provided in the question. Follow this format:\nFinal Answer: A'
basic_binary_prompt = 'Provide the final answer, which should be a yes or no. Never say you dont know, always provide a yes or no answer regardless. If you dont have enough information, just guess. Follow this format:\nFinal Answer: yes'

# Ask for an option letter on multiple-choice datasets and for yes/no otherwise,
# using the same dataset type that dataloader reads.
system_prompt = basic_mc_prompt if data_paths_dict[dataset_name]['type'] == 'mc' else basic_binary_prompt

for i in tqdm(range(len(evaluation_set['sample'])), desc="Processing"):
    chat_response = chat_handler(evaluation_set['sample'][i], system_prompt, [], GPT_4)
    analysis, prediction = binary_parser(chat_response)
    if dataset_name == 'ade':
        if check_text_accuracy(prediction, evaluation_set['label'][i].lower()):
            accurate_predictions += 1
        else:
            print(f"\nWrong Answer Index: {i} Question: {evaluation_set['sample'][i]} Chat Response: {prediction}, Answer: {evaluation_set['label'][i]}")
    else:
        # The prompt asks for "Final Answer: <x>", so drop that prefix before
        # comparing; comparing the raw reply could never equal the bare label.
        # Replies without the prefix are handled as before (text up to the
        # first period, lower-cased).
        prediction = chat_response.split("Final Answer:")[-1].strip().split(".")[0].lower()
        if prediction == evaluation_set['label'][i].lower():
            accurate_predictions += 1
        else:
            print(f"\nWrong Answer Index: {i} Chat Response: {chat_response}, Answer: {evaluation_set['label'][i]}")
print(f"\nAccuracy: {accurate_predictions}/{len(evaluation_set['sample']) }")

250
we have 250 of A and 250 of B and 250 of C and 250 of D in the loaded medmcqa evaluation set


Processing:   1%|          | 8/1000 [00:11<23:18,  1.41s/it]


Wrong Answer Index: 7 Chat Response: Final Answer: B, Answer: A


Processing:   3%|▎         | 27/1000 [00:37<20:06,  1.24s/it]


Wrong Answer Index: 26 Chat Response: Final Answer: C, Answer: A


Processing:   3%|▎         | 28/1000 [00:39<20:39,  1.28s/it]


Wrong Answer Index: 27 Chat Response: Final Answer: C, Answer: A


Processing:   4%|▎         | 35/1000 [00:50<23:56,  1.49s/it]


Wrong Answer Index: 34 Chat Response: Final Answer: B, Answer: A


Processing:   4%|▍         | 38/1000 [00:54<25:42,  1.60s/it]


Wrong Answer Index: 37 Chat Response: Final Answer: C, Answer: A


Processing:   5%|▌         | 50/1000 [01:08<19:48,  1.25s/it]


Wrong Answer Index: 49 Chat Response: Final Answer: B, Answer: A


Processing:   5%|▌         | 51/1000 [01:10<20:33,  1.30s/it]


Wrong Answer Index: 50 Chat Response: Final Answer: C, Answer: A
Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID a0616f7b2be72004dc12c2e9de6ba781 in your message.)


Processing:   6%|▌         | 57/1000 [02:53<1:51:09,  7.07s/it]


Wrong Answer Index: 56 Chat Response: As an AI, I am not able to view images or interpret ECG results. Please provide the information from the ECG, and I will try to help answer the question., Answer: A


Processing:   8%|▊         | 82/1000 [03:25<18:50,  1.23s/it]


Wrong Answer Index: 81 Chat Response: Final Answer: B, Answer: A


Processing:   8%|▊         | 83/1000 [03:28<27:10,  1.78s/it]


Wrong Answer Index: 82 Chat Response: Final Answer: B, Answer: A


Processing:   8%|▊         | 84/1000 [03:29<24:48,  1.63s/it]


Wrong Answer Index: 83 Chat Response: Final Answer: C, Answer: A


Processing:   9%|▉         | 91/1000 [03:37<16:21,  1.08s/it]


Wrong Answer Index: 90 Chat Response: Final Answer: B, Answer: A


Processing:   9%|▉         | 94/1000 [03:40<16:30,  1.09s/it]


Wrong Answer Index: 93 Chat Response: Final Answer: C, Answer: A


Processing:  10%|▉         | 97/1000 [03:43<15:31,  1.03s/it]


Wrong Answer Index: 96 Chat Response: Final Answer: B, Answer: A


Processing:  10%|▉         | 99/1000 [03:46<17:07,  1.14s/it]

Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 49cbfa450388d8ee5271b18d437b9a0c in your message.)


Processing:  10%|█         | 104/1000 [05:23<1:55:19,  7.72s/it]

Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 869b2372eb7db511d8bbdee1d7ec6986 in your message.)


Processing:  11%|█         | 111/1000 [07:02<1:13:22,  4.95s/it]


Wrong Answer Index: 110 Chat Response: Final Answer: B, Answer: A


Processing:  12%|█▏        | 123/1000 [07:15<16:11,  1.11s/it]

Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID d4dfb132fdc6f5d31bc3344503fdd6b4 in your message.)


Processing:  13%|█▎        | 129/1000 [08:52<1:22:47,  5.70s/it]


Wrong Answer Index: 128 Chat Response: Final Answer: B, Answer: A


Processing:  15%|█▍        | 149/1000 [09:15<17:46,  1.25s/it]


Wrong Answer Index: 148 Chat Response: Final Answer: D, Answer: A


Processing:  15%|█▌        | 154/1000 [09:20<14:01,  1.01it/s]


Wrong Answer Index: 153 Chat Response: Final Answer: B, Answer: A


Processing:  16%|█▌        | 156/1000 [09:22<14:42,  1.05s/it]


Wrong Answer Index: 155 Chat Response: Final Answer: B, Answer: A


Processing:  17%|█▋        | 166/1000 [09:32<15:42,  1.13s/it]


Wrong Answer Index: 165 Chat Response: Final Answer: C, Answer: A


Processing:  18%|█▊        | 176/1000 [09:43<14:48,  1.08s/it]


Wrong Answer Index: 175 Chat Response: Final Answer: B, Answer: A


Processing:  18%|█▊        | 178/1000 [09:45<16:06,  1.18s/it]


Wrong Answer Index: 177 Chat Response: Final Answer: B, Answer: A


Processing:  18%|█▊        | 180/1000 [09:48<15:49,  1.16s/it]


Wrong Answer Index: 179 Chat Response: Final Answer: B, Answer: A


Processing:  20%|█▉        | 196/1000 [10:07<14:55,  1.11s/it]


Wrong Answer Index: 195 Chat Response: Final Answer: D, Answer: A


Processing:  20%|█▉        | 197/1000 [10:08<14:36,  1.09s/it]


Wrong Answer Index: 196 Chat Response: Final Answer: D, Answer: A


Processing:  20%|█▉        | 199/1000 [10:10<14:04,  1.05s/it]


Wrong Answer Index: 198 Chat Response: Final Answer: D, Answer: A


Processing:  21%|██        | 210/1000 [10:24<14:57,  1.14s/it]

Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 90d8873c11481c2bdeb165cd4dfebbe1 in your message.)


Processing:  21%|██▏       | 213/1000 [11:57<3:07:13, 14.27s/it]


Wrong Answer Index: 212 Chat Response: Final Answer: D, Answer: A


Processing:  23%|██▎       | 226/1000 [12:12<15:31,  1.20s/it]


Wrong Answer Index: 225 Chat Response: Final Answer: B, Answer: A


Processing:  23%|██▎       | 231/1000 [12:17<13:24,  1.05s/it]


Wrong Answer Index: 230 Chat Response: Final Answer: B, Answer: A


Processing:  24%|██▍       | 243/1000 [12:30<12:25,  1.01it/s]


Wrong Answer Index: 242 Chat Response: Final Answer: C, Answer: A


Processing:  25%|██▍       | 249/1000 [12:38<15:03,  1.20s/it]


Wrong Answer Index: 248 Chat Response: Final Answer: B, Answer: A


Processing:  25%|██▌       | 250/1000 [12:39<14:17,  1.14s/it]


Wrong Answer Index: 249 Chat Response: Final Answer: B, Answer: A


Processing:  26%|██▌       | 262/1000 [12:51<13:48,  1.12s/it]


Wrong Answer Index: 261 Chat Response: Final Answer: D, Answer: B


Processing:  26%|██▋       | 263/1000 [12:52<12:52,  1.05s/it]


Wrong Answer Index: 262 Chat Response: Final Answer: A, Answer: B


Processing:  26%|██▋       | 264/1000 [12:53<12:40,  1.03s/it]


Wrong Answer Index: 263 Chat Response: Final Answer: D, Answer: B


Processing:  28%|██▊       | 275/1000 [13:11<29:48,  2.47s/it]


Wrong Answer Index: 274 Chat Response: It is not possible for me to view the ECG image, as I am an AI language model and cannot process images. Please provide a description of the ECG or the condition it represents to help me provide an accurate answer., Answer: B


Processing:  28%|██▊       | 284/1000 [13:20<12:53,  1.08s/it]


Wrong Answer Index: 283 Chat Response: Final Answer: C, Answer: B


Processing:  29%|██▊       | 286/1000 [13:23<13:02,  1.10s/it]


Wrong Answer Index: 285 Chat Response: Final Answer: C, Answer: B


Processing:  30%|██▉       | 295/1000 [13:32<13:08,  1.12s/it]


Wrong Answer Index: 294 Chat Response: Final Answer: A, Answer: B


Processing:  30%|██▉       | 296/1000 [13:33<13:17,  1.13s/it]


Wrong Answer Index: 295 Chat Response: Final Answer: C, Answer: B


Processing:  31%|███       | 311/1000 [13:52<13:54,  1.21s/it]


Wrong Answer Index: 310 Chat Response: Final Answer: A, Answer: B


Processing:  32%|███▏      | 321/1000 [14:05<14:15,  1.26s/it]


Wrong Answer Index: 320 Chat Response: Final Answer: C, Answer: B


Processing:  32%|███▏      | 324/1000 [14:08<13:20,  1.18s/it]

Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID bce23e348ba2455c6a2b4a0544add7f8 in your message.)


Processing:  33%|███▎      | 328/1000 [15:43<1:56:14, 10.38s/it]


Wrong Answer Index: 327 Chat Response: Final Answer: A, Answer: B


Processing:  33%|███▎      | 332/1000 [15:49<42:50,  3.85s/it]


Wrong Answer Index: 331 Chat Response: Final Answer: C, Answer: B


Processing:  34%|███▎      | 335/1000 [15:52<22:51,  2.06s/it]


Wrong Answer Index: 334 Chat Response: Final Answer: A, Answer: B


Processing:  34%|███▎      | 336/1000 [15:53<18:56,  1.71s/it]


Wrong Answer Index: 335 Chat Response: Final Answer: A, Answer: B


Processing:  34%|███▍      | 338/1000 [15:55<15:07,  1.37s/it]


Wrong Answer Index: 337 Chat Response: Final Answer: D, Answer: B


Processing:  34%|███▍      | 341/1000 [15:58<12:36,  1.15s/it]


Wrong Answer Index: 340 Chat Response: Final Answer: D, Answer: B


Processing:  34%|███▍      | 344/1000 [16:01<11:37,  1.06s/it]


Wrong Answer Index: 343 Chat Response: Final Answer: A, Answer: B


Processing:  34%|███▍      | 345/1000 [16:02<11:16,  1.03s/it]


Wrong Answer Index: 344 Chat Response: Final Answer: C, Answer: B


Processing:  35%|███▌      | 351/1000 [16:08<10:43,  1.01it/s]


Wrong Answer Index: 350 Chat Response: Final Answer: D, Answer: B


Processing:  35%|███▌      | 352/1000 [16:09<10:35,  1.02it/s]


Wrong Answer Index: 351 Chat Response: Final Answer: C, Answer: B


Processing:  36%|███▋      | 364/1000 [16:30<13:42,  1.29s/it]


Wrong Answer Index: 363 Chat Response: Final Answer: C, Answer: B


Processing:  37%|███▋      | 368/1000 [16:34<11:24,  1.08s/it]


Wrong Answer Index: 367 Chat Response: Final Answer: C, Answer: B


Processing:  37%|███▋      | 373/1000 [16:39<10:37,  1.02s/it]

Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID f289232da84144581648551bc46b8c36 in your message.)


Processing:  38%|███▊      | 378/1000 [18:15<1:19:04,  7.63s/it]


Wrong Answer Index: 377 Chat Response: Final Answer: A, Answer: B


Processing:  38%|███▊      | 381/1000 [18:19<34:53,  3.38s/it]


Wrong Answer Index: 380 Chat Response: Final Answer: C, Answer: B


Processing:  38%|███▊      | 384/1000 [18:24<21:24,  2.08s/it]

Chat API error: Request failed due to server shutdown {
  "error": {
    "message": "Request failed due to server shutdown",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Sat, 10 Jun 2023 11:38:33 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'gpt-4-0314', 'openai-organization': 'user-so5pfryxritwlv4sc3ijfnpe', 'openai-processing-ms': '18904', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '200', 'x-ratelimit-limit-tokens': '40000', 'x-ratelimit-remaining-requests': '199', 'x-ratelimit-remaining-tokens': '39873', 'x-ratelimit-reset-requests': '300ms', 'x-ratelimit-reset-tokens': '190ms', 'x-request-id': '53f35b955e222f0391260a8c0cd7a3f4', 'CF-Cache-Sta

Processing:  38%|███▊      | 385/1000 [18:44<1:16:43,  7.48s/it]


Wrong Answer Index: 384 Chat Response: Final Answer: C, Answer: B


Processing:  39%|███▉      | 391/1000 [18:50<18:35,  1.83s/it]


Wrong Answer Index: 390 Chat Response: Final Answer: A, Answer: B


Processing:  39%|███▉      | 392/1000 [18:51<16:05,  1.59s/it]


Wrong Answer Index: 391 Chat Response: Final Answer: C, Answer: B


Processing:  40%|███▉      | 396/1000 [18:56<11:57,  1.19s/it]


Wrong Answer Index: 395 Chat Response: Final Answer: D, Answer: B


Processing:  40%|████      | 405/1000 [19:06<11:08,  1.12s/it]


Wrong Answer Index: 404 Chat Response: Final Answer: A, Answer: B


Processing:  41%|████      | 409/1000 [19:10<12:07,  1.23s/it]


Wrong Answer Index: 408 Chat Response: Final Answer: A, Answer: B


Processing:  41%|████▏     | 414/1000 [19:16<11:49,  1.21s/it]


Wrong Answer Index: 413 Chat Response: Final Answer: A, Answer: B


Processing:  42%|████▏     | 415/1000 [19:18<12:55,  1.33s/it]


Wrong Answer Index: 414 Chat Response: Final Answer: D, Answer: B


Processing:  43%|████▎     | 429/1000 [19:34<10:19,  1.09s/it]


Wrong Answer Index: 428 Chat Response: Final Answer: A, Answer: B


Processing:  43%|████▎     | 432/1000 [19:37<10:00,  1.06s/it]


Wrong Answer Index: 431 Chat Response: Final Answer: C, Answer: B


Processing:  44%|████▎     | 435/1000 [19:40<09:42,  1.03s/it]


Wrong Answer Index: 434 Chat Response: Final Answer: A, Answer: B


Processing:  44%|████▍     | 444/1000 [19:51<10:11,  1.10s/it]


Wrong Answer Index: 443 Chat Response: Final Answer: C, Answer: B


Processing:  45%|████▍     | 449/1000 [19:57<10:34,  1.15s/it]


Wrong Answer Index: 448 Chat Response: Final Answer: C, Answer: B


Processing:  45%|████▌     | 450/1000 [19:58<10:39,  1.16s/it]


Wrong Answer Index: 449 Chat Response: Final Answer: C, Answer: B


Processing:  45%|████▌     | 451/1000 [19:59<09:56,  1.09s/it]


Wrong Answer Index: 450 Chat Response: Final Answer: A, Answer: B


Processing:  45%|████▌     | 452/1000 [20:00<09:43,  1.06s/it]


Wrong Answer Index: 451 Chat Response: Final Answer: A, Answer: B


Processing:  46%|████▌     | 457/1000 [20:07<13:42,  1.51s/it]


Wrong Answer Index: 456 Chat Response: Final Answer: A, Answer: B


Processing:  47%|████▋     | 466/1000 [20:18<10:07,  1.14s/it]


Wrong Answer Index: 465 Chat Response: Final Answer: D, Answer: B


Processing:  47%|████▋     | 467/1000 [20:19<10:25,  1.17s/it]


Wrong Answer Index: 466 Chat Response: Final Answer: D, Answer: B


Processing:  47%|████▋     | 471/1000 [20:23<09:45,  1.11s/it]


Wrong Answer Index: 470 Chat Response: Final Answer: A, Answer: B


Processing:  49%|████▊     | 486/1000 [20:44<17:19,  2.02s/it]


Wrong Answer Index: 485 Chat Response: I am an AI language model and I cannot see images or interpret visual information. Please provide a description of the rhythm disorder so that I can help you with the best drug choice., Answer: B


Processing:  49%|████▊     | 487/1000 [20:46<18:06,  2.12s/it]


Wrong Answer Index: 486 Chat Response: Final Answer: C, Answer: B


Processing:  50%|████▉     | 495/1000 [20:56<10:57,  1.30s/it]


Wrong Answer Index: 494 Chat Response: Final Answer: D, Answer: B


Processing:  50%|████▉     | 497/1000 [20:58<09:31,  1.14s/it]


Wrong Answer Index: 496 Chat Response: Final Answer: D, Answer: B


Processing:  50%|█████     | 500/1000 [21:01<08:57,  1.08s/it]


Wrong Answer Index: 499 Chat Response: Final Answer: A, Answer: B


Processing:  50%|█████     | 501/1000 [21:02<08:50,  1.06s/it]


Wrong Answer Index: 500 Chat Response: Final Answer: A, Answer: C


Processing:  50%|█████     | 502/1000 [21:03<08:46,  1.06s/it]


Wrong Answer Index: 501 Chat Response: Final Answer: B, Answer: C


Processing:  50%|█████     | 504/1000 [21:06<10:40,  1.29s/it]


Wrong Answer Index: 503 Chat Response: Final Answer: B, Answer: C


Processing:  51%|█████     | 506/1000 [21:08<09:35,  1.17s/it]


Wrong Answer Index: 505 Chat Response: Final Answer: A, Answer: C


Processing:  51%|█████     | 507/1000 [21:09<09:33,  1.16s/it]


Wrong Answer Index: 506 Chat Response: Final Answer: A, Answer: C


Processing:  51%|█████     | 510/1000 [21:13<09:15,  1.13s/it]


Wrong Answer Index: 509 Chat Response: Final Answer: B, Answer: C


Processing:  52%|█████▏    | 520/1000 [21:24<08:55,  1.12s/it]


Wrong Answer Index: 519 Chat Response: Final Answer: B, Answer: C


Processing:  52%|█████▏    | 522/1000 [21:27<09:35,  1.20s/it]


Wrong Answer Index: 521 Chat Response: Final Answer: A, Answer: C


Processing:  52%|█████▏    | 524/1000 [21:29<09:07,  1.15s/it]


Wrong Answer Index: 523 Chat Response: Final Answer: B, Answer: C


Processing:  53%|█████▎    | 528/1000 [21:33<08:30,  1.08s/it]


Wrong Answer Index: 527 Chat Response: Final Answer: B, Answer: C


Processing:  53%|█████▎    | 534/1000 [21:42<12:43,  1.64s/it]


Wrong Answer Index: 533 Chat Response: Final Answer: A, Answer: C


Processing:  54%|█████▎    | 536/1000 [21:44<10:04,  1.30s/it]


Wrong Answer Index: 535 Chat Response: Final Answer: D, Answer: C


Processing:  54%|█████▍    | 540/1000 [21:49<08:58,  1.17s/it]

Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 6f10d4af62ba0c05184674aeac8111c6 in your message.)


Processing:  55%|█████▌    | 550/1000 [23:29<15:29,  2.07s/it]


Wrong Answer Index: 549 Chat Response: Final Answer: B, Answer: C


Processing:  55%|█████▌    | 551/1000 [23:30<13:49,  1.85s/it]


Wrong Answer Index: 550 Chat Response: Final Answer: B, Answer: C


Processing:  55%|█████▌    | 554/1000 [23:34<10:15,  1.38s/it]


Wrong Answer Index: 553 Chat Response: Final Answer: B, Answer: C


Processing:  56%|█████▌    | 557/1000 [23:38<09:03,  1.23s/it]


Wrong Answer Index: 556 Chat Response: Final Answer: A, Answer: C


Processing:  56%|█████▌    | 559/1000 [23:39<08:01,  1.09s/it]


Wrong Answer Index: 558 Chat Response: Final Answer: A, Answer: C


Processing:  56%|█████▋    | 565/1000 [23:45<07:22,  1.02s/it]


Wrong Answer Index: 564 Chat Response: Final Answer: B, Answer: C


Processing:  57%|█████▋    | 567/1000 [23:48<07:52,  1.09s/it]


Wrong Answer Index: 566 Chat Response: Final Answer: B, Answer: C


Processing:  57%|█████▋    | 570/1000 [23:51<08:09,  1.14s/it]


Wrong Answer Index: 569 Chat Response: Final Answer: D, Answer: C


Processing:  57%|█████▋    | 574/1000 [23:56<09:07,  1.29s/it]


Wrong Answer Index: 573 Chat Response: Final Answer: B, Answer: C


Processing:  58%|█████▊    | 576/1000 [23:58<08:06,  1.15s/it]


Wrong Answer Index: 575 Chat Response: Final Answer: D, Answer: C


Processing:  58%|█████▊    | 579/1000 [24:02<08:07,  1.16s/it]


Wrong Answer Index: 578 Chat Response: Final Answer: D, Answer: C


Processing:  58%|█████▊    | 582/1000 [24:05<07:52,  1.13s/it]


Wrong Answer Index: 581 Chat Response: Final Answer: B, Answer: C


Processing:  59%|█████▉    | 588/1000 [24:12<07:34,  1.10s/it]


Wrong Answer Index: 587 Chat Response: Final Answer: A, Answer: C


Processing:  59%|█████▉    | 589/1000 [24:13<07:34,  1.10s/it]


Wrong Answer Index: 588 Chat Response: Final Answer: B, Answer: C


Processing:  60%|█████▉    | 597/1000 [24:23<07:21,  1.10s/it]


Wrong Answer Index: 596 Chat Response: Final Answer: B, Answer: C


Processing:  60%|██████    | 605/1000 [24:31<07:16,  1.11s/it]


Wrong Answer Index: 604 Chat Response: Final Answer: B, Answer: C


Processing:  61%|██████    | 609/1000 [24:36<07:14,  1.11s/it]


Wrong Answer Index: 608 Chat Response: Final Answer: A, Answer: C


Processing:  61%|██████    | 612/1000 [24:39<07:17,  1.13s/it]


Wrong Answer Index: 611 Chat Response: Final Answer: D, Answer: C


Processing:  63%|██████▎   | 627/1000 [24:56<06:19,  1.02s/it]


Wrong Answer Index: 626 Chat Response: Final Answer: D, Answer: C


Processing:  63%|██████▎   | 628/1000 [24:57<06:09,  1.01it/s]


Wrong Answer Index: 627 Chat Response: Final Answer: D, Answer: C


Processing:  64%|██████▎   | 636/1000 [25:05<06:10,  1.02s/it]


Wrong Answer Index: 635 Chat Response: Final Answer: B, Answer: C


Processing:  64%|██████▍   | 640/1000 [25:10<06:50,  1.14s/it]


Wrong Answer Index: 639 Chat Response: Final Answer: B, Answer: C


Processing:  65%|██████▍   | 647/1000 [25:18<07:02,  1.20s/it]


Wrong Answer Index: 646 Chat Response: Final Answer: B, Answer: C


Processing:  66%|██████▌   | 658/1000 [25:30<05:47,  1.02s/it]


Wrong Answer Index: 657 Chat Response: Final Answer: D, Answer: C


Processing:  66%|██████▋   | 664/1000 [25:36<05:52,  1.05s/it]


Wrong Answer Index: 663 Chat Response: Final Answer: B, Answer: C


Processing:  67%|██████▋   | 673/1000 [25:57<08:21,  1.53s/it]


Wrong Answer Index: 672 Chat Response: Final Answer: B, Answer: C


Processing:  68%|██████▊   | 682/1000 [26:09<07:56,  1.50s/it]


Wrong Answer Index: 681 Chat Response: Final Answer: B, Answer: C


Processing:  68%|██████▊   | 684/1000 [26:12<07:14,  1.38s/it]


Wrong Answer Index: 683 Chat Response: Final Answer: A, Answer: C


Processing:  69%|██████▉   | 688/1000 [26:16<06:22,  1.22s/it]


Wrong Answer Index: 687 Chat Response: Final Answer: D, Answer: C


Processing:  69%|██████▉   | 694/1000 [26:22<05:18,  1.04s/it]


Wrong Answer Index: 693 Chat Response: Final Answer: A, Answer: C


Processing:  70%|██████▉   | 695/1000 [26:24<05:58,  1.18s/it]


Wrong Answer Index: 694 Chat Response: Final Answer: B, Answer: C


Processing:  70%|██████▉   | 696/1000 [26:25<05:28,  1.08s/it]


Wrong Answer Index: 695 Chat Response: Final Answer: A, Answer: C


Processing:  70%|███████   | 702/1000 [26:31<05:17,  1.07s/it]


Wrong Answer Index: 701 Chat Response: Final Answer: B, Answer: C


Processing:  70%|███████   | 703/1000 [26:33<05:56,  1.20s/it]


Wrong Answer Index: 702 Chat Response: Final Answer: D, Answer: C


Processing:  71%|███████   | 709/1000 [26:40<06:14,  1.29s/it]


Wrong Answer Index: 708 Chat Response: Final Answer: A, Answer: C


Processing:  71%|███████   | 712/1000 [26:43<05:18,  1.11s/it]


Wrong Answer Index: 711 Chat Response: Final Answer: B, Answer: C


Processing:  71%|███████▏  | 714/1000 [26:46<05:18,  1.12s/it]


Wrong Answer Index: 713 Chat Response: Final Answer: A, Answer: C


Processing:  72%|███████▏  | 718/1000 [26:50<04:55,  1.05s/it]


Wrong Answer Index: 717 Chat Response: Final Answer: D, Answer: C


Processing:  72%|███████▏  | 722/1000 [26:54<04:51,  1.05s/it]


Wrong Answer Index: 721 Chat Response: Final Answer: B, Answer: C


Processing:  72%|███████▎  | 725/1000 [26:57<04:48,  1.05s/it]


Wrong Answer Index: 724 Chat Response: Final Answer: A, Answer: C


Processing:  73%|███████▎  | 730/1000 [27:03<04:54,  1.09s/it]


Wrong Answer Index: 729 Chat Response: Final Answer: B, Answer: C


Processing:  73%|███████▎  | 731/1000 [27:04<04:44,  1.06s/it]


Wrong Answer Index: 730 Chat Response: Final Answer: B, Answer: C


Processing:  74%|███████▎  | 735/1000 [27:13<09:50,  2.23s/it]


Wrong Answer Index: 734 Chat Response: As an AI language model, I am not able to diagnose a patient without detailed information and a proper medical examination. It is recommended to consult a healthcare professional for an accurate diagnosis., Answer: C


Processing:  74%|███████▎  | 737/1000 [27:15<07:17,  1.66s/it]


Wrong Answer Index: 736 Chat Response: Final Answer: D, Answer: C


Processing:  74%|███████▍  | 739/1000 [27:17<05:53,  1.36s/it]


Wrong Answer Index: 738 Chat Response: Final Answer: B, Answer: C


Processing:  74%|███████▍  | 741/1000 [27:19<05:00,  1.16s/it]


Wrong Answer Index: 740 Chat Response: Final Answer: A, Answer: C


Processing:  74%|███████▍  | 743/1000 [27:22<06:02,  1.41s/it]


Wrong Answer Index: 742 Chat Response: Final Answer: A, Answer: C


Processing:  74%|███████▍  | 744/1000 [27:24<05:36,  1.31s/it]


Wrong Answer Index: 743 Chat Response: Final Answer: A, Answer: C


Processing:  75%|███████▌  | 751/1000 [27:31<04:49,  1.16s/it]


Wrong Answer Index: 750 Chat Response: Final Answer: A, Answer: D


Processing:  76%|███████▌  | 761/1000 [27:41<04:10,  1.05s/it]


Wrong Answer Index: 760 Chat Response: Final Answer: C, Answer: D


Processing:  77%|███████▋  | 772/1000 [27:53<04:20,  1.14s/it]


Wrong Answer Index: 771 Chat Response: Final Answer: A, Answer: D


Processing:  77%|███████▋  | 774/1000 [27:57<05:29,  1.46s/it]


Wrong Answer Index: 773 Chat Response: Final Answer: C, Answer: D


Processing:  80%|███████▉  | 797/1000 [28:23<03:42,  1.10s/it]


Wrong Answer Index: 796 Chat Response: Final Answer: C, Answer: D


Processing:  80%|███████▉  | 799/1000 [28:25<03:26,  1.03s/it]

Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID f270a90be5430ddadd0bd01c458874fd in your message.)


Processing:  80%|████████  | 801/1000 [29:58<1:06:23, 20.02s/it]


Wrong Answer Index: 800 Chat Response: Final Answer: C, Answer: D


Processing:  80%|████████  | 802/1000 [29:59<47:17, 14.33s/it]  


Wrong Answer Index: 801 Chat Response: Final Answer: A, Answer: D


Processing:  80%|████████  | 804/1000 [30:02<25:10,  7.70s/it]


Wrong Answer Index: 803 Chat Response: Final Answer: C, Answer: D


Processing:  81%|████████  | 806/1000 [30:04<14:13,  4.40s/it]


Wrong Answer Index: 805 Chat Response: Final Answer: C, Answer: D


Processing:  81%|████████  | 811/1000 [30:10<05:36,  1.78s/it]


Wrong Answer Index: 810 Chat Response: Final Answer: A, Answer: D


Processing:  82%|████████▏ | 818/1000 [30:18<03:27,  1.14s/it]


Wrong Answer Index: 817 Chat Response: Final Answer: A, Answer: D


Processing:  82%|████████▎ | 825/1000 [30:28<03:31,  1.21s/it]


Wrong Answer Index: 824 Chat Response: Final Answer: A, Answer: D


Processing:  83%|████████▎ | 826/1000 [30:29<03:23,  1.17s/it]


Wrong Answer Index: 825 Chat Response: Final Answer: C, Answer: D


Processing:  83%|████████▎ | 827/1000 [30:30<03:06,  1.08s/it]


Wrong Answer Index: 826 Chat Response: Final Answer: C, Answer: D


Processing:  83%|████████▎ | 829/1000 [30:32<03:01,  1.06s/it]


Wrong Answer Index: 828 Chat Response: Final Answer: A, Answer: D


Processing:  83%|████████▎ | 834/1000 [30:37<02:59,  1.08s/it]


Wrong Answer Index: 833 Chat Response: Final Answer: B, Answer: D


Processing:  84%|████████▎ | 837/1000 [30:40<02:41,  1.01it/s]


Wrong Answer Index: 836 Chat Response: Final Answer: B, Answer: D


Processing:  84%|████████▍ | 840/1000 [30:43<02:37,  1.02it/s]


Wrong Answer Index: 839 Chat Response: Final Answer: A, Answer: D


Processing:  84%|████████▍ | 843/1000 [30:46<02:31,  1.03it/s]


Wrong Answer Index: 842 Chat Response: Final Answer: A, Answer: D


Processing:  84%|████████▍ | 845/1000 [30:48<02:29,  1.04it/s]


Wrong Answer Index: 844 Chat Response: Final Answer: C, Answer: D


Processing:  86%|████████▌ | 861/1000 [31:05<02:28,  1.07s/it]


Wrong Answer Index: 860 Chat Response: Final Answer: B, Answer: D


Processing:  86%|████████▌ | 862/1000 [31:06<02:31,  1.10s/it]


Wrong Answer Index: 861 Chat Response: Final Answer: B, Answer: D


Processing:  86%|████████▋ | 865/1000 [31:10<02:30,  1.12s/it]


Wrong Answer Index: 864 Chat Response: Final Answer: C, Answer: D


Processing:  87%|████████▋ | 870/1000 [31:15<02:19,  1.08s/it]


Wrong Answer Index: 869 Chat Response: Final Answer: B, Answer: D


Processing:  87%|████████▋ | 872/1000 [31:17<02:16,  1.07s/it]


Wrong Answer Index: 871 Chat Response: Final Answer: B, Answer: D


Processing:  88%|████████▊ | 876/1000 [31:21<02:13,  1.08s/it]


Wrong Answer Index: 875 Chat Response: Final Answer: C, Answer: D


Processing:  88%|████████▊ | 884/1000 [31:30<02:02,  1.06s/it]


Wrong Answer Index: 883 Chat Response: Final Answer: A, Answer: D


Processing:  88%|████████▊ | 885/1000 [31:31<01:58,  1.03s/it]


Wrong Answer Index: 884 Chat Response: Final Answer: B, Answer: D


Processing:  89%|████████▊ | 886/1000 [31:32<01:56,  1.02s/it]


Wrong Answer Index: 885 Chat Response: Final Answer: A, Answer: D


Processing:  89%|████████▉ | 891/1000 [31:38<01:55,  1.06s/it]


Wrong Answer Index: 890 Chat Response: Final Answer: A, Answer: D


Processing:  89%|████████▉ | 892/1000 [31:39<01:50,  1.02s/it]


Wrong Answer Index: 891 Chat Response: Final Answer: C, Answer: D


Processing:  89%|████████▉ | 893/1000 [31:40<01:46,  1.00it/s]


Wrong Answer Index: 892 Chat Response: Final Answer: C, Answer: D


Processing:  90%|████████▉ | 896/1000 [31:43<01:49,  1.05s/it]


Wrong Answer Index: 895 Chat Response: Final Answer: A, Answer: D


Processing:  90%|████████▉ | 897/1000 [31:44<01:46,  1.04s/it]


Wrong Answer Index: 896 Chat Response: Final Answer: B, Answer: D
Chat API error: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID ff8adba9188d805888ba22b46cd2c888 in your message.)


Processing:  90%|█████████ | 900/1000 [33:18<24:03, 14.43s/it]


Wrong Answer Index: 899 Chat Response: Final Answer: B, Answer: D


Processing:  90%|█████████ | 904/1000 [33:22<06:52,  4.29s/it]


Wrong Answer Index: 903 Chat Response: Final Answer: A, Answer: D


Processing:  91%|█████████ | 909/1000 [33:28<02:26,  1.61s/it]


Wrong Answer Index: 908 Chat Response: Final Answer: B, Answer: D


Processing:  91%|█████████ | 910/1000 [33:30<02:33,  1.71s/it]


Wrong Answer Index: 909 Chat Response: Final Answer: B, Answer: D


Processing:  92%|█████████▏| 915/1000 [33:35<01:34,  1.11s/it]


Wrong Answer Index: 914 Chat Response: Final Answer: C, Answer: D


Processing:  92%|█████████▏| 916/1000 [33:36<01:31,  1.09s/it]


Wrong Answer Index: 915 Chat Response: Final Answer: C, Answer: D


Processing:  92%|█████████▏| 922/1000 [33:42<01:18,  1.01s/it]


Wrong Answer Index: 921 Chat Response: Final Answer: A, Answer: D


Processing:  93%|█████████▎| 927/1000 [33:49<01:42,  1.40s/it]


Wrong Answer Index: 926 Chat Response: Final Answer: B, Answer: D


Processing:  93%|█████████▎| 929/1000 [33:51<01:31,  1.28s/it]


Wrong Answer Index: 928 Chat Response: Final Answer: C, Answer: D


Processing:  93%|█████████▎| 931/1000 [33:53<01:15,  1.10s/it]


Wrong Answer Index: 930 Chat Response: Final Answer: B, Answer: D


Processing:  94%|█████████▍| 944/1000 [34:07<01:01,  1.10s/it]


Wrong Answer Index: 943 Chat Response: Final Answer: B, Answer: D


Processing:  95%|█████████▍| 946/1000 [34:09<00:56,  1.06s/it]


Wrong Answer Index: 945 Chat Response: Final Answer: B, Answer: D


Processing:  95%|█████████▍| 947/1000 [34:13<01:48,  2.05s/it]


Wrong Answer Index: 946 Chat Response: I cannot provide a final answer without more context or information about the patient's symptoms, medical history, and test results. Please provide more details to help me determine the most likely diagnosis., Answer: D


Processing:  95%|█████████▌| 950/1000 [34:17<01:10,  1.41s/it]


Wrong Answer Index: 949 Chat Response: Final Answer: A, Answer: D


Processing:  96%|█████████▌| 956/1000 [34:23<00:46,  1.05s/it]


Wrong Answer Index: 955 Chat Response: Final Answer: B, Answer: D


Processing:  96%|█████████▌| 960/1000 [34:27<00:48,  1.20s/it]


Wrong Answer Index: 959 Chat Response: Final Answer: B, Answer: D


Processing:  97%|█████████▋| 968/1000 [34:38<00:40,  1.26s/it]


Wrong Answer Index: 967 Chat Response: Final Answer: C, Answer: D


Processing:  98%|█████████▊| 976/1000 [34:52<00:34,  1.42s/it]


Wrong Answer Index: 975 Chat Response: Final Answer: B, Answer: D


Processing:  98%|█████████▊| 977/1000 [34:53<00:29,  1.27s/it]


Wrong Answer Index: 976 Chat Response: Final Answer: B, Answer: D


Processing:  98%|█████████▊| 978/1000 [34:54<00:25,  1.14s/it]


Wrong Answer Index: 977 Chat Response: Final Answer: C, Answer: D


Processing:  98%|█████████▊| 979/1000 [34:55<00:21,  1.05s/it]


Wrong Answer Index: 978 Chat Response: Final Answer: C, Answer: D


Processing:  99%|█████████▉| 992/1000 [35:09<00:08,  1.04s/it]


Wrong Answer Index: 991 Chat Response: Final Answer: C, Answer: D


Processing:  99%|█████████▉| 993/1000 [35:10<00:07,  1.04s/it]


Wrong Answer Index: 992 Chat Response: Final Answer: B, Answer: D


Processing: 100%|██████████| 1000/1000 [35:18<00:00,  2.12s/it]


Accuracy: 802/1000





#### With CoT

In [None]:
# Chain-of-thought (CoT) evaluation of GPT-4 on `evaluation_set`.
#
# For every sample the model is queried with a system prompt, the reply is split
# into (analysis, prediction) by `text_parser`, and the prediction is scored
# according to `dataset_name`. The raw replies and parsed predictions are then
# written to the CSV path configured in `data_paths_dict`.
#
# Relies on names defined in earlier cells: `evaluation_set`, `dataset_name`,
# `chat_handler`, `text_parser`, `check_text_accuracy`, `calculate_f1_metrics`,
# `GPT_4` and `data_paths_dict`.
accurate_predictions = 0
print_process = True

# System prompts, one per expected answer format. The wording is part of the
# experiment, so the strings are left exactly as they were.
# NOTE(review): "answer to questions" in basic_double_binary_prompt looks like a
# typo for "answer two questions"; left untouched so results stay comparable
# with earlier runs -- confirm before changing.
basic_binary_prompt = 'Analyze the question first, Provide the final answer, which should be a yes or no. Never say you dont know, always provide a yes or no answer regardless. If you dont have enough information, just guess. Follow this format:\n Analysis: your analysis\nFinal Answer: no'
basic_mc_prompt = 'Analyze the question first, then provide the final answer, which should be a single letter in the alphabet representing the best option among the multiple choices provided in the question. Follow this format:\nAnalysis: your analysis\nFinal Answer: A'
basic_text_prompt = 'Analyze the question first, then provide the final answer, which should be a single term consisting of english words. Follow this format:\nAnalysis: your analysis\nFinal Answer: the adverse drug reaction you identified'
basic_double_binary_prompt = 'Your task is to answer to questions. Analyze the questions first, then provide the final answer, which should consist of two terms, each being either "yes" or "no" corresponding to the questions. Never say you dont know, always provide a yes or no answer regardless. If you dont have enough information, just guess. Follow this format:\n\nAnalysis: your analysis\nFinal Answer: no, no'
basic_text_list_prompt = '''Your task is to identify a list of medications. Analyze the question first, then provide the final answer, which should consist of drug names, each separated by a comma. Never say you don't know, always provide an answer regardless. If you don't have enough information, just guess. Follow this format:
Analysis: your analysis
Final Answer: DrugA, DrugB, DrugC, ...
'''

# The prompt actually sent to the model. Switch this single assignment when
# evaluating a dataset that needs a different answer format.
# NOTE(review): the prompt is not derived from `dataset_name`; confirm it matches
# the dataset currently loaded before running.
active_prompt = basic_binary_prompt

CoT_list = []        # full model replies (analysis + final answer)
QA_list = []         # parsed final answers only
precision_list = []  # per-sample metrics, filled only for 'chatDoctor'
recall_list = []
f1_list = []

for i in tqdm(range(len(evaluation_set['sample'])), desc="Processing"):
  question = evaluation_set['sample'][i]
  label = evaluation_set['label'][i]

  chat_response = chat_handler(question, active_prompt, [], GPT_4)
  analysis, prediction = text_parser(chat_response)
  CoT_list.append(chat_response)
  QA_list.append(prediction)

  if dataset_name == 'ade':
    if check_text_accuracy(prediction, label.lower()):
      accurate_predictions += 1
    else:
      print(f"\nWrong Answer Index: {i} Question: {question} Chat Response: {prediction}, Answer: {label}")
  elif dataset_name == 'drug_usage':
    # Both the prediction and the label are expected to be two "yes"/"no"
    # terms separated by ", ". A reply that does not contain exactly two
    # terms is scored as wrong instead of raising on tuple unpacking, so one
    # malformed reply cannot abort the whole run before results are saved.
    answers = [part.lower() for part in prediction.split(", ")]
    labels = label.lower().split(", ")
    if len(answers) == 2 and answers == labels:
      accurate_predictions += 1
    else:
      print(f"\nWrong Answer Index: {i} Question: {question} Chat Response: {chat_response}, Answer: {label}")
  elif dataset_name == 'chatDoctor':
    # Scored with precision/recall/F1 rather than exact-match accuracy.
    precision, recall, f1 = calculate_f1_metrics(prediction, label.lower())
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
  else:
    # Exact-match scoring against the lower-cased label.
    if prediction == label.lower():
      accurate_predictions += 1
    else:
      print(f"\nWrong Answer Index: {i} Chat Response: {prediction}, Answer: {label}")

data = {
      'Questions': evaluation_set['sample'],
      'Long Answer': CoT_list,
      'Short Answer': QA_list,
      'Official Answer': evaluation_set['label']
    }
# Convert the dictionary to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a csv file
df.to_csv(data_paths_dict[dataset_name]['GPT_4_answer'], index=False)

if dataset_name == 'chatDoctor':
  # Accuracy is never incremented for this dataset, so report the averaged
  # metrics that were collected instead of a misleading "0/N".
  # Assumes calculate_f1_metrics returns numeric values -- TODO confirm.
  if f1_list:
    scored = len(f1_list)
    print(f"\nPrecision: {sum(precision_list) / scored:.4f} Recall: {sum(recall_list) / scored:.4f} F1: {sum(f1_list) / scored:.4f}")
  else:
    print("\nNo samples were scored.")
else:
  print(f"\nAccuracy: {accurate_predictions}/{len(evaluation_set['sample']) }")

Processing:   1%|          | 2/200 [00:14<24:41,  7.48s/it]


Wrong Answer Index: 1 Chat Response: no, Answer: yes


Processing:   2%|▏         | 3/200 [00:20<21:13,  6.46s/it]


Wrong Answer Index: 2 Chat Response: no, Answer: yes


Processing:   2%|▏         | 4/200 [00:25<20:07,  6.16s/it]


Wrong Answer Index: 3 Chat Response: no, Answer: yes


Processing:   2%|▎         | 5/200 [00:31<18:57,  5.83s/it]


Wrong Answer Index: 4 Chat Response: no, Answer: yes


Processing:   3%|▎         | 6/200 [00:36<18:34,  5.74s/it]


Wrong Answer Index: 5 Chat Response: no, Answer: yes


Processing:   4%|▎         | 7/200 [00:41<17:22,  5.40s/it]


Wrong Answer Index: 6 Chat Response: no, Answer: yes


Processing:   4%|▍         | 8/200 [00:46<17:28,  5.46s/it]


Wrong Answer Index: 7 Chat Response: no, Answer: yes


Processing:   4%|▍         | 9/200 [00:52<17:55,  5.63s/it]


Wrong Answer Index: 8 Chat Response: no, Answer: yes


Processing:   5%|▌         | 10/200 [00:59<18:35,  5.87s/it]


Wrong Answer Index: 9 Chat Response: no, Answer: yes


Processing:   6%|▌         | 11/200 [01:03<17:13,  5.47s/it]


Wrong Answer Index: 10 Chat Response: no, Answer: yes


Processing:   6%|▌         | 12/200 [01:13<20:47,  6.64s/it]


Wrong Answer Index: 11 Chat Response: no, Answer: yes


Processing:   6%|▋         | 13/200 [01:21<22:32,  7.23s/it]


Wrong Answer Index: 12 Chat Response: no, Answer: yes


Processing:   7%|▋         | 14/200 [01:27<21:20,  6.88s/it]


Wrong Answer Index: 13 Chat Response: no, Answer: yes


Processing:   8%|▊         | 15/200 [01:32<19:13,  6.24s/it]


Wrong Answer Index: 14 Chat Response: no, Answer: yes


Processing:   8%|▊         | 16/200 [01:37<17:30,  5.71s/it]


Wrong Answer Index: 15 Chat Response: no, Answer: yes


Processing:   8%|▊         | 17/200 [01:43<17:59,  5.90s/it]


Wrong Answer Index: 16 Chat Response: no, Answer: yes


Processing:   9%|▉         | 18/200 [01:51<20:06,  6.63s/it]


Wrong Answer Index: 17 Chat Response: no, Answer: yes


Processing:  10%|▉         | 19/200 [01:56<18:09,  6.02s/it]


Wrong Answer Index: 18 Chat Response: no, Answer: yes


Processing:  10%|█         | 21/200 [02:08<17:38,  5.91s/it]


Wrong Answer Index: 20 Chat Response: no, Answer: yes


Processing:  11%|█         | 22/200 [02:13<16:49,  5.67s/it]


Wrong Answer Index: 21 Chat Response: no, Answer: yes


Processing:  12%|█▏        | 23/200 [02:19<17:09,  5.82s/it]


Wrong Answer Index: 22 Chat Response: no, Answer: yes


Processing:  12%|█▏        | 24/200 [02:25<17:27,  5.95s/it]


Wrong Answer Index: 23 Chat Response: no, Answer: yes


Processing:  12%|█▎        | 25/200 [02:31<17:00,  5.83s/it]


Wrong Answer Index: 24 Chat Response: no, Answer: yes


Processing:  13%|█▎        | 26/200 [02:38<17:50,  6.15s/it]


Wrong Answer Index: 25 Chat Response: no, Answer: yes


Processing:  14%|█▎        | 27/200 [02:44<18:12,  6.31s/it]


Wrong Answer Index: 26 Chat Response: no, Answer: yes


Processing:  14%|█▍        | 28/200 [02:50<17:35,  6.13s/it]


Wrong Answer Index: 27 Chat Response: no, Answer: yes


Processing:  14%|█▍        | 29/200 [02:57<17:57,  6.30s/it]


Wrong Answer Index: 28 Chat Response: no, Answer: yes


Processing:  15%|█▌        | 30/200 [03:01<16:31,  5.83s/it]


Wrong Answer Index: 29 Chat Response: no, Answer: yes


Processing:  16%|█▌        | 31/200 [03:07<15:46,  5.60s/it]


Wrong Answer Index: 30 Chat Response: no, Answer: yes


Processing:  16%|█▌        | 32/200 [03:12<15:43,  5.61s/it]


Wrong Answer Index: 31 Chat Response: no, Answer: yes


Processing:  16%|█▋        | 33/200 [03:20<17:39,  6.34s/it]


Wrong Answer Index: 32 Chat Response: no, Answer: yes


Processing:  17%|█▋        | 34/200 [03:25<16:41,  6.03s/it]


Wrong Answer Index: 33 Chat Response: no, Answer: yes


Processing:  18%|█▊        | 35/200 [03:32<16:57,  6.17s/it]


Wrong Answer Index: 34 Chat Response: no, Answer: yes


Processing:  18%|█▊        | 36/200 [03:36<15:10,  5.55s/it]


Wrong Answer Index: 35 Chat Response: no, Answer: yes


Processing:  18%|█▊        | 37/200 [03:42<15:34,  5.73s/it]


Wrong Answer Index: 36 Chat Response: no, Answer: yes


Processing:  19%|█▉        | 38/200 [03:47<14:59,  5.55s/it]


Wrong Answer Index: 37 Chat Response: no, Answer: yes


Processing:  20%|█▉        | 39/200 [03:56<17:21,  6.47s/it]


Wrong Answer Index: 38 Chat Response: no, Answer: yes


Processing:  20%|██        | 41/200 [04:10<17:29,  6.60s/it]


Wrong Answer Index: 40 Chat Response: no, Answer: yes


Processing:  21%|██        | 42/200 [04:18<18:22,  6.98s/it]


Wrong Answer Index: 41 Chat Response: no, Answer: yes


Processing:  22%|██▏       | 43/200 [04:25<18:08,  6.93s/it]


Wrong Answer Index: 42 Chat Response: no, Answer: yes


Processing:  22%|██▏       | 44/200 [04:31<17:15,  6.64s/it]


Wrong Answer Index: 43 Chat Response: no, Answer: yes


Processing:  23%|██▎       | 46/200 [04:46<17:35,  6.85s/it]


Wrong Answer Index: 45 Chat Response: no, Answer: yes


Processing:  24%|██▎       | 47/200 [04:51<16:17,  6.39s/it]


Wrong Answer Index: 46 Chat Response: no, Answer: yes


Processing:  24%|██▍       | 48/200 [04:58<16:57,  6.70s/it]


Wrong Answer Index: 47 Chat Response: no, Answer: yes


Processing:  24%|██▍       | 49/200 [05:03<15:36,  6.20s/it]


Wrong Answer Index: 48 Chat Response: no, Answer: yes


Processing:  25%|██▌       | 50/200 [05:12<17:06,  6.84s/it]


Wrong Answer Index: 49 Chat Response: no, Answer: yes


Processing:  26%|██▌       | 51/200 [05:19<17:15,  6.95s/it]


Wrong Answer Index: 50 Chat Response: no, Answer: yes


Processing:  26%|██▌       | 52/200 [05:25<16:26,  6.67s/it]


Wrong Answer Index: 51 Chat Response: no, Answer: yes


Processing:  26%|██▋       | 53/200 [05:30<15:26,  6.30s/it]


Wrong Answer Index: 52 Chat Response: no, Answer: yes


Processing:  27%|██▋       | 54/200 [05:36<14:33,  5.98s/it]


Wrong Answer Index: 53 Chat Response: no, Answer: yes


Processing:  28%|██▊       | 55/200 [05:41<14:16,  5.91s/it]


Wrong Answer Index: 54 Chat Response: no, Answer: yes


Processing:  28%|██▊       | 57/200 [05:59<17:22,  7.29s/it]


Wrong Answer Index: 56 Chat Response: no, Answer: yes


Processing:  29%|██▉       | 58/200 [06:03<15:19,  6.48s/it]


Wrong Answer Index: 57 Chat Response: no, Answer: yes


Processing:  30%|██▉       | 59/200 [06:09<14:26,  6.15s/it]


Wrong Answer Index: 58 Chat Response: no, Answer: yes


Processing:  30%|███       | 60/200 [06:14<13:32,  5.80s/it]


Wrong Answer Index: 59 Chat Response: no, Answer: yes


Processing:  30%|███       | 61/200 [06:19<13:18,  5.74s/it]


Wrong Answer Index: 60 Chat Response: no, Answer: yes


Processing:  31%|███       | 62/200 [06:25<12:53,  5.61s/it]


Wrong Answer Index: 61 Chat Response: no, Answer: yes


Processing:  32%|███▏      | 64/200 [06:36<12:58,  5.72s/it]


Wrong Answer Index: 63 Chat Response: no, Answer: yes


Processing:  32%|███▎      | 65/200 [06:41<12:17,  5.46s/it]


Wrong Answer Index: 64 Chat Response: no, Answer: yes


Processing:  33%|███▎      | 66/200 [06:53<16:19,  7.31s/it]


Wrong Answer Index: 65 Chat Response: no, Answer: yes


Processing:  34%|███▎      | 67/200 [07:00<15:43,  7.09s/it]


Wrong Answer Index: 66 Chat Response: no, Answer: yes


Processing:  34%|███▍      | 68/200 [07:13<19:36,  8.91s/it]


Wrong Answer Index: 67 Chat Response: no, Answer: yes


Processing:  34%|███▍      | 69/200 [07:18<17:17,  7.92s/it]


Wrong Answer Index: 68 Chat Response: no, Answer: yes


Processing:  35%|███▌      | 70/200 [07:24<15:41,  7.24s/it]


Wrong Answer Index: 69 Chat Response: no, Answer: yes


Processing:  36%|███▌      | 72/200 [07:38<14:45,  6.92s/it]


Wrong Answer Index: 71 Chat Response: no, Answer: yes


Processing:  36%|███▋      | 73/200 [07:44<14:27,  6.83s/it]


Wrong Answer Index: 72 Chat Response: no, Answer: yes


Processing:  37%|███▋      | 74/200 [07:50<13:26,  6.40s/it]


Wrong Answer Index: 73 Chat Response: no, Answer: yes


Processing:  38%|███▊      | 75/200 [07:57<13:59,  6.72s/it]


Wrong Answer Index: 74 Chat Response: no, Answer: yes


Processing:  38%|███▊      | 77/200 [08:07<12:00,  5.85s/it]


Wrong Answer Index: 76 Chat Response: no, Answer: yes
Chat API error: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 23 Jun 2023 03:38:11 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7db9aaa1985c4a68-TPE', 'alt-svc': 'h3=":443"; ma=86400'}


Processing:  39%|███▉      | 78/200 [13:32<3:26:26, 101.53s/it]


Wrong Answer Index: 77 Chat Response: no, Answer: yes


Processing:  40%|███▉      | 79/200 [13:38<2:26:50, 72.81s/it] 


Wrong Answer Index: 78 Chat Response: no, Answer: yes


Processing:  40%|████      | 80/200 [13:44<1:45:43, 52.87s/it]


Wrong Answer Index: 79 Chat Response: no, Answer: yes


Processing:  41%|████      | 82/200 [13:56<56:39, 28.81s/it]  


Wrong Answer Index: 81 Chat Response: no, Answer: yes


Processing:  42%|████▏     | 83/200 [14:05<44:25, 22.78s/it]


Wrong Answer Index: 82 Chat Response: no, Answer: yes


Processing:  43%|████▎     | 86/200 [14:28<23:41, 12.47s/it]


Wrong Answer Index: 85 Chat Response: no, Answer: yes


Processing:  44%|████▎     | 87/200 [14:33<19:18, 10.25s/it]


Wrong Answer Index: 86 Chat Response: no, Answer: yes


Processing:  44%|████▍     | 89/200 [14:44<14:11,  7.67s/it]


Wrong Answer Index: 88 Chat Response: no, Answer: yes


Processing:  45%|████▌     | 90/200 [14:51<14:04,  7.67s/it]


Wrong Answer Index: 89 Chat Response: no, Answer: yes


Processing:  47%|████▋     | 94/200 [15:18<11:50,  6.70s/it]


Wrong Answer Index: 93 Chat Response: no, Answer: yes


Processing:  48%|████▊     | 95/200 [15:26<12:20,  7.05s/it]


Wrong Answer Index: 94 Chat Response: no, Answer: yes


Processing:  48%|████▊     | 96/200 [15:33<12:09,  7.01s/it]


Wrong Answer Index: 95 Chat Response: no, Answer: yes


Processing:  48%|████▊     | 97/200 [15:40<12:10,  7.09s/it]


Wrong Answer Index: 96 Chat Response: no, Answer: yes


Processing:  50%|████▉     | 99/200 [15:53<11:11,  6.65s/it]


Wrong Answer Index: 98 Chat Response: no, Answer: yes


Processing:  50%|█████     | 100/200 [16:02<11:55,  7.16s/it]


Wrong Answer Index: 99 Chat Response: no, Answer: yes


Processing:  66%|██████▌   | 131/200 [19:26<08:05,  7.04s/it]


Wrong Answer Index: 130 Chat Response: yes, Answer: no


Processing:  90%|█████████ | 180/200 [24:49<02:29,  7.47s/it]


Wrong Answer Index: 179 Chat Response: yes, Answer: no


Processing:  94%|█████████▍| 189/200 [25:48<01:13,  6.69s/it]


Wrong Answer Index: 188 Chat Response: yes, Answer: no


Processing: 100%|██████████| 200/200 [26:56<00:00,  8.08s/it]


Accuracy: 113/200



