In [1]:
import os

# Credentials are read from the environment so real keys never have to be
# written into (and accidentally committed with) the notebook. The original
# placeholders remain as fallbacks, so replacing them in place still works
# when the environment variables are not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "INSERT_OPEN_API_KEY_HERE")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "INSERT_ANTHROPIC_API_KEY_HERE")

In [2]:
import nest_asyncio

# Patch asyncio so event loops can be nested.
# NOTE(review): presumably required because the Jupyter kernel already runs
# an event loop and llama_index makes async calls internally - confirm.
nest_asyncio.apply()

In [9]:
from llama_index.core import SimpleDirectoryReader

# Load progressively larger corpora; the numeric suffix is the number of
# PDF books read (1, 3 and 5). Paths are relative to the working directory.
documents1 = SimpleDirectoryReader(input_files=["anatomybook.pdf"]).load_data()
documents3 = SimpleDirectoryReader(input_files=["anatomybook.pdf","anatomybook2.pdf","anatomybook3.pdf"]).load_data()
documents5 = SimpleDirectoryReader(input_files=["anatomybook.pdf","anatomybook2.pdf","anatomybook3.pdf","anatomybook4.pdf","anatomybook5.pdf"]).load_data()

In [28]:
# The two- and four-book corpora, filling the gaps between the 1/3/5-book
# corpora. The numeric suffix is again the number of PDF books read.
documents2 = SimpleDirectoryReader(input_files=["anatomybook.pdf","anatomybook2.pdf"]).load_data()
documents4 = SimpleDirectoryReader(input_files=["anatomybook.pdf","anatomybook2.pdf","anatomybook3.pdf","anatomybook4.pdf"]).load_data()

In [10]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.selectors import LLMSingleSelector

# One splitter shared by every corpus so chunking is identical across runs.
splitter = SentenceSplitter(chunk_size=1024)

# NOTE: the node numbering does NOT match the document numbering.
#   nodes1 <- documents1 (1 book)
#   nodes2 <- documents3 (3 books)
#   nodes3 <- documents5 (5 books)
# Keep this in mind when reading the "query engine N" results.
nodes1 = splitter.get_nodes_from_documents(documents1)
nodes2 = splitter.get_nodes_from_documents(documents3)
nodes3 = splitter.get_nodes_from_documents(documents5)

In [29]:
# NOTE: the node numbering does NOT match the document numbering.
#   nodes4 <- documents2 (2 books)
#   nodes5 <- documents4 (4 books)
nodes4 = splitter.get_nodes_from_documents(documents2)
nodes5 = splitter.get_nodes_from_documents(documents4)

In [11]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Configure the llama_index global defaults for the LLM and the embedding
# model, both authenticated with OPENAI_API_KEY.
# NOTE(review): presumably these are used by any component that is not given
# an explicit llm / embed_model (e.g. embedding during index construction).
Settings.llm = OpenAI(model="gpt-3.5-turbo", api_key=OPENAI_API_KEY)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002", api_key=OPENAI_API_KEY)

In [12]:
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext, load_index_from_storage

# Build one vector index per node set (index N is built from nodesN).
# NOTE(review): presumably each construction embeds every node through
# Settings.embed_model, i.e. these lines issue paid API calls - confirm.
vector_index1 = VectorStoreIndex(nodes1)
vector_index2 = VectorStoreIndex(nodes2)
vector_index3 = VectorStoreIndex(nodes3)

In [30]:
# Build the vector indexes for the remaining node sets (index N is built
# from nodesN).
# NOTE(review): presumably each construction embeds every node through
# Settings.embed_model, i.e. these lines issue paid API calls - confirm.
vector_index4 = VectorStoreIndex(nodes4)
vector_index5 = VectorStoreIndex(nodes5)

In [13]:
# One query engine per index. The LLM is given OPENAI_API_KEY explicitly,
# matching how Settings.llm / Settings.embed_model are configured; without it
# the client would only work if the key also happened to be exported in the
# environment.
# NOTE(review): `chat_mode` looks like a chat-engine option; it is kept here
# unchanged, but it is presumably ignored by as_query_engine - confirm.
query_engine1 = vector_index1.as_query_engine(chat_mode="best", llm=OpenAI(model="gpt-3.5-turbo", api_key=OPENAI_API_KEY), verbose=True)
query_engine2 = vector_index2.as_query_engine(chat_mode="best", llm=OpenAI(model="gpt-3.5-turbo", api_key=OPENAI_API_KEY), verbose=True)
query_engine3 = vector_index3.as_query_engine(chat_mode="best", llm=OpenAI(model="gpt-3.5-turbo", api_key=OPENAI_API_KEY), verbose=True)

In [31]:
# One query engine per index. The LLM is given OPENAI_API_KEY explicitly,
# matching how Settings.llm / Settings.embed_model are configured; without it
# the client would only work if the key also happened to be exported in the
# environment.
# NOTE(review): `chat_mode` looks like a chat-engine option; it is kept here
# unchanged, but it is presumably ignored by as_query_engine - confirm.
query_engine4 = vector_index4.as_query_engine(chat_mode="best", llm=OpenAI(model="gpt-3.5-turbo", api_key=OPENAI_API_KEY), verbose=True)
query_engine5 = vector_index5.as_query_engine(chat_mode="best", llm=OpenAI(model="gpt-3.5-turbo", api_key=OPENAI_API_KEY), verbose=True)

In [14]:
from datasets import load_dataset

# Validation split of MedMCQA restricted to rows whose subject_name is
# exactly "Anatomy"; this is the benchmark every query engine is scored on.
dataset_validation = load_dataset("openlifescienceai/medmcqa", split="validation").filter(lambda example: example["subject_name"] == "Anatomy")

# query engine 1

In [16]:
def predict1(prompt):
    """Send ``prompt`` to query engine 1 and return its response as text."""
    response = query_engine1.query(prompt)
    return str(response)

def evaluate1(predict=None, dataset=None):
  """Score a predictor on four-option multiple-choice questions.

  Every example is rendered as a prompt containing the question and its four
  options. When the reply is not exactly one of the options, the reply and a
  corrective instruction are appended to the prompt and the predictor is
  asked one more time; the second reply is final.

  Args:
    predict: Callable that takes the prompt string and returns the answer
      string. Defaults to ``predict1``, looked up when the function is called.
    dataset: Sized iterable of mappings with the keys ``question``, ``opa``,
      ``opb``, ``opc``, ``opd`` and ``cop`` (index of the correct option
      within opa..opd). Defaults to ``dataset_validation``.

  Returns:
    Tuple ``(exact_match, mismatch)``: the fraction of final answers equal to
    the correct option, and the fraction of final answers that are none of
    the four options. ``(0.0, 0.0)`` when the dataset is empty.
  """
  predict = predict1 if predict is None else predict
  dataset = dataset_validation if dataset is None else dataset

  sample_cnt = len(dataset)
  if sample_cnt == 0:
    # Nothing to score; avoids dividing by zero below.
    return 0.0, 0.0

  retry_instruction = (
    "\n\nYour response does not exactly match one of the choices from the list. "
    "Do not apologise or include any text other than one of the options from the list "
    "verbatim without any label. Here are the options again\n\n"
  )

  exact_cnt = 0
  mismatch_cnt = 0

  for example in dataset:
    options = [example["opa"], example["opb"], example["opc"], example["opd"]]
    reference = options[example["cop"]]

    prompt = (
      f"{example['question']}\n\n"
      + "\n".join(options)
      + "\n\nRespond with the correct choice from the list above verbatim.  Do not include any explanation."
    )

    prediction = predict(prompt)
    if prediction not in options:
      # Separate the rejected answer from the instruction sentence; it used
      # to be glued directly onto the end of the original prompt.
      prompt += "\n\n" + prediction + retry_instruction + "\n\n".join(options)
      prediction = predict(prompt)

    exact_cnt += prediction == reference
    mismatch_cnt += prediction not in options

  return exact_cnt / sample_cnt, mismatch_cnt / sample_cnt

# Score query engine 1 on every Anatomy validation question (one or two
# queries per question, so this cell is slow and incurs API cost).
exact_match, mismatch = evaluate1()

In [17]:
# Report the scores of the most recent evaluation run; `exact_match` and
# `mismatch` are overwritten by every evaluation cell in this notebook.
print("exact_match:", exact_match)
print("mismatch:", mismatch)

exact_match: 0.5897435897435898
mismatch: 0.03418803418803419


# query engine 2

In [18]:
def predict2(prompt):
    """Send ``prompt`` to query engine 2 and return its response as text."""
    response = query_engine2.query(prompt)
    return str(response)

def evaluate2(predict=None, dataset=None):
  """Score a predictor on four-option multiple-choice questions.

  Every example is rendered as a prompt containing the question and its four
  options. When the reply is not exactly one of the options, the reply and a
  corrective instruction are appended to the prompt and the predictor is
  asked one more time; the second reply is final.

  Args:
    predict: Callable that takes the prompt string and returns the answer
      string. Defaults to ``predict2``, looked up when the function is called.
    dataset: Sized iterable of mappings with the keys ``question``, ``opa``,
      ``opb``, ``opc``, ``opd`` and ``cop`` (index of the correct option
      within opa..opd). Defaults to ``dataset_validation``.

  Returns:
    Tuple ``(exact_match, mismatch)``: the fraction of final answers equal to
    the correct option, and the fraction of final answers that are none of
    the four options. ``(0.0, 0.0)`` when the dataset is empty.
  """
  predict = predict2 if predict is None else predict
  dataset = dataset_validation if dataset is None else dataset

  sample_cnt = len(dataset)
  if sample_cnt == 0:
    # Nothing to score; avoids dividing by zero below.
    return 0.0, 0.0

  retry_instruction = (
    "\n\nYour response does not exactly match one of the choices from the list. "
    "Do not apologise or include any text other than one of the options from the list "
    "verbatim without any label. Here are the options again\n\n"
  )

  exact_cnt = 0
  mismatch_cnt = 0

  for example in dataset:
    options = [example["opa"], example["opb"], example["opc"], example["opd"]]
    reference = options[example["cop"]]

    prompt = (
      f"{example['question']}\n\n"
      + "\n".join(options)
      + "\n\nRespond with the correct choice from the list above verbatim.  Do not include any explanation."
    )

    prediction = predict(prompt)
    if prediction not in options:
      # Separate the rejected answer from the instruction sentence; it used
      # to be glued directly onto the end of the original prompt.
      prompt += "\n\n" + prediction + retry_instruction + "\n\n".join(options)
      prediction = predict(prompt)

    exact_cnt += prediction == reference
    mismatch_cnt += prediction not in options

  return exact_cnt / sample_cnt, mismatch_cnt / sample_cnt

# Score query engine 2 on every Anatomy validation question (one or two
# queries per question, so this cell is slow and incurs API cost).
exact_match, mismatch = evaluate2()

In [19]:
# Report the scores of the most recent evaluation run; `exact_match` and
# `mismatch` are overwritten by every evaluation cell in this notebook.
print("exact_match:", exact_match)
print("mismatch:", mismatch)

exact_match: 0.5598290598290598
mismatch: 0.038461538461538464


# query engine 3

In [20]:
def predict3(prompt):
    """Send ``prompt`` to query engine 3 and return its response as text."""
    response = query_engine3.query(prompt)
    return str(response)

def evaluate3(predict=None, dataset=None):
  """Score a predictor on four-option multiple-choice questions.

  Every example is rendered as a prompt containing the question and its four
  options. When the reply is not exactly one of the options, the reply and a
  corrective instruction are appended to the prompt and the predictor is
  asked one more time; the second reply is final.

  Args:
    predict: Callable that takes the prompt string and returns the answer
      string. Defaults to ``predict3``, looked up when the function is called.
    dataset: Sized iterable of mappings with the keys ``question``, ``opa``,
      ``opb``, ``opc``, ``opd`` and ``cop`` (index of the correct option
      within opa..opd). Defaults to ``dataset_validation``.

  Returns:
    Tuple ``(exact_match, mismatch)``: the fraction of final answers equal to
    the correct option, and the fraction of final answers that are none of
    the four options. ``(0.0, 0.0)`` when the dataset is empty.
  """
  predict = predict3 if predict is None else predict
  dataset = dataset_validation if dataset is None else dataset

  sample_cnt = len(dataset)
  if sample_cnt == 0:
    # Nothing to score; avoids dividing by zero below.
    return 0.0, 0.0

  retry_instruction = (
    "\n\nYour response does not exactly match one of the choices from the list. "
    "Do not apologise or include any text other than one of the options from the list "
    "verbatim without any label. Here are the options again\n\n"
  )

  exact_cnt = 0
  mismatch_cnt = 0

  for example in dataset:
    options = [example["opa"], example["opb"], example["opc"], example["opd"]]
    reference = options[example["cop"]]

    prompt = (
      f"{example['question']}\n\n"
      + "\n".join(options)
      + "\n\nRespond with the correct choice from the list above verbatim.  Do not include any explanation."
    )

    prediction = predict(prompt)
    if prediction not in options:
      # Separate the rejected answer from the instruction sentence; it used
      # to be glued directly onto the end of the original prompt.
      prompt += "\n\n" + prediction + retry_instruction + "\n\n".join(options)
      prediction = predict(prompt)

    exact_cnt += prediction == reference
    mismatch_cnt += prediction not in options

  return exact_cnt / sample_cnt, mismatch_cnt / sample_cnt

# Score query engine 3 on every Anatomy validation question (one or two
# queries per question, so this cell is slow and incurs API cost).
exact_match, mismatch = evaluate3()

In [21]:
# Report the scores of the most recent evaluation run; `exact_match` and
# `mismatch` are overwritten by every evaluation cell in this notebook.
print("exact_match:", exact_match)
print("mismatch:", mismatch)

exact_match: 0.5213675213675214
mismatch: 0.038461538461538464


# query engine 4

In [32]:
def predict4(prompt):
    """Send ``prompt`` to query engine 4 and return its response as text."""
    response = query_engine4.query(prompt)
    return str(response)

def evaluate4(predict=None, dataset=None):
  """Score a predictor on four-option multiple-choice questions.

  Every example is rendered as a prompt containing the question and its four
  options. When the reply is not exactly one of the options, the reply and a
  corrective instruction are appended to the prompt and the predictor is
  asked one more time; the second reply is final.

  Args:
    predict: Callable that takes the prompt string and returns the answer
      string. Defaults to ``predict4``, looked up when the function is called.
    dataset: Sized iterable of mappings with the keys ``question``, ``opa``,
      ``opb``, ``opc``, ``opd`` and ``cop`` (index of the correct option
      within opa..opd). Defaults to ``dataset_validation``.

  Returns:
    Tuple ``(exact_match, mismatch)``: the fraction of final answers equal to
    the correct option, and the fraction of final answers that are none of
    the four options. ``(0.0, 0.0)`` when the dataset is empty.
  """
  predict = predict4 if predict is None else predict
  dataset = dataset_validation if dataset is None else dataset

  sample_cnt = len(dataset)
  if sample_cnt == 0:
    # Nothing to score; avoids dividing by zero below.
    return 0.0, 0.0

  retry_instruction = (
    "\n\nYour response does not exactly match one of the choices from the list. "
    "Do not apologise or include any text other than one of the options from the list "
    "verbatim without any label. Here are the options again\n\n"
  )

  exact_cnt = 0
  mismatch_cnt = 0

  for example in dataset:
    options = [example["opa"], example["opb"], example["opc"], example["opd"]]
    reference = options[example["cop"]]

    prompt = (
      f"{example['question']}\n\n"
      + "\n".join(options)
      + "\n\nRespond with the correct choice from the list above verbatim.  Do not include any explanation."
    )

    prediction = predict(prompt)
    if prediction not in options:
      # Separate the rejected answer from the instruction sentence; it used
      # to be glued directly onto the end of the original prompt.
      prompt += "\n\n" + prediction + retry_instruction + "\n\n".join(options)
      prediction = predict(prompt)

    exact_cnt += prediction == reference
    mismatch_cnt += prediction not in options

  return exact_cnt / sample_cnt, mismatch_cnt / sample_cnt

# Score query engine 4 on every Anatomy validation question (one or two
# queries per question, so this cell is slow and incurs API cost).
exact_match, mismatch = evaluate4()

In [33]:
# Report the scores of the most recent evaluation run; `exact_match` and
# `mismatch` are overwritten by every evaluation cell in this notebook.
print("exact_match:", exact_match)
print("mismatch:", mismatch)

exact_match: 0.5598290598290598
mismatch: 0.03418803418803419


# query engine 5

In [34]:
def predict5(prompt):
    """Send ``prompt`` to query engine 5 and return its response as text."""
    response = query_engine5.query(prompt)
    return str(response)

def evaluate5(predict=None, dataset=None):
  """Score a predictor on four-option multiple-choice questions.

  Every example is rendered as a prompt containing the question and its four
  options. When the reply is not exactly one of the options, the reply and a
  corrective instruction are appended to the prompt and the predictor is
  asked one more time; the second reply is final.

  Args:
    predict: Callable that takes the prompt string and returns the answer
      string. Defaults to ``predict5``, looked up when the function is called.
    dataset: Sized iterable of mappings with the keys ``question``, ``opa``,
      ``opb``, ``opc``, ``opd`` and ``cop`` (index of the correct option
      within opa..opd). Defaults to ``dataset_validation``.

  Returns:
    Tuple ``(exact_match, mismatch)``: the fraction of final answers equal to
    the correct option, and the fraction of final answers that are none of
    the four options. ``(0.0, 0.0)`` when the dataset is empty.
  """
  predict = predict5 if predict is None else predict
  dataset = dataset_validation if dataset is None else dataset

  sample_cnt = len(dataset)
  if sample_cnt == 0:
    # Nothing to score; avoids dividing by zero below.
    return 0.0, 0.0

  retry_instruction = (
    "\n\nYour response does not exactly match one of the choices from the list. "
    "Do not apologise or include any text other than one of the options from the list "
    "verbatim without any label. Here are the options again\n\n"
  )

  exact_cnt = 0
  mismatch_cnt = 0

  for example in dataset:
    options = [example["opa"], example["opb"], example["opc"], example["opd"]]
    reference = options[example["cop"]]

    prompt = (
      f"{example['question']}\n\n"
      + "\n".join(options)
      + "\n\nRespond with the correct choice from the list above verbatim.  Do not include any explanation."
    )

    prediction = predict(prompt)
    if prediction not in options:
      # Separate the rejected answer from the instruction sentence; it used
      # to be glued directly onto the end of the original prompt.
      prompt += "\n\n" + prediction + retry_instruction + "\n\n".join(options)
      prediction = predict(prompt)

    exact_cnt += prediction == reference
    mismatch_cnt += prediction not in options

  return exact_cnt / sample_cnt, mismatch_cnt / sample_cnt

# Score query engine 5 on every Anatomy validation question (one or two
# queries per question, so this cell is slow and incurs API cost).
exact_match, mismatch = evaluate5()

In [35]:
# Report the scores of the most recent evaluation run; `exact_match` and
# `mismatch` are overwritten by every evaluation cell in this notebook.
print("exact_match:", exact_match)
print("mismatch:", mismatch)

exact_match: 0.5427350427350427
mismatch: 0.021367521367521368


# comparison

In [27]:
def predict1(prompt):
    """Send ``prompt`` to query engine 1 and return its response as text.

    This repeats the ``predict1`` defined earlier in the notebook, with the
    same behavior.
    """
    response = query_engine1.query(prompt)
    return str(response)

def predict2(prompt):
    """Send ``prompt`` to query engine 2 and return its response as text.

    This repeats the ``predict2`` defined earlier in the notebook, with the
    same behavior.
    """
    response = query_engine2.query(prompt)
    return str(response)

def evaluate4():
  """Compare query engines 1 and 2 per question while scoring engine 1.

  Both engines receive the same multiple-choice prompt, and each gets one
  retry when its reply is not exactly one of the four options. Whenever
  engine 1 ends with a valid option but engine 2 does not, the function
  prints engine 1's final prompt and the nodes each engine retrieves for its
  own final prompt, followed by a separator line.

  NOTE: this function shares its name with the engine-4 evaluator defined
  earlier in the notebook; whichever cell ran last determines what
  ``evaluate4`` refers to.

  Returns:
    Tuple ``(exact_match, mismatch)`` computed from engine 1's final answers
    only: the fraction equal to the correct option, and the fraction that
    are none of the four options. ``(0.0, 0.0)`` when the dataset is empty.
  """
  sample_cnt = len(dataset_validation)
  if sample_cnt == 0:
    # Nothing to score; avoids dividing by zero below.
    return 0.0, 0.0

  retry_instruction = (
    "\n\nYour response does not exactly match one of the choices from the list. "
    "Do not apologise or include any text other than one of the options from the list "
    "verbatim without any label. Here are the options again\n\n"
  )

  exact_cnt = 0
  mismatch_cnt = 0

  for example in dataset_validation:
    options = [example["opa"], example["opb"], example["opc"], example["opd"]]
    reference = options[example["cop"]]

    prompt1 = (
      f"{example['question']}\n\n"
      + "\n".join(options)
      + "\n\nRespond with the correct choice from the list above verbatim.  Do not include any explanation."
    )
    prompt2 = prompt1

    prediction1 = predict1(prompt1)
    if prediction1 not in options:
      # Separate the rejected answer from the instruction sentence; it used
      # to be glued directly onto the end of the original prompt.
      prompt1 += "\n\n" + prediction1 + retry_instruction + "\n\n".join(options)
      prediction1 = predict1(prompt1)

    prediction2 = predict2(prompt2)
    if prediction2 not in options:
      prompt2 += "\n\n" + prediction2 + retry_instruction + "\n\n".join(options)
      prediction2 = predict2(prompt2)

    # Surface the questions where only engine 2 failed to produce a valid
    # option, together with what each engine retrieved.
    if prediction1 in options and prediction2 not in options:
      print(prompt1)
      print(str(query_engine1.retrieve(prompt1)))
      print(str(query_engine2.retrieve(prompt2)))
      print('\n\n-----------------\n\n')

    # Only engine 1's answers contribute to the returned scores.
    exact_cnt += prediction1 == reference
    mismatch_cnt += prediction1 not in options

  return exact_cnt / sample_cnt, mismatch_cnt / sample_cnt

# Run the engine 1 vs engine 2 comparison defined in this cell; the returned
# scores describe engine 1 only, and the diverging cases are printed below.
exact_match, mismatch = evaluate4()

Mental foramen is located:

Between roots of premolars
Between roots of molars
Near canine
Between Incisors

Respond with the correct choice from the list above verbatim.  Do not include any explanation.
[NodeWithScore(node=TextNode(id_='076d3f5c-feda-47a6-afb8-58a1cf62ed8a', embedding=None, metadata={'page_label': '535', 'file_name': 'anatomybook.pdf', 'file_path': 'anatomybook.pdf', 'file_type': 'application/pdf', 'file_size': 52581393, 'creation_date': '2024-05-27', 'last_modified_date': '2024-05-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e37fd5a7-da28-4a5e-bfcd-58c204db9f13', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '535', 'file_name': 'anatomybook.pdf', 'file_path': 'ana