# **Install Required Libraries**

In [None]:
pip install transformers huggingface-hub accelerate hf_xet

# **Login with HuggingFace Token**

In [None]:
import os

from huggingface_hub import login

# Never store an access token in the notebook itself. The token is read from
# the HF_TOKEN environment variable; when it is unset, token is None and
# login() falls back to its own interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# **Importing Libraries**

In [None]:
import json
import torch
import re
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline)

# **Setting Device to GPU**

In [None]:
# Use a CUDA GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# **Load Embedding Model**

In [None]:
def load_clinicalbert():
    """Load the Bio_ClinicalBERT tokenizer and encoder for inference.

    Returns:
        tuple: ``(tokenizer, model)``; the model has been moved to the
        module-level ``device`` and put into evaluation mode.
    """
    checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
    bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = AutoModel.from_pretrained(checkpoint)
    encoder = encoder.to(device)
    encoder.eval()  # inference only
    return bert_tokenizer, encoder

# **Method to get the embedding of the question**

In [None]:
def embed_text(text, tokenizer, model):
    """Embed ``text`` and return the vector at sequence position 0.

    Args:
        text: Text to embed; it is truncated to at most 512 tokens.
        tokenizer: Tokenizer that produces PyTorch tensors for ``model``.
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        numpy.ndarray: Hidden state of the first token (the [CLS] position for
        BERT-style tokenizers) of the first sequence in the batch.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    # No gradients are needed for feature extraction.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    first_token_vectors = hidden_states[:, 0, :]
    return first_token_vectors.cpu().numpy()[0]

# **Method to Calculate Cosine Similarity**

In [None]:
def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two vectors as a single number."""
    # cosine_similarity works on 2-D inputs (one row per sample), so each
    # vector is wrapped in a one-row batch and the only cell is extracted.
    pairwise = cosine_similarity([vec1], [vec2])
    return pairwise[0][0]

# **Normalize Method**

In [None]:
def normalize(text):
    """Canonicalize a mention: trim it, lowercase it, and turn spaces into underscores."""
    trimmed = text.strip()
    # Splitting on a single space and re-joining keeps one "_" per space.
    return "_".join(trimmed.lower().split(" "))

# **Load Llama-3.1-8B-Instruct Model**

In [None]:
def load_llm_pipeline():
    """Build a text-generation pipeline around Llama-3.1-8B-Instruct.

    Returns:
        A ``transformers`` text-generation pipeline that generates at most
        200 new tokens per call.
    """
    model_id = "meta-llama/Llama-3.1-8B-Instruct"

    llm_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Reuse the end-of-sequence token as the padding token.
    llm_tokenizer.pad_token = llm_tokenizer.eos_token

    model_kwargs = dict(
        torch_dtype=torch.float16,  # half precision weights
        device_map="auto",
        trust_remote_code=True,
    )
    llm = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

    return pipeline("text-generation", model=llm, tokenizer=llm_tokenizer, max_new_tokens=200)

# **Query LLM Output**

In [None]:
def query_llm(llm_pipe, prompt):
    """Run ``prompt`` through the pipeline greedily and return the answer text.

    Args:
        llm_pipe: Callable returning a list whose first item is a dict with a
            ``"generated_text"`` entry.
        prompt: Prompt string sent to the model.

    Returns:
        str: The stripped text after the last ``"Answer:"`` marker, or the
        whole stripped generation when the marker does not occur.
    """
    generations = llm_pipe(prompt, do_sample=False)
    full_text = generations[0]["generated_text"]

    marker = "Answer:"
    if marker not in full_text:
        return full_text.strip()

    # Keep only what follows the final marker.
    _, _, tail = full_text.rpartition(marker)
    return tail.strip()

# **Extract Answer from Output**

In [None]:
def extract_answer(model_output, options):
    """Map the LLM's raw answer text to a zero-based option index.

    Resolution order:
      1. The output starts with a letter A-E (any case) that is followed by
         ``.``, ``)``, ``:`` or the end of the text, e.g. ``"C"``, ``"(b)"``,
         ``"d. Warfarin"``.
      2. The first standalone *uppercase* letter A-E anywhere in the output.
      3. The first option whose full text occurs in the output
         (case-insensitive).

    Args:
        model_output: Text generated by the model.
        options: Option texts in A-E order.

    Returns:
        int: Index 0-4 of the selected option, or -1 when nothing matched.
    """
    # Dictionary to map option letters to their corresponding index
    choices = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
    text = model_output.strip()

    # A leading letter with answer-style punctuation is accepted in any case.
    leading = re.match(r"\W*([A-Ea-e])(?:[.):]|\W*$)", text)
    if leading:
        return choices[leading.group(1).upper()]

    # Search the original-case text: upper-casing the whole output first would
    # turn ordinary words such as the article "a" into the answer letter "A".
    match = re.search(r"\b([A-E])\b", text)
    if match:
        return choices[match.group(1)]

    # Fallback: the model answered with the option text instead of a letter.
    lowered = text.lower()
    for idx, opt in enumerate(options):
        # An empty option would be a substring of every output, so skip it.
        if opt and opt.lower() in lowered:
            return idx

    # If no match found
    return -1  # unrecognized

# **Loading Our Models**

In [None]:
# Instantiate both models once; the evaluation cells below reuse these
# module-level objects (tokenizer, bert_model, llm_pipe).
print("Setting up...")
# NOTE(review): repeats the device selection from the earlier cell with the
# same expression — redundant, but it makes this cell runnable on its own.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ClinicalBERT tokenizer + encoder used for question embeddings.
tokenizer, bert_model = load_clinicalbert()
# Llama text-generation pipeline used to answer the questions.
llm_pipe = load_llm_pipeline()

# **Prompts and Main() with Llama Model**

## **1st: Normal Prompt**

In [None]:
def build_prompt(question, options):
    """Build the plain (zero-shot) multiple-choice prompt.

    Args:
        question: The question stem.
        options: Sequence with at least five option texts, in A-E order.

    Returns:
        str: The prompt, ending with an ``Answer:`` line for the model to complete.
    """
    # One "<letter>. <text>" line per option, A through E.
    option_block = "\n".join(
        f"{letter}. {options[position]}" for position, letter in enumerate("ABCDE")
    )
    return f"""
                You are a highly knowledgeable and reliable AI assistant trained in clinical reasoning and evidence-based medicine.Your task is to analyze and answer multiple-choice questions, which require integration of clinical findings, pathophysiological understanding, and diagnostic reasoning.
        Carefully evaluate the clinical scenario and the answer options provided. Determine the single best answer based on the information given.

Question:
{question}

Options:
{option_block}

Select the single best option (A, B, C, D, or E) that most accurately answers the medical question
Answer:
"""

In [None]:
def main():
    """Evaluate the LLM on MedQA with the current ``build_prompt`` and print metrics.

    Reads ``medqal.jsonl`` (one JSON object per line with ``question``,
    ``options`` and ``answer`` entries), queries the module-level ``llm_pipe``
    once per question, prints every prediction, and finally prints accuracy,
    micro/macro F1 and the number of unparseable answers.
    """
    # Load MEDQA dataset. JSON text is UTF-8, so do not depend on the platform
    # default encoding; blank lines (e.g. a trailing newline) are skipped
    # because json.loads rejects empty input.
    with open("medqal.jsonl", encoding="utf-8") as f:
        medqa_data = [json.loads(line) for line in f if line.strip()]

    # Gold and predicted option indices (used for evaluation); -1 in y_pred
    # marks an output from which no answer could be extracted.
    y_true = []
    y_pred = []
    total = 0

    print("Running QA loop...")
    for idx, sample in enumerate(tqdm(medqa_data, desc="Evaluating")):
        question = sample["question"]
        option_dict = sample["options"]  # dict: {'A': ..., 'B': ..., ...}
        options = [option_dict[k] for k in sorted(option_dict.keys())]  # Keep order: A-E
        # The gold answer is looked up among the option texts to get its index.
        correct_idx = options.index(sample["answer"])
        total += 1

        # Prompt -> generation -> parsed option index
        prompt = build_prompt(question, options)
        output = query_llm(llm_pipe, prompt)
        pred_idx = extract_answer(output, options)

        y_true.append(correct_idx)
        y_pred.append(pred_idx)

        # Print answer for each question
        print(f"\n--- Question {idx + 1} ---")
        print(f"Question: {question}")
        print(f"Options: {options}")
        print(f"LLM Output: {output}")
        if pred_idx != -1:
            print(f"Predicted Answer: {chr(65 + pred_idx)}. {options[pred_idx]}")
        else:
            print("Predicted Answer: Invalid or Unrecognized")
        print(f"Correct Answer: {chr(65 + correct_idx)}. {options[correct_idx]}")

    # Evaluation (accuracy + F1_score)
    accuracy = accuracy_score(y_true, y_pred)
    f1_micro = f1_score(y_true, y_pred, average="micro")
    f1_macro = f1_score(y_true, y_pred, average="macro")

    print("\n--- Evaluation Results ---")
    print(f"Total Questions: {total}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"F1 Score (micro): {f1_micro:.2%}")
    print(f"F1 Score (macro): {f1_macro:.2%}")
    print(f"Invalid Predictions: {sum(p == -1 for p in y_pred)}")

In [None]:
# Runs whichever main() was defined most recently in the kernel; in notebook
# order that is the plain-prompt baseline evaluation.
main()

## **2nd: Prompting With FewShot**

In [None]:
def build_prompt(question, options):
    """Build the few-shot prompt: two worked examples followed by the target question.

    Args:
        question: The question stem.
        options: Sequence with at least five option texts, in A-E order.

    Returns:
        str: The prompt, ending with an ``Answer:`` line for the model to complete.
    """
    # (question, option texts in A-E order, correct letter)
    worked_examples = (
        (
            "A 45-year-old man has had worsening fatigue and pallor. Labs show low hemoglobin. A bone marrow biopsy reveals a hypercellular marrow with increased myeloblasts. What is the most likely diagnosis?",
            (
                "Aplastic anemia",
                "Myelodysplastic syndrome",
                "Chronic myeloid leukemia",
                "Acute myeloid leukemia",
                "Multiple myeloma",
            ),
            "D",
        ),
        (
            "A 29-year-old woman presents with acute onset chest pain and dyspnea. CT shows a pulmonary embolism. She recently returned from a long flight. Which of the following is the best initial treatment?",
            (
                "Aspirin",
                "Warfarin",
                "Low molecular weight heparin",
                "IV corticosteroids",
                "Thrombolytics",
            ),
            "C",
        ),
    )

    def render_example(stem, choices, key):
        """Format one solved example in the same layout as the target question."""
        listing = "\n".join(
            f"{letter}. {choice}" for letter, choice in zip("ABCDE", choices)
        )
        return f"""
Question:
{stem}

Options:
{listing}
Answer: {key}
"""

    # Examples are separated by one extra newline.
    examples_string = "\n".join(
        render_example(stem, choices, key) for stem, choices, key in worked_examples
    )

    return f"""
You are an expert medical question-answering AI. Your task is to accurately answer the given medical question by carefully analyzing the provided question and the five multiple-choice options.

{examples_string}
---

Question:
{question}

Options:
A. {options[0]}
B. {options[1]}
C. {options[2]}
D. {options[3]}
E. {options[4]}
---

Based on the comprehensive information above, including the question and all five options, select the single best option (A, B, C, D, or E) that most accurately answers the medical question.
Answer:
"""

In [None]:
def main():
    """Evaluate the LLM on MedQA with the current ``build_prompt`` and print metrics.

    Reads ``medqal.jsonl`` (one JSON object per line with ``question``,
    ``options`` and ``answer`` entries), queries the module-level ``llm_pipe``
    once per question, prints every prediction, and finally prints accuracy,
    micro/macro F1 and the number of unparseable answers.
    """
    # Load MEDQA dataset. JSON text is UTF-8, so do not depend on the platform
    # default encoding; blank lines (e.g. a trailing newline) are skipped
    # because json.loads rejects empty input.
    with open("medqal.jsonl", encoding="utf-8") as f:
        medqa_data = [json.loads(line) for line in f if line.strip()]

    # Gold and predicted option indices (used for evaluation); -1 in y_pred
    # marks an output from which no answer could be extracted.
    y_true = []
    y_pred = []
    total = 0

    print("Running QA loop...")
    for idx, sample in enumerate(tqdm(medqa_data, desc="Evaluating")):
        question = sample["question"]
        option_dict = sample["options"]  # dict: {'A': ..., 'B': ..., ...}
        options = [option_dict[k] for k in sorted(option_dict.keys())]  # Keep order: A-E
        # The gold answer is looked up among the option texts to get its index.
        correct_idx = options.index(sample["answer"])
        total += 1

        # Prompt -> generation -> parsed option index
        prompt = build_prompt(question, options)
        output = query_llm(llm_pipe, prompt)
        pred_idx = extract_answer(output, options)

        y_true.append(correct_idx)
        y_pred.append(pred_idx)

        # Print answer for each question
        print(f"\n--- Question {idx + 1} ---")
        print(f"Question: {question}")
        print(f"Options: {options}")
        print(f"LLM Output: {output}")
        if pred_idx != -1:
            print(f"Predicted Answer: {chr(65 + pred_idx)}. {options[pred_idx]}")
        else:
            print("Predicted Answer: Invalid or Unrecognized")
        print(f"Correct Answer: {chr(65 + correct_idx)}. {options[correct_idx]}")

    # Evaluation (accuracy + F1_score)
    accuracy = accuracy_score(y_true, y_pred)
    f1_micro = f1_score(y_true, y_pred, average="micro")
    f1_macro = f1_score(y_true, y_pred, average="macro")

    print("\n--- Evaluation Results ---")
    print(f"Total Questions: {total}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"F1 Score (micro): {f1_micro:.2%}")
    print(f"F1 Score (macro): {f1_macro:.2%}")
    print(f"Invalid Predictions: {sum(p == -1 for p in y_pred)}")

In [None]:
# Runs whichever main() was defined most recently in the kernel; in notebook
# order that is the few-shot evaluation.
main()

## **3rd: Prompting + PubMed Context**

In [None]:
def build_prompt(question, options, top1, top2, top3, top4, top5):
    """Build the prompt that adds retrieved context for every answer option.

    Args:
        question: The question stem.
        options: Sequence with at least five option texts, in A-E order.
        top1..top5: The options ranked by similarity to the question. Each is
            indexable with the option text at position 0 and an iterable of
            context sentences at position 2.

    Returns:
        str: The prompt, ending with ``Answer:`` for the model to complete.
    """
    ranked = (top1, top2, top3, top4, top5)
    # One "Option: <text>" header followed by its sentences, per ranked option.
    context_sections = [
        f"Option: {entry[0]}\n" + "\n".join(entry[2]) for entry in ranked
    ]
    context = "Context from top relevant options:\n\n" + "\n\n".join(context_sections)

    # The instructions describe what is actually supplied (context for all five
    # options, best match first), and the context block is separated from the
    # option list by a blank line so it is not read as part of option E.
    prompt = f"""
You are a medical expert answering clinical board-style multiple choice questions.

First, **carefully understand** the given question and all five answer options.

Then, review the **contextual information** provided for all five options, listed from most to least semantically relevant (based on domain embeddings). Use this context as supporting evidence.

Choose the **single best answer** based on both the question and the contextual support.

If the context isn't enough to decide confidently, still make your best choice.

---

Question:
{question}

Options:
A. {options[0]}
B. {options[1]}
C. {options[2]}
D. {options[3]}
E. {options[4]}

{context}

Final Answer (only one letter A/B/C/D/E, followed by the option text):
Answer:"""
    return prompt

In [None]:
def normalize(text):
    """Canonicalize a mention by trimming surrounding whitespace and lowercasing it."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
def main():
    """Evaluate the LLM on MedQA with retrieved context for every option.

    For each question, the question embedding is compared (cosine similarity)
    with the stored ``avg_vector`` of each answer option found in
    ``mention_vectors.json``. The options, ranked by that score and paired with
    up to five stored ``top_sentences`` each, are handed to ``build_prompt``.
    Per-question predictions and the final accuracy / F1 scores are printed.
    """
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open("mention_vectors.json", encoding="utf-8") as f:
        mention_data = json.load(f)

    # Re-key the knowledge source with normalize() so that options can be
    # looked up in the same canonical form.
    normalized_mention_data = {normalize(k): v for k, v in mention_data.items()}

    # Load MEDQA dataset; blank lines (e.g. a trailing newline) are skipped
    # because json.loads rejects empty input.
    with open("medqal.jsonl", encoding="utf-8") as f:
        medqa_data = [json.loads(line) for line in f if line.strip()]

    # Gold and predicted option indices (used for evaluation); -1 in y_pred
    # marks an output from which no answer could be extracted.
    y_true = []
    y_pred = []
    total = 0

    print("Running QA loop...")
    for idx, sample in enumerate(tqdm(medqa_data, desc="Evaluating")):
        question = sample["question"]
        option_dict = sample["options"]  # dict: {'A': ..., 'B': ..., ...}
        options = [option_dict[k] for k in sorted(option_dict.keys())]  # Keep order: A-E
        # The gold answer is looked up among the option texts to get its index.
        correct_idx = options.index(sample["answer"])
        total += 1

        q_vec = embed_text(question, tokenizer, bert_model)

        # Score every option as (option text, similarity, context sentences).
        scored = []
        for opt in options:
            opt_norm = normalize(opt)
            if opt_norm not in normalized_mention_data:
                # Unknown options keep their place with the lowest score and no context.
                print(f"[WARN] Option not found in mention_data: '{opt}'")
                scored.append((opt, -1.0, []))
                continue

            data = normalized_mention_data[opt_norm]
            sim = cosine_sim(q_vec, data["avg_vector"])
            context = data["top_sentences"][:5]
            scored.append((opt, sim, context))

        # Rank options from most to least similar to the question.
        top_options = sorted(scored, key=lambda x: x[1], reverse=True)
        top1, top2, top3, top4, top5 = (
            top_options[0], top_options[1], top_options[2], top_options[3], top_options[4]
        )

        # Prompt -> generation -> parsed option index
        prompt = build_prompt(question, options, top1, top2, top3, top4, top5)
        output = query_llm(llm_pipe, prompt)
        pred_idx = extract_answer(output, options)

        y_true.append(correct_idx)
        y_pred.append(pred_idx)

        # Print answer for each question
        print(f"\n--- Question {idx + 1} ---")
        print(f"Question: {question}")
        print(f"Options: {options}")
        print(f"LLM Output: {output}")
        if pred_idx != -1:
            print(f"Predicted Answer: {chr(65 + pred_idx)}. {options[pred_idx]}")
        else:
            print("Predicted Answer: Invalid or Unrecognized")
        print(f"Correct Answer: {chr(65 + correct_idx)}. {options[correct_idx]}")
        print('*--*'*100)

    # Evaluation (accuracy + F1_score)
    accuracy = accuracy_score(y_true, y_pred)
    f1_micro = f1_score(y_true, y_pred, average="micro")
    f1_macro = f1_score(y_true, y_pred, average="macro")

    print("\n--- Evaluation Results ---")
    print(f"Total Questions: {total}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"F1 Score (micro): {f1_micro:.2%}")
    print(f"F1 Score (macro): {f1_macro:.2%}")
    print(f"Invalid Predictions: {sum(p == -1 for p in y_pred)}")

In [None]:
# Runs whichever main() was defined most recently in the kernel; in notebook
# order that is the PubMed-context evaluation.
main()

## **4th: Prompting + PubMed Context + Scoring**

In [None]:
def build_prompt(question, options, top1, top2, top3, top4, top5):
    """Build the prompt that shows relevance scores and evidence for each option.

    Args:
        question: The question stem.
        options: Sequence with at least five option texts, in A-E order.
        top1..top5: Ranked options; each is indexable with the option text at
            position 0, a numeric score at position 1 and an iterable of
            evidence sentences at position 2.

    Returns:
        str: The prompt, ending with an ``Answer:`` line for the model to complete.
    """
    # One newline-separated evidence block per ranked option.
    ctx1, ctx2, ctx3, ctx4, ctx5 = (
        "\n".join(entry[2]) for entry in (top1, top2, top3, top4, top5)
    )

    return f"""
You are a highly specialized and precise medical diagnostic AI. Your sole objective is to identify the single, unequivocally correct answer to the provided medical multiple-choice question.

---

## Medical Question:
{question}

## Candidate Options:
A. {options[0]}
B. {options[1]}
C. {options[2]}
D. {options[3]}
E. {options[4]}

---

## Relevance Scores:
These scores indicate the semantic alignment between the question and each option. Use them as a preliminary guide, but **do not solely rely on them**.

-   **Option '{top1[0]}'**: {top1[1]:.4f}
-   **Option '{top2[0]}'**: {top2[1]:.4f}
-   **Option '{top3[0]}'**: {top3[1]:.4f}
-   **Option '{top4[0]}'**: {top4[1]:.4f}
-   **Option '{top5[0]}'**: {top5[1]:.4f}

---

## Definitive Medical Evidence:
**This section contains the critical information.** Evaluate each piece of evidence thoroughly and discern which option it directly and robustly supports or refutes. This is the primary determinant for your answer.

### Evidence for Option '{top1[0]}':
{ctx1}

### Evidence for Option '{top2[0]}':
{ctx2}

### Evidence for Option '{top3[0]}':
{ctx3}

### Evidence for Option '{top4[0]}':
{ctx4}

### Evidence for Option '{top5[0]}':
{ctx5}

---

## Decision Protocol:
1.  **Prioritize Evidence**: The correct answer is the option that is most clearly and directly supported by the **Definitive Medical Evidence**.
2.  **Evaluate All Options**: Systematically consider each of the five options in light of *all* provided evidence.
3.  **Identify Unambiguous Support**: Select the option for which the evidence provides the strongest, most direct, and unequivocal confirmation. If evidence for a particular option is absent or weak, it is less likely to be the correct answer.
4.  **No Speculation**: Do not infer or speculate beyond the provided information. Stick strictly to what is directly supported.

Provide only the letter (A, B, C, D, or E) corresponding to the single best and most evidence-backed option.

Answer:
"""

In [None]:
def main():
    """Evaluate the LLM on MedQA with per-option relevance scores and context.

    For each question, the question embedding is compared (cosine similarity)
    with the stored ``avg_vector`` of each answer option found in
    ``mention_vectors.json``. The options, ranked by that score and paired with
    up to five stored ``top_sentences`` each, are handed to ``build_prompt``.
    Per-question predictions and the final accuracy / F1 scores are printed.
    """
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open("mention_vectors.json", encoding="utf-8") as f:
        mention_data = json.load(f)

    # Re-key the knowledge source with normalize() so that options can be
    # looked up in the same canonical form.
    normalized_mention_data = {normalize(k): v for k, v in mention_data.items()}

    # Load MEDQA dataset; blank lines (e.g. a trailing newline) are skipped
    # because json.loads rejects empty input.
    with open("medqal.jsonl", encoding="utf-8") as f:
        medqa_data = [json.loads(line) for line in f if line.strip()]

    # Gold and predicted option indices (used for evaluation); -1 in y_pred
    # marks an output from which no answer could be extracted.
    y_true = []
    y_pred = []
    total = 0

    print("Running QA loop...")
    for idx, sample in enumerate(tqdm(medqa_data, desc="Evaluating")):
        question = sample["question"]
        option_dict = sample["options"]  # dict: {'A': ..., 'B': ..., ...}
        options = [option_dict[k] for k in sorted(option_dict.keys())]  # Keep order: A-E
        # The gold answer is looked up among the option texts to get its index.
        correct_idx = options.index(sample["answer"])
        total += 1

        q_vec = embed_text(question, tokenizer, bert_model)

        # Score every option as (option text, similarity, context sentences).
        scored = []
        for opt in options:
            opt_norm = normalize(opt)
            print(f'Normalized Option: {opt_norm}')
            if opt_norm not in normalized_mention_data:
                # Unknown options keep their place with the lowest score and no context.
                print(f"[WARN] Option not found in mention_data: '{opt}'")
                scored.append((opt, -1.0, []))
                continue

            data = normalized_mention_data[opt_norm]
            sim = cosine_sim(q_vec, data["avg_vector"])
            context = data["top_sentences"][:5]
            scored.append((opt, sim, context))

        # Rank options from most to least similar to the question.
        top_options = sorted(scored, key=lambda x: x[1], reverse=True)
        top1, top2, top3, top4, top5 = (
            top_options[0], top_options[1], top_options[2], top_options[3], top_options[4]
        )

        # Prompt -> generation -> parsed option index
        prompt = build_prompt(question, options, top1, top2, top3, top4, top5)
        output = query_llm(llm_pipe, prompt)
        pred_idx = extract_answer(output, options)

        y_true.append(correct_idx)
        y_pred.append(pred_idx)

        # Print answer for each question
        print(f"\n--- Question {idx + 1} ---")
        print(f"Question: {question}")
        print(f"Options: {options}")
        print(f"LLM Output: {output}")
        if pred_idx != -1:
            print(f"Predicted Answer: {chr(65 + pred_idx)}. {options[pred_idx]}")
        else:
            print("Predicted Answer: Invalid or Unrecognized")
        print(f"Correct Answer: {chr(65 + correct_idx)}. {options[correct_idx]}")
        print('*--*'*100)

    # Evaluation (accuracy + F1_score)
    accuracy = accuracy_score(y_true, y_pred)
    f1_micro = f1_score(y_true, y_pred, average="micro")
    f1_macro = f1_score(y_true, y_pred, average="macro")

    print("\n--- Evaluation Results ---")
    print(f"Total Questions: {total}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"F1 Score (micro): {f1_micro:.2%}")
    print(f"F1 Score (macro): {f1_macro:.2%}")
    print(f"Invalid Predictions: {sum(p == -1 for p in y_pred)}")

In [None]:
# Runs whichever main() was defined most recently in the kernel; in notebook
# order that is the PubMed-context + scoring evaluation.
main()

## **5th: Prompting + UMLS + PubMed**

In [None]:
def build_prompt(question, options, top1, top2, top3, top4, top5):
    """Build the prompt that lists supporting evidence for each ranked option.

    Args:
        question: The question stem.
        options: Sequence with at least five option texts, in A-E order.
        top1..top5: Ranked options; each is indexable with the option text at
            position 0 and an iterable of evidence sentences at position 2.

    Returns:
        str: The prompt, ending with an ``Answer:`` line for the model to complete.
    """
    # Evidence retrieved from the knowledge source, one newline-joined block
    # per option, in ranking order.
    ctx1, ctx2, ctx3, ctx4, ctx5 = [
        "\n".join(entry[2]) for entry in (top1, top2, top3, top4, top5)
    ]
    return f"""
You are an expert medical question-answering AI. Your task is to accurately answer the given medical question by carefully analyzing the provided question, the five multiple-choice options, and the supporting evidence.

---

Question:
{question}

Options:
A. {options[0]}
B. {options[1]}
C. {options[2]}
D. {options[3]}
E. {options[4]}

--- Supporting Evidence ---

The following evidence is relevant to understanding the most plausible options:

Evidence for Option '{top1[0]}':
{ctx1}

Evidence for Option '{top2[0]}':
{ctx2}

Evidence for Option '{top3[0]}':
{ctx3}

Evidence for Option '{top4[0]}':
{ctx4}

Evidence for Option '{top5[0]}':
{ctx5}


---

Based on the comprehensive information above, select the single best option (A, B, C, D, or E) that most accurately answers the medical question.
Answer:
"""

In [None]:
def normalize(text):
    """Canonicalize a mention: trim it, lowercase it, and turn spaces into underscores."""
    lowered = text.strip().lower()
    # Splitting on a single space and re-joining keeps one "_" per space.
    words = lowered.split(" ")
    return "_".join(words)

In [None]:
def main():
    """Evaluate the LLM on MedQA with UMLS/PubMed evidence for every option.

    For each question, the question embedding is compared (cosine similarity)
    with the stored ``avg_vector`` of each answer option found in
    ``optionll_embeddings_and_sentences.json``. The options, ranked by that
    score and paired with up to two stored ``top_sentences`` each, are handed
    to ``build_prompt``. Per-question predictions and the final accuracy / F1
    scores are printed.
    """
    # Load mention vectors. JSON text is UTF-8, so do not depend on the
    # platform default encoding.
    with open("optionll_embeddings_and_sentences.json", encoding="utf-8") as f:
        mention_data = json.load(f)

    # Re-key the knowledge source with normalize() so that options can be
    # looked up in the same canonical form.
    normalized_mention_data = {normalize(k): v for k, v in mention_data.items()}

    # Load MEDQA dataset; blank lines (e.g. a trailing newline) are skipped
    # because json.loads rejects empty input.
    with open("medqal.jsonl", encoding="utf-8") as f:
        medqa_data = [json.loads(line) for line in f if line.strip()]

    # Gold and predicted option indices; -1 in y_pred marks an output from
    # which no answer could be extracted.
    y_true = []
    y_pred = []
    total = 0

    print("Running QA loop...")
    for idx, sample in enumerate(tqdm(medqa_data, desc="Evaluating")):
        question = sample["question"]
        option_dict = sample["options"]  # dict: {'A': ..., 'B': ..., ...}
        options = [option_dict[k] for k in sorted(option_dict.keys())]  # Keep order: A-E
        # The gold answer is looked up among the option texts to get its index.
        correct_idx = options.index(sample["answer"])
        total += 1

        q_vec = embed_text(question, tokenizer, bert_model)

        # Score every option as (option text, similarity, context sentences).
        scored = []
        for opt in options:
            opt_norm = normalize(opt)

            if opt_norm not in normalized_mention_data:
                # Unknown options keep their place with the lowest score and no context.
                print(f"[WARN] Option not found in mention_data: '{opt}'")
                scored.append((opt, -1.0, []))
                continue

            data = normalized_mention_data[opt_norm]
            sim = cosine_sim(q_vec, data["avg_vector"])
            context = data["top_sentences"][:2]
            scored.append((opt, sim, context))

        # Rank options from most to least similar to the question and take the
        # five entries in order. Indices must be 0-4: the previous 0,1,3,4,5
        # skipped the third-ranked option and indexed past the end of a
        # five-element list.
        top_options = sorted(scored, key=lambda x: x[1], reverse=True)
        top1, top2, top3, top4, top5 = (
            top_options[0], top_options[1], top_options[2], top_options[3], top_options[4]
        )

        # Prompt -> generation -> parsed option index
        prompt = build_prompt(question, options, top1, top2, top3, top4, top5)
        output = query_llm(llm_pipe, prompt)
        pred_idx = extract_answer(output, options)

        y_true.append(correct_idx)
        y_pred.append(pred_idx)

        # Print answer for each question
        print(f"\n--- Question {idx + 1} ---")
        print(f"Question: {question}")
        print(f"Options: {options}")
        print(f"LLM Output: {output}")
        if pred_idx != -1:
            print(f"Predicted Answer: {chr(65 + pred_idx)}. {options[pred_idx]}")
        else:
            print("Predicted Answer: Invalid or Unrecognized")
        print(f"Correct Answer: {chr(65 + correct_idx)}. {options[correct_idx]}")
        print('-'*100)

    # Evaluation
    correct = [yt == yp for yt, yp in zip(y_true, y_pred)]
    accuracy = accuracy_score(y_true, y_pred)
    f1_micro = f1_score(y_true, y_pred, average="micro")
    f1_macro = f1_score(y_true, y_pred, average="macro")

    print("\n--- Evaluation Results ---")
    print(f"Total Questions: {total}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"F1 Score (micro): {f1_micro:.2%}")
    print(f"F1 Score (macro): {f1_macro:.2%}")
    print(f"Top-1 Hits: {sum(correct)} / {total}")
    print(f"Invalid Predictions: {sum(p == -1 for p in y_pred)}")

In [None]:
# Runs whichever main() was defined most recently in the kernel; in notebook
# order that is the UMLS + PubMed evaluation.
main()

## **6th: Prompting + UMLS + PubMed + Scoring**

In [None]:
def build_prompt(question, options, top1, top2, top3, top4, top5):
    """Build the prompt that shows relevance scores and evidence for each option.

    Args:
        question: The question stem.
        options: Sequence with at least five option texts, in A-E order.
        top1..top5: Ranked options; each is indexable with the option text at
            position 0, a numeric score at position 1 and an iterable of
            evidence sentences at position 2.

    Returns:
        str: The prompt, ending with an ``Answer:`` line for the model to complete.
    """
    # Newline-joined evidence sentences, one block per ranked option.
    evidence_blocks = ["\n".join(entry[2]) for entry in (top1, top2, top3, top4, top5)]
    ctx1, ctx2, ctx3, ctx4, ctx5 = evidence_blocks

    return f"""
You are a highly specialized and precise medical diagnostic AI. Your sole objective is to identify the single, unequivocally correct answer to the provided medical multiple-choice question.

---

## Medical Question:
{question}

## Candidate Options:
A. {options[0]}
B. {options[1]}
C. {options[2]}
D. {options[3]}
E. {options[4]}

---

## Relevance Scores:
These scores indicate the semantic alignment between the question and each option. Use them as a preliminary guide, but **do not solely rely on them**.

-   **Option '{top1[0]}'**: {top1[1]:.4f}
-   **Option '{top2[0]}'**: {top2[1]:.4f}
-   **Option '{top3[0]}'**: {top3[1]:.4f}
-   **Option '{top4[0]}'**: {top4[1]:.4f}
-   **Option '{top5[0]}'**: {top5[1]:.4f}

---

## Definitive Medical Evidence:
**This section contains the critical information.** Evaluate each piece of evidence thoroughly and discern which option it directly and robustly supports or refutes. This is the primary determinant for your answer.

### Evidence for Option '{top1[0]}':
{ctx1}

### Evidence for Option '{top2[0]}':
{ctx2}

### Evidence for Option '{top3[0]}':
{ctx3}

### Evidence for Option '{top4[0]}':
{ctx4}

### Evidence for Option '{top5[0]}':
{ctx5}

---

## Decision Protocol:
1.  **Prioritize Evidence**: The correct answer is the option that is most clearly and directly supported by the **Definitive Medical Evidence**.
2.  **Evaluate All Options**: Systematically consider each of the five options in light of *all* provided evidence.
3.  **Identify Unambiguous Support**: Select the option for which the evidence provides the strongest, most direct, and unequivocal confirmation. If evidence for a particular option is absent or weak, it is less likely to be the correct answer.
4.  **No Speculation**: Do not infer or speculate beyond the provided information. Stick strictly to what is directly supported.

Provide only the letter (A, B, C, D, or E) corresponding to the single best and most evidence-backed option.

Answer:
"""

In [None]:
def main():
    """Evaluate the LLM on MedQA with UMLS/PubMed evidence and relevance scores.

    For each question, the question embedding is compared (cosine similarity)
    with the stored ``avg_vector`` of each answer option found in
    ``optionll_embeddings_and_sentences.json``. The options, ranked by that
    score and paired with up to five stored ``top_sentences`` each, are handed
    to ``build_prompt``. Per-question predictions and the final accuracy / F1
    scores are printed.
    """
    # Load mention vectors from our knowledge source. JSON text is UTF-8, so
    # do not depend on the platform default encoding.
    with open("optionll_embeddings_and_sentences.json", encoding="utf-8") as f:
        mention_data = json.load(f)

    # Re-key the knowledge source with normalize() so that options can be
    # looked up in the same canonical form.
    normalized_mention_data = {normalize(k): v for k, v in mention_data.items()}

    # Load MEDQA dataset; blank lines (e.g. a trailing newline) are skipped
    # because json.loads rejects empty input.
    with open("medqal.jsonl", encoding="utf-8") as f:
        medqa_data = [json.loads(line) for line in f if line.strip()]

    # Gold and predicted option indices (used for evaluation); -1 in y_pred
    # marks an output from which no answer could be extracted.
    y_true = []
    y_pred = []
    total = 0

    print("Running QA loop...")
    for idx, sample in enumerate(tqdm(medqa_data, desc="Evaluating")):
        question = sample["question"]
        option_dict = sample["options"]  # dict: {'A': ..., 'B': ..., ...}
        options = [option_dict[k] for k in sorted(option_dict.keys())]  # Keep order: A-E
        # The gold answer is looked up among the option texts to get its index.
        correct_idx = options.index(sample["answer"])
        total += 1

        q_vec = embed_text(question, tokenizer, bert_model)

        # Score every option as (option text, similarity, context sentences).
        scored = []
        for opt in options:
            opt_norm = normalize(opt)
            if opt_norm not in normalized_mention_data:
                # Unknown options keep their place with the lowest score and no context.
                print(f"[WARN] Option not found in mention_data: '{opt}'")
                scored.append((opt, -1.0, []))
                continue

            data = normalized_mention_data[opt_norm]
            sim = cosine_sim(q_vec, data["avg_vector"])
            context = data["top_sentences"][:5]
            scored.append((opt, sim, context))

        # Rank options from most to least similar to the question.
        top_options = sorted(scored, key=lambda x: x[1], reverse=True)
        top1, top2, top3, top4, top5 = (
            top_options[0], top_options[1], top_options[2], top_options[3], top_options[4]
        )

        # Prompt -> generation -> parsed option index
        prompt = build_prompt(question, options, top1, top2, top3, top4, top5)
        output = query_llm(llm_pipe, prompt)
        pred_idx = extract_answer(output, options)
        print(f"Predicted Index/Answer: {pred_idx}")
        print('*'*10)
        print("\n")

        y_true.append(correct_idx)
        y_pred.append(pred_idx)

        # Print answer for each question
        print(f"\n--- Question {idx + 1} ---")
        print(f"Question: {question}")
        print(f"Options: {options}")
        print(f"LLM Output: {output}")
        if pred_idx != -1:
            print(f"Predicted Answer: {chr(65 + pred_idx)}. {options[pred_idx]}")
        else:
            print("Predicted Answer: Invalid or Unrecognized")
        print(f"Correct Answer: {chr(65 + correct_idx)}. {options[correct_idx]}")
        print('*--*'*100)

    # Evaluation (accuracy + F1_score)
    accuracy = accuracy_score(y_true, y_pred)
    f1_micro = f1_score(y_true, y_pred, average="micro")
    f1_macro = f1_score(y_true, y_pred, average="macro")

    print("\n--- Evaluation Results ---")
    print(f"Total Questions: {total}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"F1 Score (micro): {f1_micro:.2%}")
    print(f"F1 Score (macro): {f1_macro:.2%}")
    print(f"Invalid Predictions: {sum(p == -1 for p in y_pred)}")

In [None]:
# Runs whichever main() was defined most recently in the kernel; in notebook
# order that is the UMLS + PubMed + scoring evaluation.
main()