# **Install Required Libraries**

In [None]:
pip install transformers huggingface-hub accelerate hf_xet

# **Login with HuggingFace Token**

In [None]:
import os

from huggingface_hub import login

# Never commit an access token to the notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, `token=None` makes
# `login` ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

# **Importing Libraries**

In [1]:
import json
import torch
import re
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline)

# **Setting Device to GPU**

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# **Load Embedding Model**

In [3]:
def load_clinicalbert():
    """Load the Bio_ClinicalBERT tokenizer and encoder.

    The encoder is moved to the notebook-level ``device`` and switched to
    evaluation mode before being returned.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
    bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = AutoModel.from_pretrained(checkpoint)
    encoder = encoder.to(device)
    # Inference only: put the module in eval mode.
    encoder.eval()
    return bert_tokenizer, encoder

# **Method to get the embedding of the question**

In [4]:
def embed_text(text, tokenizer, model):
    """Return the first-token embedding of ``text`` as a 1-D NumPy array.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``model`` without gradient tracking, and the hidden state at sequence
    position 0 of the first batch item is returned.

    Args:
        text: Input text passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and can be unpacked as keyword arguments for ``model``.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.

    Returns:
        numpy.ndarray: Vector of size ``hidden_dim``.
    """
    # Send the inputs to wherever the model's weights actually live instead of
    # relying on a notebook-global `device`; this keeps the function
    # self-contained and avoids device-mismatch errors.
    model_device = next(model.parameters()).device
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    ).to(model_device)
    with torch.no_grad():
        outputs = model(**inputs)
    # [:, 0, :] selects sequence position 0 for every batch row; [0] keeps the first row.
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]

# **Method to Calculate Cosine Similarity**

In [5]:
def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors as a scalar."""
    # scikit-learn works on 2-D inputs, so each vector is wrapped as a
    # single-row matrix and the lone entry of the 1x1 result is returned.
    similarity_matrix = cosine_similarity([vec1], [vec2])
    return similarity_matrix[0][0]

# **Load Deepseek Model**

In [7]:
def load_llm_pipeline(model_id="deepseek-ai/deepSeek-llm-7b-base", max_new_tokens=50):
    """Build a text-generation pipeline (DeepSeek 7B base by default).

    Note: this notebook defines ``load_llm_pipeline`` in several cells; the
    definition from the cell executed last is the one that is used.

    Args:
        model_id: Hugging Face Hub checkpoint to load.
        max_new_tokens: Default cap on generated tokens per call.

    Returns:
        A ``transformers`` text-generation pipeline.
    """
    # NOTE(review): trust_remote_code=True allows code shipped in the Hub
    # repository to run locally; confirm it is actually required here.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
    )

# **Load Mistral Model**

In [6]:
def load_llm_pipeline(model_id="mistralai/Mistral-7B-Instruct-v0.3", max_new_tokens=50):
    """Build a text-generation pipeline (Mistral 7B Instruct v0.3 by default).

    Note: this notebook defines ``load_llm_pipeline`` in several cells; the
    definition from the cell executed last is the one that is used.

    Args:
        model_id: Hugging Face Hub checkpoint to load.
        max_new_tokens: Default cap on generated tokens per call.

    Returns:
        A ``transformers`` text-generation pipeline.
    """
    # NOTE(review): trust_remote_code=True allows code shipped in the Hub
    # repository to run locally; confirm it is actually required here.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
    )

# **Load Llama Model**

In [None]:
def load_llm_pipeline(model_id="meta-llama/Llama-3.1-8B-Instruct", max_new_tokens=200):
    """Build a text-generation pipeline (Llama 3.1 8B Instruct by default).

    Note: this notebook defines ``load_llm_pipeline`` in several cells; the
    definition from the cell executed last is the one that is used.

    Args:
        model_id: Hugging Face Hub checkpoint to load.
        max_new_tokens: Default cap on generated tokens per call.

    Returns:
        A ``transformers`` text-generation pipeline.
    """
    # NOTE(review): trust_remote_code=True allows code shipped in the Hub
    # repository to run locally; confirm it is actually required here.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
    )

# **Query LLM Output**

In [11]:
def query_llm(llm_pipe, prompt):
    """Run greedy generation for ``prompt`` and return only the model's answer text.

    Args:
        llm_pipe: Callable invoked as ``llm_pipe(prompt, do_sample=False)``
            that returns a list whose first item has a ``"generated_text"`` key.
        prompt: Prompt string sent to the model.

    Returns:
        str: The stripped continuation produced by the model.
    """
    result = llm_pipe(prompt, do_sample=False)[0]["generated_text"]

    # When the generated text echoes the prompt, drop exactly that prefix.
    # Splitting on the last "Answer:" would instead return the tail of any
    # extra "Question ... Answer: X" block the model invents afterwards and
    # lose the answer to the question that was actually asked.
    if result.startswith(prompt):
        return result[len(prompt):].strip()

    # No echoed prompt: keep whatever follows the last "Answer:" marker, if any.
    if "Answer:" in result:
        return result.split("Answer:")[-1].strip()
    return result.strip()

# **Extract Answer from Output**

In [12]:
def extract_answer(model_output, options):
    """Map free-form model output to the index of the chosen option.

    The output is not uppercased before letter matching, so ordinary words
    such as the article "a" are not mistaken for option letter A. Matching
    is attempted from the most to the least explicit signal:

    1. The first line starts with an option letter ("B", "b.", "(C)", "D is ...").
    2. An answer phrase names an uppercase letter ("the answer is C", "Option B").
    3. The text of an option appears in the output (case-insensitive). The
       earliest occurrence wins; on a tie the longer option wins, so an
       option that is a substring of another does not shadow it.
    4. Any standalone uppercase option letter.

    Args:
        model_output: Text produced by the model.
        options: Option texts in order (index 0 is option A).

    Returns:
        int: Zero-based option index, or -1 when nothing is recognized.
        When ``options`` is non-empty the index is always < ``len(options)``;
        with empty ``options`` the letters A-E are accepted.
    """
    text = (model_output or "").strip()
    if not text:
        return -1

    # Only letters that correspond to a real option are valid answers.
    n_letters = min(len(options), 26) or 5
    letters = f"[A-{chr(ord('A') + n_letters - 1)}]"
    choices = {chr(ord("A") + i): i for i in range(n_letters)}

    # 1) Leading letter on the first line, followed by punctuation, " is", or nothing.
    first_line = text.splitlines()[0].strip()
    lead = re.match(
        rf"\(?({letters})\)?(?:\s*[.):\-]+(?=\s|$)|\s+is\b|\s*$)",
        first_line,
        flags=re.IGNORECASE,
    )
    if lead:
        return choices[lead.group(1).upper()]

    # 2) Keyword (any case) followed by an uppercase option letter.
    phrase = re.search(
        rf"(?i:answer|option|choice|choose|select|pick)(?i:\s+is)?[\s:*\-]*\(?({letters})\)?\b",
        text,
    )
    if phrase:
        return choices[phrase.group(1)]

    # 3) Full option text quoted in the output.
    lowered = text.lower()
    best = None  # ((position, -length), index) of the best match so far
    for idx, opt in enumerate(options):
        needle = str(opt).strip().lower()
        if not needle:
            continue  # an empty option would match every output
        pos = lowered.find(needle)
        if pos == -1:
            continue
        key = (pos, -len(needle))
        if best is None or key < best[0]:
            best = (key, idx)
    if best is not None:
        return best[1]

    # 4) Last resort: a standalone uppercase option letter anywhere.
    bare = re.search(rf"\b({letters})\b", text)
    if bare:
        return choices[bare.group(1)]

    # If no match found
    return -1  # unrecognized

# **Loading Our Model**

In [None]:
# Setup cell: creates the notebook-level objects used by later cells.
print("Setting up...")
# Same device-selection expression as the earlier device cell, repeated here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ClinicalBERT tokenizer and encoder (inputs expected by embed_text).
tokenizer, bert_model = load_clinicalbert()
# Text-generation pipeline; main() reads this `llm_pipe` global.
# Which model is loaded depends on which load_llm_pipeline definition was executed last.
llm_pipe = load_llm_pipeline()

# **Prompt**

In [None]:
def build_prompt(question, options):
    """Build the multiple-choice prompt sent to the LLM.

    Options are lettered A, B, C, ... in the order given. Any number of
    options is supported; for exactly five options the prompt text is the
    same as with the previous fixed A-E template.

    Args:
        question: Question text.
        options: Sequence of option texts (index 0 becomes option A).

    Returns:
        str: Prompt ending with an "Answer:" line for the model to complete.
    """
    letters = [chr(ord("A") + i) for i in range(len(options))]
    option_lines = "\n".join(
        f"{letter}. {text}" for letter, text in zip(letters, options)
    )
    # Human-readable letter list, e.g. "A, B, C, or D".
    if len(letters) > 2:
        letter_list = ", ".join(letters[:-1]) + f", or {letters[-1]}"
    else:
        letter_list = " or ".join(letters)

    prompt = f"""
                You are a highly knowledgeable and reliable AI assistant trained in clinical reasoning and evidence-based medicine.Your task is to analyze and answer multiple-choice questions, which require integration of clinical findings, pathophysiological understanding, and diagnostic reasoning.
        Carefully evaluate the clinical scenario and the answer options provided. Determine the single best answer based on the information given.

Question:
{question}

Options:
{option_lines}

Select the single best option ({letter_list}) that most accurately answers the medical question
Answer:
"""
    return prompt

# **Main**

In [9]:
def _load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records, skipping blank lines."""
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def main(data_path="medqal.jsonl", pipe=None):
    """Evaluate the LLM on a MedQA-style JSONL file and print the metrics.

    Each record must provide ``"question"``, ``"options"`` (a dict keyed by
    option letter) and ``"answer"`` (the text of the correct option). For
    every record the prompt is built, the model is queried, the predicted
    option is extracted and a per-question report is printed. Accuracy and
    micro/macro F1 are printed at the end; unrecognized predictions are
    scored as class -1.

    Args:
        data_path: Path to the JSONL dataset.
        pipe: Text-generation pipeline to query. Defaults to the
            notebook-level ``llm_pipe``.
    """
    if pipe is None:
        pipe = llm_pipe  # pipeline created by the notebook's setup cell

    # Load MedQA dataset
    medqa_data = _load_jsonl(data_path)
    if not medqa_data:
        print(f"No questions found in {data_path}; nothing to evaluate.")
        return

    # Ground-truth and predicted option indices (used for evaluation)
    y_true = []
    y_pred = []

    print("Running QA loop...")
    for idx, sample in enumerate(tqdm(medqa_data, desc="Evaluating")):
        question = sample["question"]
        option_dict = sample["options"]  # dict: {'A': ..., 'B': ..., ...}
        options = [option_dict[k] for k in sorted(option_dict)]  # letter order
        correct_idx = options.index(sample["answer"])  # answer is given as option text

        prompt = build_prompt(question, options)
        output = query_llm(pipe, prompt)

        pred_idx = extract_answer(output, options)
        # Treat an index that does not point at a real option as unrecognized,
        # so it is counted as invalid and cannot raise IndexError below.
        if not 0 <= pred_idx < len(options):
            pred_idx = -1

        y_true.append(correct_idx)
        y_pred.append(pred_idx)

        # Per-question report
        print(f"\n--- Question {idx + 1} ---")
        print(f"Question: {question}")
        print(f"Options: {options}")
        print(f"LLM Output: {output}")
        if pred_idx != -1:
            print(f"Predicted Answer: {chr(65 + pred_idx)}. {options[pred_idx]}")
        else:
            print("Predicted Answer: Invalid or Unrecognized")
        print(f"Correct Answer: {chr(65 + correct_idx)}. {options[correct_idx]}")

    # Evaluation (accuracy + F1 score)
    accuracy = accuracy_score(y_true, y_pred)
    f1_micro = f1_score(y_true, y_pred, average="micro")
    f1_macro = f1_score(y_true, y_pred, average="macro")

    print("\n--- Evaluation Results ---")
    print(f"Total Questions: {len(y_true)}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"F1 Score (micro): {f1_micro:.2%}")
    print(f"F1 Score (macro): {f1_macro:.2%}")
    print(f"Invalid Predictions: {sum(p == -1 for p in y_pred)}")

In [None]:
# Run the evaluation loop defined by main() and print the per-question and summary results.
main()