In [1]:
# Install necessary libraries if not already installed
!pip install -q transformers datasets tqdm

# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import re
from tqdm import tqdm
import random

# Check if CUDA (GPU) is available, and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the GPT-Neo model and tokenizer, and move the model to the device
model_name = "microsoft/Phi-3.5-mini-instruct"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the MedQA dataset with training and testing splits
dataset = load_dataset("GBaker/MedQA-USMLE-4-options")
train_dataset = dataset['train']
test_dataset = dataset['test']

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.7/61.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.5/239.5 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/654 [00:00<?, ?B/s]

phrases_no_exclude_train.jsonl:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

phrases_no_exclude_test.jsonl:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

In [2]:
# Function to create few-shot examples from the training data
def create_few_shot_examples(train_data, num_examples):
    """
    Create a few-shot prompt using num_examples samples from the training dataset.
    Each example will include a question, options, and the correct answer.
    """
    # Shuffle and select random examples from the training set for diversity
    shuffled_data = random.sample(list(train_data), len(train_data))

    examples = ""
    selected_categories = set()  # Keep track of categories to ensure diversity

    # Use tqdm to monitor the progress of few-shot example creation
    for sample in tqdm(shuffled_data, desc="Creating few-shot examples", total=len(train_data)):
        question = sample['question']
        options = sample['options']
        correct_answer = sample['answer']  # Correct answer is the key ('A', 'B', 'C', or 'D')

        # Ensure diversity by limiting to different topics or question types (if available)
        if len(selected_categories) >= num_examples:
            break

        # Prepare the few-shot format (Question, Options, Answer)
        options_text = ', '.join([f"{key}: {value}" for key, value in options.items()])
        example = f"Question: {question}\nOptions: {options_text}\nAnswer: {correct_answer}\n\n"
        examples += example

        # Use some logic to track different question types/domains (optional)
        selected_categories.add(question)  # Replace with refined logic for topic detection

    return examples

# Function to perform inference with few-shot prompting and match answer using regex
def get_model_prediction(question, options_dict, few_shot_prompt):
    """
    Generate a few-shot prediction using GPT-Neo for a question with multiple choices.
    The options are provided as a dictionary with keys: 'A', 'B', 'C', 'D'.
    The few-shot examples are passed as part of the prompt.
    """
    # Prepare the prompt with few-shot examples + the current question
    options_text = ', '.join([f"{key}: {value}" for key, value in options_dict.items()])
    input_text = f"{few_shot_prompt}\nQuestion: {question}\nOptions: {options_text}\nAnswer:"''

    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # Generate output (can be tuned using temperature, max_length, etc.)
    output = model.generate(inputs["input_ids"], max_new_tokens=200)

    # Decode the output to text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)


    ind=generated_text.find(question)
    extracted_text=generated_text[ind+len(question)+len(options_text)+10:]
    ind2=extracted_text.find("Question")
    predicted=extracted_text[:ind2].strip()

    return predicted

In [None]:
# Evaluate model on the test dataset with tqdm to monitor the progress
correct_predictions = 0
total_predictions = 0

# Add tqdm to track the evaluation loop
for sample in tqdm(test_dataset, desc="Evaluating on test dataset", total=len(test_dataset)):
    few_shot_prompt = create_few_shot_examples(train_dataset, num_examples=3)
    question = sample['question']
    options = sample['options']  # This is a dictionary with 'A', 'B', 'C', 'D' as keys
    correct_answer = sample['answer']  # Correct answer is the key ('A', 'B', 'C', or 'D')

    # Get the model's prediction using few-shot prompting
    predicted_answer = get_model_prediction(question, options, few_shot_prompt)

    print(predicted_answer)
    print(correct_answer)

    # Check if prediction is correct
    if correct_answer in predicted_answer:
        correct_predictions += 1

    total_predictions += 1

    # Calculate and print the accuracy after each iteration
    accuracy = correct_predictions / total_predictions * 100
    print(f"Iteration {total_predictions}: Accuracy so far: {accuracy:.2f}%")


# Final accuracy
final_accuracy = correct_predictions / total_predictions * 100
print(f"Final Accuracy: {final_accuracy:.2f}%")

Evaluating on test dataset:   0%|          | 0/1273 [00:00<?, ?it/s]
Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 25471.48it/s]
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
You are not running the flash-attention implementation, expect numerical differences.
Evaluating on test dataset:   0%|          | 1/1273 [00:52<18:37:20, 52.70s/it]

Answer: Tell the attending that he cannot fail to disclose this mistake
Tell the attending that he cannot fail to disclose this mistake
Iteration 1: Accuracy so far: 100.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 22550.02it/s]
Evaluating on test dataset:   0%|          | 2/1273 [01:43<18:11:06, 51.51s/it]

Answer: Inhibition of proteasome
Cross-linking of DNA
Iteration 2: Accuracy so far: 50.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29606.85it/s]
Evaluating on test dataset:   0%|          | 3/1273 [02:44<19:43:06, 55.90s/it]

Answer: Cholesterol embolization
Cholesterol embolization
Iteration 3: Accuracy so far: 66.67%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33465.19it/s]
Evaluating on test dataset:   0%|          | 4/1273 [03:43<20:04:51, 56.97s/it]

Answer: Encapsulated, gram-negative coccobacilli forming grey-colored colonies on charcoal blood agar
Lactose-fermenting, gram-negative rods forming pink colonies on MacConkey agar
Iteration 4: Accuracy so far: 50.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29468.18it/s]
Evaluating on test dataset:   0%|          | 5/1273 [04:41<20:12:46, 57.39s/it]

Answer: Ketotifen eye drops
Ketotifen eye drops
Iteration 5: Accuracy so far: 60.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 17722.41it/s]
Evaluating on test dataset:   0%|          | 6/1273 [05:42<20:43:12, 58.87s/it]

Answer: Reassurance and continuous monitoring
Reassurance and continuous monitoring
Iteration 6: Accuracy so far: 66.67%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 23607.71it/s]
Evaluating on test dataset:   1%|          | 7/1273 [06:39<20:22:49, 57.95s/it]

Answer: Benign prostatic hyperplasia
Common iliac artery aneurysm
Iteration 7: Accuracy so far: 57.14%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 20901.85it/s]
Evaluating on test dataset:   1%|          | 8/1273 [07:39<20:40:15, 58.83s/it]

Answer: Clopidogrel
Clopidogrel
Iteration 8: Accuracy so far: 62.50%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27838.30it/s]
Evaluating on test dataset:   1%|          | 9/1273 [08:40<20:51:57, 59.43s/it]

Answer: Active or recurrent pelvic inflammatory disease (PID)
Active or recurrent pelvic inflammatory disease (PID)
Iteration 9: Accuracy so far: 66.67%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 23967.45it/s]
Evaluating on test dataset:   1%|          | 10/1273 [09:33<20:09:39, 57.47s/it]

Answer: Silvery plaques on extensor surfaces
Silvery plaques on extensor surfaces
Iteration 10: Accuracy so far: 70.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33465.19it/s]
Evaluating on test dataset:   1%|          | 11/1273 [10:32<20:15:13, 57.78s/it]

Answer: It is an HIV-1/HIV2 antibody differentiation immunoassay
It is an HIV-1/HIV2 antibody differentiation immunoassay
Iteration 11: Accuracy so far: 72.73%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 25890.77it/s]
Evaluating on test dataset:   1%|          | 12/1273 [11:31<20:25:05, 58.29s/it]

Answer: Ruxolitinib
Ruxolitinib
Iteration 12: Accuracy so far: 75.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 32513.98it/s]
Evaluating on test dataset:   1%|          | 13/1273 [12:29<20:21:44, 58.18s/it]

Answer: Meningioma
Meningioma
Iteration 13: Accuracy so far: 76.92%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 34285.86it/s]
Evaluating on test dataset:   1%|          | 14/1273 [13:24<20:03:58, 57.38s/it]

Answer: D: A reduction in diastolic filling time
A reduction in diastolic filling time
Iteration 14: Accuracy so far: 78.57%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 31775.03it/s]
Evaluating on test dataset:   1%|          | 15/1273 [14:23<20:11:02, 57.76s/it]

Answer: Vaccinia virus
Rotavirus
Iteration 15: Accuracy so far: 73.33%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 25216.26it/s]
Evaluating on test dataset:   1%|▏         | 16/1273 [15:21<20:10:49, 57.80s/it]

Answer: Acute cholecystitis
Gallbladder cancer
Iteration 16: Accuracy so far: 68.75%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 20763.88it/s]
Evaluating on test dataset:   1%|▏         | 17/1273 [16:31<21:23:41, 61.32s/it]

Answer: IL-13
IL-4
Iteration 17: Accuracy so far: 64.71%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27654.75it/s]
Evaluating on test dataset:   1%|▏         | 18/1273 [17:33<21:31:20, 61.74s/it]

Answer: Matching
Matching
Iteration 18: Accuracy so far: 66.67%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24769.51it/s]
Evaluating on test dataset:   1%|▏         | 19/1273 [18:34<21:25:29, 61.51s/it]

Answer: Indomethacin +/- omeprazole
Ibuprofen + colchicine +/- omeprazole
Iteration 19: Accuracy so far: 63.16%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30690.03it/s]
Evaluating on test dataset:   2%|▏         | 20/1273 [19:42<22:05:47, 63.49s/it]

Answer: Ethanol intoxication
Benzodiazepine intoxication
"
Iteration 20: Accuracy so far: 60.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 25368.77it/s]
Evaluating on test dataset:   2%|▏         | 21/1273 [20:48<22:18:15, 64.13s/it]

Answer: Previous breast cancer
Previous radiation therapy
Iteration 21: Accuracy so far: 57.14%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 19599.55it/s]
Evaluating on test dataset:   2%|▏         | 22/1273 [21:49<22:00:21, 63.33s/it]

Answer: Lithium exposure in utero
22q11 deletion
Iteration 22: Accuracy so far: 54.55%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:01, 8349.64it/s]
Evaluating on test dataset:   2%|▏         | 23/1273 [22:54<22:09:54, 63.84s/it]

Answer: Histoplasma capsulatum infection
Histoplasma capsulatum infection
Iteration 23: Accuracy so far: 56.52%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28149.69it/s]
Evaluating on test dataset:   2%|▏         | 24/1273 [24:00<22:20:37, 64.40s/it]

Answer: Streptococcus pneumoniae
Staphylococcus aureus
Iteration 24: Accuracy so far: 54.17%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 21546.08it/s]
Evaluating on test dataset:   2%|▏         | 25/1273 [25:07<22:33:48, 65.09s/it]

Answer: Intubate with mechanical ventilation
Intubate with mechanical ventilation
Iteration 25: Accuracy so far: 56.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 23786.22it/s]
Evaluating on test dataset:   2%|▏         | 26/1273 [26:15<22:54:28, 66.13s/it]

Answer: Respiratory burst
Respiratory burst
Iteration 26: Accuracy so far: 57.69%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30467.10it/s]
Evaluating on test dataset:   2%|▏         | 27/1273 [27:17<22:22:53, 64.67s/it]

Answer: Steeple sign
Steeple sign
Iteration 27: Accuracy so far: 59.26%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27176.92it/s]
Evaluating on test dataset:   2%|▏         | 28/1273 [28:27<22:58:23, 66.43s/it]

Answer: A: Induction of CYP3A4 by rifampin leading to decreased serum levels of ethinylestradiol and progesterone
Induction of CYP3A4 by rifampin leading to decreased serum levels of ethinylestradiol and progesterone
Iteration 28: Accuracy so far: 60.71%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28992.88it/s]
Evaluating on test dataset:   2%|▏         | 29/1273 [29:36<23:09:34, 67.02s/it]

Answer: Increased cerebrospinal fluid protein with normal cell count
"
Increased cerebrospinal fluid protein with normal cell count
Iteration 29: Accuracy so far: 62.07%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 23045.63it/s]
Evaluating on test dataset:   2%|▏         | 30/1273 [30:38<22:41:29, 65.72s/it]

Answer: Reassurance
"
Reassurance
Iteration 30: Accuracy so far: 63.33%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 21326.97it/s]
Evaluating on test dataset:   2%|▏         | 31/1273 [31:41<22:19:50, 64.73s/it]

Answer: Obstruction of the cystic duct
Obstruction of the cystic duct
Iteration 31: Accuracy so far: 64.52%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 17822.82it/s]
Evaluating on test dataset:   3%|▎         | 32/1273 [32:39<21:42:07, 62.96s/it]

Answer: Impaired left ventricular contractility
Increased ventricular wall stiffness
Iteration 32: Accuracy so far: 62.50%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 22753.91it/s]
Evaluating on test dataset:   3%|▎         | 33/1273 [33:42<21:36:50, 62.75s/it]

Answer: Trimethoprim/sulfamethoxazole
Chloramphenicol
Iteration 33: Accuracy so far: 60.61%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26886.56it/s]
Evaluating on test dataset:   3%|▎         | 34/1273 [34:42<21:22:17, 62.10s/it]

Answer: Ectopic secretion of gastrin
Proliferation of gastric mucus-producing cells
Iteration 34: Accuracy so far: 58.82%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28468.13it/s]
Evaluating on test dataset:   3%|▎         | 35/1273 [35:45<21:23:03, 62.18s/it]

Answer: Insulin, potassium, IV fluids, and glucose
Insulin, potassium, IV fluids, and glucose
Iteration 35: Accuracy so far: 60.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 20695.58it/s]
Evaluating on test dataset:   3%|▎         | 36/1273 [36:44<21:04:39, 61.34s/it]

Answer: Psoriatic arthritis
Psoriatic arthritis
Iteration 36: Accuracy so far: 61.11%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28859.89it/s]
Evaluating on test dataset:   3%|▎         | 37/1273 [37:42<20:40:58, 60.24s/it]

Answer: Myasthenia gravis
Paraneoplastic syndrome from small cell carcinoma of the lung
Iteration 37: Accuracy so far: 59.46%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 25115.59it/s]
Evaluating on test dataset:   3%|▎         | 38/1273 [38:43<20:45:10, 60.49s/it]

Answer: Defective T cell function
Defective T cell function
Iteration 38: Accuracy so far: 60.53%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 22075.28it/s]
Evaluating on test dataset:   3%|▎         | 39/1273 [39:53<21:45:48, 63.49s/it]

Answer: 2.67
2.67
Iteration 39: Accuracy so far: 61.54%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27294.82it/s]
Evaluating on test dataset:   3%|▎         | 40/1273 [40:51<21:05:47, 61.60s/it]

Answer: Arcuate fasciculus
Arcuate fasciculus
Iteration 40: Accuracy so far: 62.50%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27176.92it/s]
Evaluating on test dataset:   3%|▎         | 41/1273 [41:58<21:43:52, 63.50s/it]

Answer: Polycystic ovarian syndrome (PCOS)
Polycystic ovarian syndrome (PCOS)
Iteration 41: Accuracy so far: 63.41%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 18836.69it/s]
Evaluating on test dataset:   3%|▎         | 42/1273 [42:59<21:26:12, 62.69s/it]

Answer: Strict blood glucose control
Strict blood glucose control
Iteration 42: Accuracy so far: 64.29%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:04, 2098.55it/s]
Evaluating on test dataset:   3%|▎         | 43/1273 [44:06<21:47:02, 63.76s/it]

Answer: Duodenal atresia
Duodenal atresia
Iteration 43: Accuracy so far: 65.12%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 21769.74it/s]
Evaluating on test dataset:   3%|▎         | 44/1273 [45:07<21:33:18, 63.14s/it]

Answer: Pulmonary artery
Coronary sinus
Iteration 44: Accuracy so far: 63.64%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26715.31it/s]
Evaluating on test dataset:   4%|▎         | 45/1273 [46:12<21:41:30, 63.59s/it]

Answer: Irregular 14-week sized uterus
Globular 10-week sized uterus
Iteration 45: Accuracy so far: 62.22%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 23130.35it/s]
Evaluating on test dataset:   4%|▎         | 46/1273 [47:14<21:32:52, 63.22s/it]

Answer: Fomepizole
Fomepizole
Iteration 46: Accuracy so far: 63.04%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30541.05it/s]
Evaluating on test dataset:   4%|▎         | 47/1273 [48:13<21:06:31, 61.98s/it]

Answer: B: 16
20
Iteration 47: Accuracy so far: 61.70%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29262.59it/s]
Evaluating on test dataset:   4%|▍         | 48/1273 [49:18<21:21:26, 62.76s/it]

Answer: Aortoiliac artery stenosis
Femoropopliteal artery stenosis
Iteration 48: Accuracy so far: 60.42%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 31694.99it/s]
Evaluating on test dataset:   4%|▍         | 49/1273 [50:17<20:56:40, 61.60s/it]

Answer: Obtain consent for parental genetic testing
Recommend autopsy of the infant
Iteration 49: Accuracy so far: 59.18%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29127.11it/s]
Evaluating on test dataset:   4%|▍         | 50/1273 [51:13<20:23:04, 60.00s/it]

Answer: Proliferation of surfactant-secreting cells
Proliferation of surfactant-secreting cells
Iteration 50: Accuracy so far: 60.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 19418.07it/s]
Evaluating on test dataset:   4%|▍         | 51/1273 [52:05<19:33:57, 57.64s/it]

Answer: Induces breaks in double-stranded DNA
Induces breaks in double-stranded DNA
Iteration 51: Accuracy so far: 60.78%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33734.35it/s]
Evaluating on test dataset:   4%|▍         | 52/1273 [53:04<19:41:18, 58.05s/it]

Answer: Aldosterone excess
Aldosterone excess
Iteration 52: Accuracy so far: 61.54%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 22429.43it/s]
Evaluating on test dataset:   4%|▍         | 53/1273 [53:59<19:22:18, 57.16s/it]

Answer: Defective hepatic bile excretion
Defective hepatic bile excretion
Iteration 53: Accuracy so far: 62.26%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 34192.70it/s]
Evaluating on test dataset:   4%|▍         | 54/1273 [55:00<19:40:53, 58.12s/it]

Answer: Metoclopramide
Atropine
Iteration 54: Accuracy so far: 61.11%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26214.40it/s]
Evaluating on test dataset:   4%|▍         | 55/1273 [55:56<19:28:01, 57.54s/it]

Answer: Tardive dyskinesia
Tardive dyskinesia
Iteration 55: Accuracy so far: 61.82%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26159.90it/s]
Evaluating on test dataset:   4%|▍         | 56/1273 [56:56<19:40:54, 58.22s/it]

Answer: KOH examination of lesion scrapings
KOH examination of lesion scrapings
Iteration 56: Accuracy so far: 62.50%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 23563.51it/s]
Evaluating on test dataset:   4%|▍         | 57/1273 [57:58<20:06:43, 59.54s/it]

Answer: Gynecomastia
Gynecomastia
Iteration 57: Accuracy so far: 63.16%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33734.35it/s]
Evaluating on test dataset:   5%|▍         | 58/1273 [59:00<20:16:10, 60.06s/it]

Answer: Variable β-sequence of the T cell receptor
Variable β-sequence of the T cell receptor
Iteration 58: Accuracy so far: 63.79%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30320.27it/s]
Evaluating on test dataset:   5%|▍         | 59/1273 [1:00:06<20:55:30, 62.05s/it]

Answer: Hemolytic uremic syndrome
Hemolytic uremic syndrome
Iteration 59: Accuracy so far: 64.41%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 35848.75it/s]
Evaluating on test dataset:   5%|▍         | 60/1273 [1:01:07<20:46:06, 61.64s/it]

Answer: Patients with this disorder are not further sub-typed
Patients with this disorder are not further sub-typed
Iteration 60: Accuracy so far: 65.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29676.68it/s]
Evaluating on test dataset:   5%|▍         | 61/1273 [1:02:05<20:21:24, 60.47s/it]

Answer: Salmonella typhi
Salmonella typhi
Iteration 61: Accuracy so far: 65.57%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 20393.70it/s]
Evaluating on test dataset:   5%|▍         | 62/1273 [1:03:03<20:06:52, 59.80s/it]

Answer: Cytochrome P-450 enzymes
Acetaldehyde
Iteration 62: Accuracy so far: 64.52%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 32181.36it/s]
Evaluating on test dataset:   5%|▍         | 63/1273 [1:04:05<20:22:36, 60.63s/it]

Answer: Serum B12 level
No tests required
Iteration 63: Accuracy so far: 63.49%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 32017.59it/s]
Evaluating on test dataset:   5%|▌         | 64/1273 [1:05:11<20:52:21, 62.15s/it]

Answer: Digital rectal exam
Insert a ‘straight cath’ into the patient’s bladder
Iteration 64: Accuracy so far: 62.50%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 22836.50it/s]
Evaluating on test dataset:   5%|▌         | 65/1273 [1:06:10<20:30:47, 61.13s/it]

Answer: Inhibition of 14-alpha-demethylase
Disruption of cell membrane permeability
Iteration 65: Accuracy so far: 61.54%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 17672.63it/s]
Evaluating on test dataset:   5%|▌         | 66/1273 [1:07:14<20:48:21, 62.06s/it]

Answer: Bulging disc impinging on lumbar spinal nerve
Lytic lesions of the lumbar spine
Iteration 66: Accuracy so far: 60.61%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27176.92it/s]
Evaluating on test dataset:   5%|▌         | 67/1273 [1:08:14<20:32:04, 61.30s/it]

Answer: Perform emergency laparotomy
Perform emergency laparotomy
Iteration 67: Accuracy so far: 61.19%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30467.10it/s]
Evaluating on test dataset:   5%|▌         | 68/1273 [1:09:13<20:19:18, 60.71s/it]

Answer: Transplacental passage of thyroid peroxidase antibodies
Transplacental passage of TSH receptor antibodies
Iteration 68: Accuracy so far: 60.29%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24338.32it/s]
Evaluating on test dataset:   5%|▌         | 69/1273 [1:10:14<20:22:46, 60.94s/it]

Answer: Phentolamine
Nadalol
Iteration 69: Accuracy so far: 59.42%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28992.88it/s]
Evaluating on test dataset:   5%|▌         | 70/1273 [1:11:15<20:20:16, 60.86s/it]

Answer: Observe and get follow-up imaging in 3 months
Proceed with liver biopsy
Iteration 70: Accuracy so far: 58.57%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 21219.08it/s]
Evaluating on test dataset:   6%|▌         | 71/1273 [1:12:15<20:11:28, 60.47s/it]

Answer: Spontaneous bacterial peritonitis
Spontaneous bacterial peritonitis
Iteration 71: Accuracy so far: 59.15%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 35246.25it/s]
Evaluating on test dataset:   6%|▌         | 72/1273 [1:13:15<20:07:47, 60.34s/it]

Answer: Cardiac contusion
Cardiac contusion
Iteration 72: Accuracy so far: 59.72%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33465.19it/s]
Evaluating on test dataset:   6%|▌         | 73/1273 [1:14:17<20:17:36, 60.88s/it]

Answer: Transjugular intrahepatic portosystemic shunting
Liver transplantation
Iteration 73: Accuracy so far: 58.90%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29676.68it/s]
Evaluating on test dataset:   6%|▌         | 74/1273 [1:15:19<20:22:23, 61.17s/it]

Answer: Propranolol
Methimazole
Iteration 74: Accuracy so far: 58.11%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 25523.15it/s]
Evaluating on test dataset:   6%|▌         | 75/1273 [1:16:22<20:33:41, 61.79s/it]

Answer: Skin biopsy
Potassium hydroxide preparation
Iteration 75: Accuracy so far: 57.33%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 31936.32it/s]
Evaluating on test dataset:   6%|▌         | 76/1273 [1:17:22<20:22:34, 61.28s/it]

Answer: 21
5
Iteration 76: Accuracy so far: 56.58%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 31615.36it/s]
Evaluating on test dataset:   6%|▌         | 77/1273 [1:18:23<20:18:06, 61.11s/it]

Answer: Biopsy of the mass
Biopsy of the mass
Iteration 77: Accuracy so far: 57.14%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 31223.11it/s]
Evaluating on test dataset:   6%|▌         | 78/1273 [1:19:23<20:14:30, 60.98s/it]

Answer: Deposition of calcium pyrophosphate (CPP) crystals
Deposition of calcium pyrophosphate (CPP) crystals
Iteration 78: Accuracy so far: 57.69%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24385.49it/s]
Evaluating on test dataset:   6%|▌         | 79/1273 [1:20:25<20:15:02, 61.06s/it]

Answer: Transposition of great vessels
Transposition of great vessels
Iteration 79: Accuracy so far: 58.23%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27060.03it/s]
Evaluating on test dataset:   6%|▋         | 80/1273 [1:21:28<20:29:53, 61.86s/it]

Answer: Needle thoracostomy over the 2nd intercostal space
Interrupted 2-0 polypropylene suture with supporting pledgets
Iteration 80: Accuracy so far: 57.50%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33554.43it/s]
Evaluating on test dataset:   6%|▋         | 81/1273 [1:22:33<20:48:01, 62.82s/it]

Answer: Reid Index > 50%
Reid Index > 50%
Iteration 81: Accuracy so far: 58.02%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29888.15it/s]
Evaluating on test dataset:   6%|▋         | 82/1273 [1:23:40<21:06:19, 63.79s/it]

Answer: Ras pathway transcription factors
Caspase-9
Iteration 82: Accuracy so far: 57.32%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 32263.88it/s]
Evaluating on test dataset:   7%|▋         | 83/1273 [1:24:46<21:24:05, 64.74s/it]

Answer: Acral lentiginous
Acral lentiginous
Iteration 83: Accuracy so far: 57.83%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 23130.35it/s]
Evaluating on test dataset:   7%|▋         | 84/1273 [1:25:57<21:57:46, 66.50s/it]

Answer: Jaw claudication
Jaw claudication
Iteration 84: Accuracy so far: 58.33%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 18696.75it/s]
Evaluating on test dataset:   7%|▋         | 85/1273 [1:27:03<21:54:15, 66.38s/it]

Answer: Endometrial tissue outside the uterine cavity
Endometrial tissue outside the uterine cavity
Iteration 85: Accuracy so far: 58.82%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29888.15it/s]
Evaluating on test dataset:   7%|▋         | 86/1273 [1:27:59<20:48:14, 63.10s/it]

Answer: Friable irregular masses attached to the valve
Friable irregular masses attached to the valve
Iteration 86: Accuracy so far: 59.30%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28728.11it/s]


In [None]:
total_predictions = 86  # Total number of predictions made up to iteration 86
accuracy_at_86 = 59.30  # Accuracy at iteration 86
correct_predictions = int((accuracy_at_86 / 100) * total_predictions)

for i, sample in enumerate(tqdm(test_dataset, desc="Evaluating on test dataset", total=len(test_dataset))):
    if i < 86:
        continue

    few_shot_prompt = create_few_shot_examples(train_dataset, num_examples=3)
    question = sample['question']
    options = sample['options']  # This is a dictionary with 'A', 'B', 'C', 'D' as keys
    correct_answer = sample['answer']  # Correct answer is the key ('A', 'B', 'C', or 'D')

    # Get the model's prediction using few-shot prompting
    predicted_answer = get_model_prediction(question, options, few_shot_prompt)

    print(predicted_answer)
    print(correct_answer)

    # Check if prediction is correct
    if correct_answer in predicted_answer:
        correct_predictions += 1

    total_predictions += 1

    # Calculate and print the accuracy after each iteration
    accuracy = correct_predictions / total_predictions * 100
    print(f"Iteration {i+1}: Accuracy so far: {accuracy:.2f}%")


Evaluating on test dataset:   0%|          | 0/1273 [00:00<?, ?it/s]
Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28339.89it/s]
Evaluating on test dataset:   7%|▋         | 87/1273 [01:02<14:16,  1.38it/s]

Answer: Hysteroscopy
Laparoscopy
Iteration 87: Accuracy so far: 57.47%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 19753.39it/s]
Evaluating on test dataset:   7%|▋         | 88/1273 [02:08<35:06,  1.78s/it]

Answer: High LDL-cholesterol
High LDL-cholesterol
Iteration 88: Accuracy so far: 57.95%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 31775.03it/s]
Evaluating on test dataset:   7%|▋         | 89/1273 [03:10<1:01:39,  3.12s/it]

Answer: Tension pneumothorax
Thoracic aortic rupture
Iteration 89: Accuracy so far: 57.30%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29399.33it/s]
Evaluating on test dataset:   7%|▋         | 90/1273 [04:04<1:33:17,  4.73s/it]

Answer: Propylthiouracil
Propylthiouracil
Iteration 90: Accuracy so far: 57.78%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26159.90it/s]
Evaluating on test dataset:   7%|▋         | 91/1273 [05:00<2:16:14,  6.92s/it]

Answer: Freshwater snails
Freshwater snails
Iteration 91: Accuracy so far: 58.24%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27354.16it/s]
Evaluating on test dataset:   7%|▋         | 92/1273 [06:03<3:18:40, 10.09s/it]

Answer: Normal hemoglobin in patients with tetralogy of Fallot does not rule out iron deficiency anemia.
Normal hemoglobin in patients with tetralogy of Fallot does not rule out iron deficiency anemia.
Iteration 92: Accuracy so far: 58.70%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 21883.33it/s]
Evaluating on test dataset:   7%|▋         | 93/1273 [07:09<4:42:43, 14.38s/it]

Answer: Delirium
Delirium
Iteration 93: Accuracy so far: 59.14%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:04, 2510.56it/s]
Evaluating on test dataset:   7%|▋         | 94/1273 [08:08<6:07:12, 18.69s/it]

Answer: Down syndrome
Rheumatoid arthritis
Iteration 94: Accuracy so far: 58.51%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28086.86it/s]
Evaluating on test dataset:   7%|▋         | 95/1273 [09:09<7:49:20, 23.91s/it]

Answer: D Paradoxical motion of part of the chest with breathing
A drop in systolic blood pressure of 14 mmHg during inspiration
Iteration 95: Accuracy so far: 57.89%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30247.38it/s]
Evaluating on test dataset:   8%|▊         | 96/1273 [10:06<9:26:33, 28.88s/it]

Answer: MR angiography of the brain
MR angiography of the brain
Iteration 96: Accuracy so far: 58.33%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29817.33it/s]
Evaluating on test dataset:   8%|▊         | 97/1273 [11:07<11:16:15, 34.50s/it]

Answer: X-linked recessive
Autosomal dominant
Iteration 97: Accuracy so far: 57.73%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33554.43it/s]
Evaluating on test dataset:   8%|▊         | 98/1273 [12:10<13:09:29, 40.31s/it]

Answer: Antigenic variation
Antigenic variation
Iteration 98: Accuracy so far: 58.16%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27473.61it/s]
Evaluating on test dataset:   8%|▊         | 99/1273 [13:09<14:26:33, 44.29s/it]

Answer: Cervical immobilization
Cervical immobilization
Iteration 99: Accuracy so far: 58.59%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24966.10it/s]
Evaluating on test dataset:   8%|▊         | 100/1273 [14:10<15:44:41, 48.32s/it]

Answer: Risperidone
Bromocriptine
Iteration 100: Accuracy so far: 58.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30541.05it/s]
Evaluating on test dataset:   8%|▊         | 101/1273 [15:13<16:59:13, 52.18s/it]

Answer: Anti-D antibodies
Anti-B antibodies
Iteration 101: Accuracy so far: 57.43%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29262.59it/s]
Evaluating on test dataset:   8%|▊         | 102/1273 [16:10<17:23:33, 53.47s/it]

Answer: Amantadine
Amantadine
Iteration 102: Accuracy so far: 57.84%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27294.82it/s]
Evaluating on test dataset:   8%|▊         | 103/1273 [16:59<17:00:43, 52.34s/it]

Answer: Stop magnesium sulfate and give calcium gluconate
Stop magnesium sulfate and give calcium gluconate
Iteration 103: Accuracy so far: 58.25%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26434.69it/s]
Evaluating on test dataset:   8%|▊         | 104/1273 [17:54<17:12:49, 53.01s/it]

Answer: Myxedema coma
Myxedema coma
Iteration 104: Accuracy so far: 58.65%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 32939.56it/s]
Evaluating on test dataset:   8%|▊         | 105/1273 [18:48<17:17:20, 53.29s/it]

Answer: Aortic regurgitation
Aortic regurgitation
Iteration 105: Accuracy so far: 59.05%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24013.19it/s]
Evaluating on test dataset:   8%|▊         | 106/1273 [19:38<16:57:44, 52.33s/it]

Answer: Borderline personality disorder
Borderline personality disorder
Iteration 106: Accuracy so far: 59.43%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 25679.41it/s]
Evaluating on test dataset:   8%|▊         | 107/1273 [20:35<17:24:38, 53.75s/it]

Answer: Bronchoscopy-guided biopsy now
Positive emission tomography (PET) of chest now
Iteration 107: Accuracy so far: 58.88%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30840.47it/s]
Evaluating on test dataset:   8%|▊         | 108/1273 [21:28<17:17:34, 53.44s/it]

Answer: Primary spermatocyte
Primary spermatocyte
Iteration 108: Accuracy so far: 59.26%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27838.30it/s]
Evaluating on test dataset:   9%|▊         | 109/1273 [22:26<17:44:49, 54.89s/it]

Answer: Supportive therapy and observation
Surgical pinning of the femoral head
Iteration 109: Accuracy so far: 58.72%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29676.68it/s]
Evaluating on test dataset:   9%|▊         | 110/1273 [23:28<18:23:33, 56.93s/it]

Answer: Melanosis coli
Stool leukocytes
Iteration 110: Accuracy so far: 58.18%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30030.82it/s]
Evaluating on test dataset:   9%|▊         | 111/1273 [24:28<18:42:10, 57.94s/it]

Answer: Bacterial translocation
Bacterial translocation
Iteration 111: Accuracy so far: 58.56%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 22878.02it/s]
Evaluating on test dataset:   9%|▉         | 112/1273 [25:35<19:33:21, 60.64s/it]

Answer: Racemic epinephrine and intramuscular corticosteroid therapy
Racemic epinephrine and intramuscular corticosteroid therapy
Iteration 112: Accuracy so far: 58.93%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 32099.27it/s]
Evaluating on test dataset:   9%|▉         | 113/1273 [26:35<19:24:50, 60.25s/it]

Answer: Decreased lower esophageal tone
Esophageal fibrosis
Iteration 113: Accuracy so far: 58.41%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30102.66it/s]
Evaluating on test dataset:   9%|▉         | 114/1273 [27:30<18:55:02, 58.76s/it]

Answer: IP3
JAK/STAT
Iteration 114: Accuracy so far: 57.89%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29817.33it/s]
Evaluating on test dataset:   9%|▉         | 115/1273 [28:24<18:26:36, 57.34s/it]

Answer: Diarrhea
Metformin
Iteration 115: Accuracy so far: 57.39%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24769.51it/s]
Evaluating on test dataset:   9%|▉         | 116/1273 [29:17<17:58:52, 55.95s/it]

Answer: Power stroke
Power stroke
Iteration 116: Accuracy so far: 57.76%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24818.37it/s]
Evaluating on test dataset:   9%|▉         | 117/1273 [30:07<17:27:12, 54.35s/it]

Answer: TSC1 gene on chromosome 9
TSC1 gene on chromosome 9
Iteration 117: Accuracy so far: 58.12%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28212.81it/s]
Evaluating on test dataset:   9%|▉         | 118/1273 [31:03<17:37:17, 54.92s/it]

Answer: Metformin
Metformin
Iteration 118: Accuracy so far: 58.47%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 34192.70it/s]
Evaluating on test dataset:   9%|▉         | 119/1273 [32:04<18:08:13, 56.58s/it]

Answer: Idiopathic pulmonary fibrosis
Chronic obstructive pulmonary disease
Iteration 119: Accuracy so far: 57.98%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33916.20it/s]
Evaluating on test dataset:   9%|▉         | 120/1273 [33:05<18:33:22, 57.94s/it]

Answer: Avoid exposure to birds
Avoid exposure to birds
Iteration 120: Accuracy so far: 58.33%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27473.61it/s]
Evaluating on test dataset:  10%|▉         | 121/1273 [34:06<18:52:08, 58.97s/it]

Answer: Uterine artery
Uterine artery
Iteration 121: Accuracy so far: 58.68%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29888.15it/s]
Evaluating on test dataset:  10%|▉         | 122/1273 [35:11<19:21:37, 60.55s/it]

Answer: Epstein-Barr virus
Epstein-Barr virus
Iteration 122: Accuracy so far: 59.02%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30174.85it/s]
Evaluating on test dataset:  10%|▉         | 123/1273 [36:11<19:19:45, 60.51s/it]

Answer: Zika virus
Zika virus
Iteration 123: Accuracy so far: 59.35%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 32682.89it/s]
Evaluating on test dataset:  10%|▉         | 124/1273 [37:17<19:51:00, 62.19s/it]

Answer: Mixing study
Mixing study
Iteration 124: Accuracy so far: 59.68%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27715.67it/s]
Evaluating on test dataset:  10%|▉         | 125/1273 [38:13<19:11:32, 60.19s/it]

Answer: Diarrhea
Diarrhea
Iteration 125: Accuracy so far: 60.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 33026.02it/s]
Evaluating on test dataset:  10%|▉         | 126/1273 [39:13<19:10:32, 60.19s/it]

Answer: No management indicated
Isoniazid
Iteration 126: Accuracy so far: 59.52%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27001.96it/s]
Evaluating on test dataset:  10%|▉         | 127/1273 [40:13<19:10:00, 60.21s/it]

Answer: Herniation of the uncus
Herniation of the uncus
Iteration 127: Accuracy so far: 59.84%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 31615.36it/s]
Evaluating on test dataset:  10%|█         | 128/1273 [41:12<18:58:22, 59.65s/it]

Answer: Natural killer cells
Natural killer cells
Iteration 128: Accuracy so far: 60.16%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28859.89it/s]
Evaluating on test dataset:  10%|█         | 129/1273 [42:12<19:00:21, 59.81s/it]

Answer: Dystrophin gene mutation on genetic analysis
Intrafascicular infiltration on muscle biopsy
Iteration 129: Accuracy so far: 59.69%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30916.25it/s]
Evaluating on test dataset:  10%|█         | 130/1273 [43:25<20:15:07, 63.79s/it]

Answer: Decreased acetylcholine release
Decreased acetylcholine release
Iteration 130: Accuracy so far: 60.00%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29746.84it/s]
Evaluating on test dataset:  10%|█         | 131/1273 [44:32<20:36:42, 64.98s/it]

Answer: Breast milk jaundice
Biliary atresia
Iteration 131: Accuracy so far: 59.54%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29959.31it/s]
Evaluating on test dataset:  10%|█         | 132/1273 [45:34<20:15:39, 63.93s/it]

Answer: QT prolongation
QT prolongation
Iteration 132: Accuracy so far: 59.85%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:03, 2669.26it/s]
Evaluating on test dataset:  10%|█         | 133/1273 [46:37<20:09:46, 63.67s/it]

Answer: Diverticulitis
Diverticulitis
Iteration 133: Accuracy so far: 60.15%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28793.85it/s]
Evaluating on test dataset:  11%|█         | 134/1273 [47:40<20:06:30, 63.56s/it]

Answer: Aortoiliac atherosclerosis
Aortoiliac atherosclerosis
Iteration 134: Accuracy so far: 60.45%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28403.86it/s]
Evaluating on test dataset:  11%|█         | 135/1273 [48:40<19:43:19, 62.39s/it]

Answer: Increased PTH, decreased calcium, increased phosphate, decreased calcitriol
Increased PTH, decreased calcium, increased phosphate, decreased calcitriol
Iteration 135: Accuracy so far: 60.74%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30030.82it/s]
Evaluating on test dataset:  11%|█         | 136/1273 [49:39<19:24:04, 61.43s/it]

Answer: Binds endogenous peptides that are present in the endosome
Binds endogenous peptides that have been transported by the TAP channel
Iteration 136: Accuracy so far: 60.29%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27060.03it/s]
Evaluating on test dataset:  11%|█         | 137/1273 [50:38<19:07:04, 60.59s/it]

Answer: Gastroesophageal junction incompetence
Gastroesophageal junction incompetence
Iteration 137: Accuracy so far: 60.58%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 31145.82it/s]
Evaluating on test dataset:  11%|█         | 138/1273 [51:41<19:23:18, 61.50s/it]

Answer: Treatment with radioactive iodine
Treatment with radioactive iodine
Iteration 138: Accuracy so far: 60.87%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 21290.88it/s]
Evaluating on test dataset:  11%|█         | 139/1273 [52:42<19:16:16, 61.18s/it]

Answer: Reassuring the parents and use of an enuresis alarm
Reassuring the parents and use of an enuresis alarm
Iteration 139: Accuracy so far: 61.15%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26772.15it/s]
Evaluating on test dataset:  11%|█         | 140/1273 [53:42<19:07:16, 60.76s/it]

Answer: Decreased vibratory sense in the ipsilateral arm
Decreased positional sense in the ipsilateral leg
Iteration 140: Accuracy so far: 60.71%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26490.34it/s]
Evaluating on test dataset:  11%|█         | 141/1273 [54:41<19:00:20, 60.44s/it]

Answer: Amoxicilin
Amoxicilin
Iteration 141: Accuracy so far: 60.99%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 20295.02it/s]
Evaluating on test dataset:  11%|█         | 142/1273 [55:48<19:32:40, 62.21s/it]

Answer: Inferior petrosal sinus sampling
MRI of the adrenal glands
Iteration 142: Accuracy so far: 60.56%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26105.63it/s]
Evaluating on test dataset:  11%|█         | 143/1273 [56:44<19:00:41, 60.57s/it]

Answer: Inhibition of beta-tubulin polymerization
Inhibition of beta-tubulin polymerization
Iteration 143: Accuracy so far: 60.84%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 30765.07it/s]
Evaluating on test dataset:  11%|█▏        | 144/1273 [57:42<18:40:15, 59.54s/it]

Answer: Aromatic amines
Aromatic amines
Iteration 144: Accuracy so far: 61.11%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:04, 2542.00it/s]
Evaluating on test dataset:  11%|█▏        | 145/1273 [58:37<18:17:53, 58.40s/it]

Answer: Reassurance and follow-up
Reassurance and follow-up
Iteration 145: Accuracy so far: 61.38%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 32099.27it/s]
Evaluating on test dataset:  11%|█▏        | 146/1273 [59:28<17:32:51, 56.05s/it]

Answer: Deposits of IgG and C3 at the glomerular basement membrane on immunofluoresence
Normal light microscopy findings
Iteration 146: Accuracy so far: 60.96%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 19152.07it/s]
Evaluating on test dataset:  12%|█▏        | 147/1273 [1:00:26<17:42:18, 56.61s/it]

Answer: Tumor arising from epidermal keratinocytes
Tumor arising from cutaneous T cells
Iteration 147: Accuracy so far: 60.54%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29468.18it/s]
Evaluating on test dataset:  12%|█▏        | 148/1273 [1:01:21<17:32:33, 56.14s/it]

Answer: Cognitive behavioral therapy
Cognitive behavioral therapy
Iteration 148: Accuracy so far: 60.81%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 18641.35it/s]
Evaluating on test dataset:  12%|█▏        | 149/1273 [1:02:16<17:28:32, 55.97s/it]

Answer: Vascular dementia
Frontotemporal dementia
Iteration 149: Accuracy so far: 60.40%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28793.85it/s]
Evaluating on test dataset:  12%|█▏        | 150/1273 [1:03:13<17:29:04, 56.05s/it]

Answer: Immunoglobulin- IgM
Immunoglobulin- IgM
Iteration 150: Accuracy so far: 60.67%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 29606.85it/s]
Evaluating on test dataset:  12%|█▏        | 151/1273 [1:04:08<17:24:42, 55.87s/it]

Answer: Levofloxacin
Levofloxacin
Iteration 151: Accuracy so far: 60.93%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 26324.08it/s]
Evaluating on test dataset:  12%|█▏        | 152/1273 [1:05:03<17:20:25, 55.69s/it]

Answer: Theca leutein cysts
Theca leutein cysts
Iteration 152: Accuracy so far: 61.18%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 28532.68it/s]
Evaluating on test dataset:  12%|█▏        | 153/1273 [1:05:53<16:47:53, 53.99s/it]

Answer: Homozygous mutation in the CCR5 gene
Serum antibodies against hemagglutinin
Iteration 153: Accuracy so far: 60.78%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24432.84it/s]
Evaluating on test dataset:  12%|█▏        | 154/1273 [1:06:44<16:25:29, 52.84s/it]

Answer: Previous suicide attempt
Previous suicide attempt
Iteration 154: Accuracy so far: 61.04%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 27838.30it/s]
Evaluating on test dataset:  12%|█▏        | 155/1273 [1:07:42<16:58:44, 54.67s/it]

Answer: Chickenpox
Chickenpox
Iteration 155: Accuracy so far: 61.29%



Creating few-shot examples:   0%|          | 3/10178 [00:00<00:00, 24291.34it/s]
