# Medical Model Evaluator with Benchmark Metrics

The script evaluates our medical model using the same metrics as the medical leaderboard:
- MedQA (Medical Q&A).
- MedMCQA (Medical Multiple Choice QA).
- MMLU Medical subjects (Anatomy, Clinical Knowledge).
- Together, these provide a comprehensive medical evaluation.


In [1]:
import os

from google.colab import drive

# Attach Google Drive to the Colab filesystem so checkpoints stored there
# become readable under /content/drive.
drive.mount('/content/drive')

# Sanity check: the MyDrive folder should be visible once the mount succeeded.
my_drive_visible = os.path.exists('/content/drive/MyDrive')
print("Drive mounted:", my_drive_visible)

Mounted at /content/drive
Drive mounted: True


In [2]:
import os

# Inspect the fine-tuning output directory on Drive and report which
# checkpoint sub-directories are available for evaluation.
base_path = "/content/drive/MyDrive/llama-medx-finetuned"
print(f"Base path exists: {os.path.exists(base_path)}")

if os.path.exists(base_path):
    # List the directory once and sort it, so the printed contents and the
    # checkpoint list agree and are stable between runs.
    entries = sorted(os.listdir(base_path))

    # Report the directory that is actually listed instead of a stale name.
    print(f"\nContents of {base_path}:")
    for item in entries:
        print(f"  {item}")

    # Checkpoint directories are recognised by their 'checkpoint' name prefix.
    checkpoint_dirs = [item for item in entries if item.startswith('checkpoint')]
    print(f"\nCheckpoint directories found: {checkpoint_dirs}")
else:
    print("Base directory doesn't exist!")

    # Fall back to listing the Drive root to help locate the right folder.
    parent_path = "/content/drive/MyDrive"
    if os.path.exists(parent_path):
        print("\nContents of MyDrive:")
        for item in sorted(os.listdir(parent_path)):
            print(f"  {item}")

Base path exists: True

Contents of small_model directory:
  checkpoint-100
  checkpoint-200
  checkpoint-300
  checkpoint-400
  checkpoint-500

Checkpoint directories found: ['checkpoint-100', 'checkpoint-200', 'checkpoint-300', 'checkpoint-400', 'checkpoint-500']


In [3]:
import subprocess
import sys
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Any, Tuple
import random

In [4]:
def install_packages():
    """Install the notebook's dependencies with pip, one package at a time.

    Each package is installed through ``<python> -m pip install <name> -q``
    using the current interpreter. A failed install is reported on stdout
    and does not stop the remaining packages from being attempted.
    """
    required = (
        'torch',
        'transformers',
        'accelerate',
        'bitsandbytes',
        'peft',
        'datasets',
        'scikit-learn',
        'numpy',
        'tqdm',
    )
    pip_install = [sys.executable, "-m", "pip", "install"]

    for name in required:
        try:
            subprocess.check_call(pip_install + [name, "-q"])
        except subprocess.CalledProcessError:
            print(f"Failed to install {name}")
        else:
            print(f"Installed {name}")

# Install the dependencies before the third-party imports that follow,
# so those imports can resolve on a fresh runtime.
print("Packages.")
install_packages()

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig
)
from peft import PeftModel
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

Packages.
Installed torch
Installed transformers
Installed accelerate
Installed bitsandbytes
Installed peft
Installed datasets
Installed scikit-learn
Installed numpy
Installed tqdm


In [5]:
class MedicalBenchmarkEvaluator:
    def __init__(self, checkpoint_path: str):
        self.checkpoint_path = checkpoint_path
        self.model = None
        self.tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.results = {}

# Load the trained model and its tokenizer.
    def load_model(self):
        """Load the checkpoint, choosing the loader from its on-disk layout.

        A checkpoint directory containing ``adapter_config.json`` is loaded
        as a PEFT adapter; anything else is loaded as a full model.

        Raises:
            Exception: Any loader error is printed and then re-raised.
        """
        print(f"Loading model from: {self.checkpoint_path}")
        print(f"Using device: {self.device}")

        try:
            # The presence of an adapter config file decides the loader.
            adapter_marker = os.path.join(self.checkpoint_path, "adapter_config.json")
            has_adapter = os.path.exists(adapter_marker)

            if not has_adapter:
                print("Detected full fine-tuned model.")
                self.load_full_model()
            else:
                print("Detected PEFT adapter model.")
                self.load_peft_model()

            print("Model loaded.")

        except Exception as e:
            print(f"Error loading model: {e}")
            raise

# Load a PEFT adapter on top of its base model.
    def load_peft_model(self):
        """Load the base model named in the adapter config and attach the adapter.

        The base model name is read from ``adapter_config.json`` in the
        checkpoint directory. On CUDA the base model is loaded with a 4-bit
        bitsandbytes quantization config; on CPU no quantization config is
        passed. The tokenizer comes from the base model, and its pad token
        falls back to the EOS token when none is defined.
        """
        config_file = os.path.join(self.checkpoint_path, "adapter_config.json")
        with open(config_file, 'r') as handle:
            adapter_config = json.load(handle)

        base_model_name = adapter_config.get("base_model_name_or_path", "microsoft/DialoGPT-medium")
        print(f"Base model: {base_model_name}")

        # Tokenizer of the base model; reuse EOS for padding if needed.
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer

        on_gpu = self.device == "cuda"

        # 4-bit quantization is only configured when running on CUDA.
        bnb_config = None
        if on_gpu:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )

        # NOTE(review): trust_remote_code=True executes code shipped with the
        # model repository; only use base models from trusted sources.
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=bnb_config,
            device_map="auto" if on_gpu else None,
            trust_remote_code=True
        )

        # Wrap the base model with the fine-tuned adapter weights.
        self.model = PeftModel.from_pretrained(base_model, self.checkpoint_path)
        self.model.eval()

    def load_full_model(self):
        """Load tokenizer and model weights directly from the checkpoint directory.

        The model is loaded in float16 with automatic device placement on
        CUDA, and in float32 without a device map on CPU. The pad token
        falls back to the EOS token when the tokenizer defines none.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_path)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer

        on_gpu = self.device == "cuda"
        weight_dtype = torch.float16 if on_gpu else torch.float32

        # NOTE(review): trust_remote_code=True executes code shipped with the
        # checkpoint; only load checkpoints from trusted sources.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.checkpoint_path,
            device_map="auto" if on_gpu else None,
            torch_dtype=weight_dtype,
            trust_remote_code=True
        )
        self.model.eval()

# Format input for the model.
    def format_input(self, instruction: str, input_text: str = "") -> str:
        if input_text:
            prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
        else:
            prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
        return prompt

# Generate a response from the model.
    def generate_response(self, instruction: str, input_text: str = "", max_length: int = 256, temperature: float = 0.1) -> str:
        """Generate the model's answer to an instruction.

        Args:
            instruction: The task or question for the model.
            input_text: Optional extra context placed in the prompt.
            max_length: Maximum number of tokens generated on top of the
                prompt.
            temperature: Sampling temperature. A value of 0 or below
                disables sampling and selects greedy decoding.

        Returns:
            Only the newly generated text, with surrounding whitespace
            stripped.
        """
        prompt = self.format_input(instruction, input_text)

        # Tokenize the prompt; prompts longer than 1024 tokens are truncated.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        ).to(self.device)
        prompt_token_count = inputs.input_ids.shape[1]

        do_sample = temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_length,
            "do_sample": do_sample,
            "repetition_penalty": 1.1,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
        }
        if do_sample:
            # Sampling-only parameters are passed only when sampling is on,
            # so greedy decoding is not given a temperature of 0.
            generation_kwargs.update(temperature=temperature, top_p=0.9, top_k=50)
        generation_config = GenerationConfig(**generation_kwargs)

        # Generate the response without tracking gradients.
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                generation_config=generation_config
            )

        # Drop the prompt by token count rather than by character count:
        # the decoded prompt is not guaranteed to match the original string
        # (for example after truncation), which would cut into the answer.
        generated_ids = outputs[0][prompt_token_count:]
        response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        return response.strip()

# Extracting the answer from model's responses.
    def extract_answer_choice(self, response: str) -> str:
        patterns = [
            r'\b([ABCD])\)',
            r'\(([ABCD])\)',
            r'\b([ABCD])\.',
            r'answer\s*:?\s*([ABCD])',
            r'option\s*:?\s*([ABCD])',
            r'\b([ABCD])\s*[-:]',
            r'correct\s*answer\s*:?\s*([ABCD])'
        ]

        response_upper = response.upper()

        for pattern in patterns:
            match = re.search(pattern, response_upper)
            if match:
                return match.group(1)

        # Looking for the first occurrence of A, B, C or D.
        for char in ['A', 'B', 'C', 'D']:
            if char in response_upper:
                return char

        # Default fallback.
        return 'A'

# MedQA type evaluation examples.
    def create_medqa_samples(self) -> List[Dict]:
        samples = [
            {
                "question": "A 45-year-old man presents with crushing chest pain radiating to his left arm. He is diaphoretic and nauseous. His ECG shows ST-elevation in leads II, III, and aVF. What is the most likely diagnosis?",
                "options": ["A) Anterior myocardial infarction", "B) Inferior myocardial infarction", "C) Pulmonary embolism", "D) Aortic dissection"],
                "correct": "B"
            },
            {
                "question": "A 25-year-old woman presents with polyuria, polydipsia, and weight loss. Her random glucose is 350 mg/dL. What is the most appropriate initial treatment?",
                "options": ["A) Metformin", "B) Insulin", "C) Sulfonylurea", "D) Lifestyle modification only"],
                "correct": "B"
            },
            {
                "question": "A 60-year-old smoker presents with hemoptysis and weight loss. Chest X-ray shows a mass in the right upper lobe. What is the most likely diagnosis?",
                "options": ["A) Pneumonia", "B) Tuberculosis", "C) Lung cancer", "D) Pulmonary embolism"],
                "correct": "C"
            },
            {
                "question": "A patient with a history of hypertension presents with sudden severe headache described as 'the worst headache of my life'. What is the most likely diagnosis?",
                "options": ["A) Migraine", "B) Tension headache", "C) Subarachnoid hemorrhage", "D) Cluster headache"],
                "correct": "C"
            },
            {
                "question": "A 30-year-old woman presents with fatigue, cold intolerance, and weight gain. Her TSH is elevated and T4 is low. What is the most likely diagnosis?",
                "options": ["A) Hyperthyroidism", "B) Primary hypothyroidism", "C) Secondary hypothyroidism", "D) Euthyroid sick syndrome"],
                "correct": "B"
            }
        ]
        return samples

# MedMCQA type evaluation examples.
    def create_medmcqa_samples(self) -> List[Dict]:
        samples = [
            {
                "question": "Which of the following is the first-line treatment for acute bacterial meningitis in adults?",
                "options": ["A) Ampicillin", "B) Ceftriaxone", "C) Vancomycin", "D) Dexamethasone"],
                "correct": "B"
            },
            {
                "question": "The normal range for adult respiratory rate is:",
                "options": ["A) 8-12 breaths per minute", "B) 12-20 breaths per minute", "C) 20-30 breaths per minute", "D) 30-40 breaths per minute"],
                "correct": "B"
            },
            {
                "question": "Which cranial nerve is responsible for facial sensation?",
                "options": ["A) Cranial nerve III", "B) Cranial nerve V", "C) Cranial nerve VII", "D) Cranial nerve IX"],
                "correct": "B"
            },
            {
                "question": "The most common cause of acute pancreatitis is:",
                "options": ["A) Alcohol", "B) Gallstones", "C) Trauma", "D) Medications"],
                "correct": "B"
            },
            {
                "question": "Which of the following is a characteristic of upper motor neuron lesions?",
                "options": ["A) Flaccid paralysis", "B) Muscle atrophy", "C) Hyperreflexia", "D) Fasciculations"],
                "correct": "C"
            }
        ]
        return samples

# MMLU type evaluation examples.
    def create_mmlu_anatomy_samples(self) -> List[Dict]:
        samples = [
            {
                "question": "The biceps brachii muscle is innervated by which nerve?",
                "options": ["A) Radial nerve", "B) Median nerve", "C) Ulnar nerve", "D) Musculocutaneous nerve"],
                "correct": "D"
            },
            {
                "question": "Which bone forms the prominence of the cheek?",
                "options": ["A) Maxilla", "B) Zygomatic", "C) Mandible", "D) Temporal"],
                "correct": "B"
            },
            {
                "question": "The sinoatrial node is located in which chamber of the heart?",
                "options": ["A) Left atrium", "B) Right atrium", "C) Left ventricle", "D) Right ventricle"],
                "correct": "B"
            },
            {
                "question": "Which structure separates the thoracic and abdominal cavities?",
                "options": ["A) Pericardium", "B) Pleura", "C) Diaphragm", "D) Peritoneum"],
                "correct": "C"
            },
            {
                "question": "The longest bone in the human body is:",
                "options": ["A) Tibia", "B) Femur", "C) Humerus", "D) Fibula"],
                "correct": "B"
            }
        ]
        return samples

# MMLU Clinical evaluation examples.
    def create_mmlu_clinical_samples(self) -> List[Dict]:
        samples = [
            {
                "question": "What is the normal range for systolic blood pressure in adults?",
                "options": ["A) 90-120 mmHg", "B) 120-140 mmHg", "C) 140-160 mmHg", "D) 160-180 mmHg"],
                "correct": "A"
            },
            {
                "question": "Which laboratory value is most indicative of kidney function?",
                "options": ["A) Blood urea nitrogen (BUN)", "B) Serum creatinine", "C) Glomerular filtration rate (GFR)", "D) Urinalysis"],
                "correct": "C"
            },
            {
                "question": "The Glasgow Coma Scale assesses:",
                "options": ["A) Pain level", "B) Neurological function", "C) Cardiac function", "D) Respiratory function"],
                "correct": "B"
            },
            {
                "question": "Which medication is contraindicated in patients with asthma?",
                "options": ["A) Albuterol", "B) Propranolol", "C) Prednisone", "D) Montelukast"],
                "correct": "B"
            },
            {
                "question": "The most sensitive marker for myocardial infarction is:",
                "options": ["A) CK-MB", "B) Troponin", "C) LDH", "D) AST"],
                "correct": "B"
            }
        ]
        return samples

# Evaluating the model based on its responses to the multiple-choice questions.
    def evaluate_multiple_choice(self, samples: List[Dict], dataset_name: str) -> Dict:
        print(f"\n Evaluating {dataset_name}...")

        correct = 0
        total = len(samples)
        predictions = []
        true_labels = []

        for i, sample in enumerate(tqdm(samples, desc=f"Processing {dataset_name}")):
            # Formating the question.
            question = sample["question"]
            options = "\n".join(sample["options"])
            instruction = f"{question}\n\n{options}\n\nPlease select the correct answer (A, B, C, or D)."

            # Response.
            try:
                response = self.generate_response(instruction, max_length=100, temperature=0.1)
                predicted_answer = self.extract_answer_choice(response)

                predictions.append(predicted_answer)
                true_labels.append(sample["correct"])

                if predicted_answer == sample["correct"]:
                    correct += 1

                if i < 2:
                    print(f"\n Sample {i+1}:")
                    print(f"Question: {question}")
                    print(f"Correct: {sample['correct']}")
                    print(f"Predicted: {predicted_answer}")
                    print(f"Response: {response[:100]}...")

            except Exception as e:
                print(f"Error processing the sample {i}: {e}")
                predictions.append("A")
                true_labels.append(sample["correct"])

        accuracy = (correct / total) * 100

        # Additional metrics.
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels, predictions, average='weighted', zero_division=0
        )

        results = {
            "accuracy": accuracy,
            "correct": correct,
            "total": total,
            "precision": precision * 100,
            "recall": recall * 100,
            "f1": f1 * 100
        }

        print(f" {dataset_name} Results:")
        print(f"   Accuracy: {accuracy:.2f}% ({correct}/{total})")
        print(f"   Precision: {precision*100:.2f}%")
        print(f"   Recall: {recall*100:.2f}%")
        print(f"   F1-Score: {f1*100:.2f}%")

        return results

# Evaluating the RADQA type questions.
    def evaluate_radqa_style(self) -> Dict:
        print(f"\n Evaluating the RADQA performance.")

        samples = [
            {
                "context": "CHEST X-RAY: The heart size is normal. Bilateral lower lobe opacities are present. No pleural effusion. No pneumothorax.",
                "question": "Are there signs of pneumonia?",
                "expected_keywords": ["opacities", "pneumonia", "bilateral", "lower lobe"]
            },
            {
                "context": "CT ABDOMEN: The liver shows multiple hypodense lesions. The largest measures 3.2 cm in segment VII. No bile duct dilatation.",
                "question": "What abnormalities are seen in the liver?",
                "expected_keywords": ["hypodense", "lesions", "multiple", "liver"]
            },
            {
                "context": "MRI BRAIN: There is a 2.1 cm enhancing mass in the right frontal lobe with surrounding edema. Mass effect is present.",
                "question": "Describe the brain findings.",
                "expected_keywords": ["mass", "enhancing", "frontal", "edema", "mass effect"]
            }
        ]

        scores = []

        for i, sample in enumerate(samples):
            context = sample["context"]
            question = sample["question"]
            expected_keywords = sample["expected_keywords"]

            instruction = f"Context: {context}\n\nQuestion: {question}\n\nPlease provide a comprehensive answer based on the radiology report."

            try:
                response = self.generate_response(instruction, max_length=200)

                # Score based on the keyword coverage.
                response_lower = response.lower()
                keyword_score = sum(1 for keyword in expected_keywords if keyword.lower() in response_lower)
                total_keywords = len(expected_keywords)
                score = (keyword_score / total_keywords) * 100

                scores.append(score)

                print(f"\n RADQA Sample {i+1}:")
                print(f"Question: {question}")
                print(f"Keywords found: {keyword_score}/{total_keywords}")
                print(f"Score: {score:.1f}%")
                print(f"Response: {response[:150]}...")

            except Exception as e:
                print(f"Error processing RADQA sample {i}: {e}.")
                scores.append(0)

        average_score = np.mean(scores) if scores else 0

        results = {
            "radqa_score": average_score,
            "individual_scores": scores
        }

        print(f"RADQA type results:")
        print(f"   Average Score: {average_score:.2f}%")

        return results

# Run every evaluation and collect all results.
    def run_comprehensive_evaluation(self) -> Dict:
        """Run all evaluations and collect their results in one dict.

        Returns:
            A dict keyed by ``MedQA``, ``MedMCQA``, ``MMLU_Anatomy``,
            ``MMLU_Clinical`` and ``RADQA``, each holding that evaluation's
            result dict, plus ``Overall_Average``: the mean of the four
            multiple-choice accuracies and the RADQA keyword score.
        """
        print("Medical Evaluation")
        print("=" * 60)

        # (result key, display label, samples) for each multiple-choice set.
        multiple_choice_sets = [
            ("MedQA", "MedQA", self.create_medqa_samples()),
            ("MedMCQA", "MedMCQA", self.create_medmcqa_samples()),
            ("MMLU_Anatomy", "MMLU Anatomy", self.create_mmlu_anatomy_samples()),
            ("MMLU_Clinical", "MMLU Clinical Knowledge", self.create_mmlu_clinical_samples()),
        ]

        all_results = {}
        for result_key, label, question_set in multiple_choice_sets:
            all_results[result_key] = self.evaluate_multiple_choice(question_set, label)
        all_results["RADQA"] = self.evaluate_radqa_style()

        # Overall average over the four accuracies and the RADQA score.
        headline_scores = [all_results[result_key]["accuracy"] for result_key, _, _ in multiple_choice_sets]
        headline_scores.append(all_results["RADQA"]["radqa_score"])
        all_results["Overall_Average"] = np.mean(headline_scores)

        return all_results

    def print_leaderboard_format(self, results: Dict):
        print("\n" + "=" * 80)
        print("Medical model evaluation results")
        print("=" * 80)
        print(f"{'Model':<40} {'Average':<10} {'MedQA':<10} {'MedMCQA':<10} {'MMLU Anatomy':<15} {'MMLU Clinical':<15}")
        print("-" * 80)

        model_name = f"Inference analytics model ({os.path.basename(self.checkpoint_path)})"
        average = results["Overall_Average"]
        medqa = results["MedQA"]["accuracy"]
        medmcqa = results["MedMCQA"]["accuracy"]
        anatomy = results["MMLU_Anatomy"]["accuracy"]
        clinical = results["MMLU_Clinical"]["accuracy"]

        print(f"{model_name:<40} {average:<10.2f} {medqa:<10.2f} {medmcqa:<10.2f} {anatomy:<15.2f} {clinical:<15.2f}")

        # Comparison with leaderboard leaders
        print("\n Comparison of the top models:")
        print(f"ProbeMedicalYonseiMAILab/medllama3-v20: 90.01 avg")
        print(f"aaditya/OpenBioLLMLlama-70B: 86.06 avg")
        print(f"Med-PaLM 2: 84.09 avg")
        print(f"Inference analytics model: {average:.2f} avg")

        if average >= 85:
            print("\n The model performs at state-of-the-art level!")
        elif average >= 75:
            print("\n The model shows strong medical performance.")
        elif average >= 65:
            print("\n The model demonstrates solid medical knowledge.")
        else:
            print("\n Room for improvement. Consider more training data or longer training.")

        # Additional insights
        print(f"\n Insights:")
        print(f"    Strongest area: {max(results, key=lambda k: results[k]['accuracy'] if 'accuracy' in results[k] else 0)}")
        print(f"    RADQA Performance: {results['RADQA']['radqa_score']:.2f}% (radiology analysis)")

        return results

def find_checkpoint():
    """Return the first existing ``checkpoint-500`` location, or None.

    A fixed list of candidate paths (Drive and local) is checked in order;
    the first one that exists is announced on stdout and returned.
    """
    candidates = (
        "/content/drive/My Drive/llama-medx-finetuned/checkpoint-500",
        "/content/drive/MyDrive/llama-medx-finetuned/checkpoint-500",
        "./checkpoint-500",
        "/content/checkpoint-500",
        "/content/drive/MyDrive/checkpoint-500",
        "/content/drive/My Drive/checkpoint-500",
    )

    found = next((candidate for candidate in candidates if os.path.exists(candidate)), None)
    if found is not None:
        print(f"Found checkpoint at: {found}")
    return found

def main():
    """Locate a checkpoint, evaluate it, and save the results as JSON.

    When no checkpoint is found at the known locations, the path is read
    from standard input. Results are printed in leaderboard format, written
    to ``medical_evaluation_results.json`` and, when running in Colab,
    offered as a download. Errors during loading or evaluation are printed
    with a traceback instead of being raised.
    """
    print("Medical Model Benchmark Evaluator")
    print("=" * 50)

    # Finding the checkpoint.
    checkpoint_path = find_checkpoint()

    if not checkpoint_path:
        print("Checkpoint not found.")
        checkpoint_path = input("Enter path to the model's checkpoint: ").strip()

        if not os.path.exists(checkpoint_path):
            print(f"Path not found: {checkpoint_path}")
            return

    # Evaluator.
    evaluator = MedicalBenchmarkEvaluator(checkpoint_path)

    try:
        # Loading the model.
        evaluator.load_model()

        # Running the evaluation.
        results = evaluator.run_comprehensive_evaluation()

        # Results in the leaderboard format.
        evaluator.print_leaderboard_format(results)

        results_file = "medical_evaluation_results.json"
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2)

        print(f"\n Results saved to: {results_file}")

        # Download in Colab. The download is best-effort, but only ordinary
        # errors are swallowed: a bare `except:` would also hide
        # KeyboardInterrupt and SystemExit.
        try:
            from google.colab import files
            files.download(results_file)
            print("Results file downloaded.")
        except Exception:
            print(f"Results available at: {results_file}")

    except Exception as e:
        print(f"Error during evaluation: {e}.")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Medical Model Benchmark Evaluator
Found checkpoint at: /content/drive/My Drive/llama-medx-finetuned/checkpoint-500
Loading model from: /content/drive/My Drive/llama-medx-finetuned/checkpoint-500
Using device: cuda
Detected PEFT adapter model.
Base model: skumar9/Llama-medx_v3.2


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/419 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

Model loaded.
Medical Evaluation

 Evaluating MedQA...


Processing MedQA:  20%|██        | 1/5 [00:17<01:10, 17.61s/it]


 Sample 1:
Question: A 45-year-old man presents with crushing chest pain radiating to his left arm. He is diaphoretic and nauseous. His ECG shows ST-elevation in leads II, III, and aVF. What is the most likely diagnosis?
Correct: B
Predicted: B
Response: B) Inferior myocardial infarction

The patient's symptoms of crushing chest pain radiating to the le...


Processing MedQA:  40%|████      | 2/5 [00:34<00:50, 16.90s/it]


 Sample 2:
Question: A 25-year-old woman presents with polyuria, polydipsia, and weight loss. Her random glucose is 350 mg/dL. What is the most appropriate initial treatment?
Correct: B
Predicted: A
Response: B Insulin://www.ncbi.nlm.nih.gov/pubmed/2141239 "Insulin therapy in diabetic ketoacidosis." [PMID: 2...


Processing MedQA: 100%|██████████| 5/5 [01:22<00:00, 16.57s/it]


 MedQA Results:
   Accuracy: 80.00% (4/5)
   Precision: 100.00%
   Recall: 80.00%
   F1-Score: 88.00%

 Evaluating MedMCQA...


Processing MedMCQA:  20%|██        | 1/5 [00:16<01:05, 16.42s/it]


 Sample 1:
Question: Which of the following is the first-line treatment for acute bacterial meningitis in adults?
Correct: B
Predicted: A
Response: A) Ampicillin. The first-line treatment for acute bacterial meningitis in adults is ampicillin. This...


Processing MedMCQA:  40%|████      | 2/5 [00:32<00:49, 16.33s/it]


 Sample 2:
Question: The normal range for adult respiratory rate is:
Correct: B
Predicted: A
Response: A) 8-12 breaths per minute. The normal range for an adult's respiratory rate is typically between 8 ...


Processing MedMCQA: 100%|██████████| 5/5 [01:10<00:00, 14.01s/it]


 MedMCQA Results:
   Accuracy: 40.00% (2/5)
   Precision: 100.00%
   Recall: 40.00%
   F1-Score: 52.00%

 Evaluating MMLU Anatomy...


Processing MMLU Anatomy:  20%|██        | 1/5 [00:15<01:02, 15.74s/it]


 Sample 1:
Question: The biceps brachii muscle is innervated by which nerve?
Correct: D
Predicted: A
Response: D Musculocutaneous nerve. The biceps brachii muscle is innervated by the musculocutaneous nerve. Thi...


Processing MMLU Anatomy:  40%|████      | 2/5 [00:31<00:47, 15.69s/it]


 Sample 2:
Question: Which bone forms the prominence of the cheek?
Correct: B
Predicted: B
Response: B) Zygomatic - The zygomatic bone is responsible for forming the prominence of the cheek. It is a pa...


Processing MMLU Anatomy: 100%|██████████| 5/5 [01:16<00:00, 15.28s/it]


 MMLU Anatomy Results:
   Accuracy: 80.00% (4/5)
   Precision: 80.00%
   Recall: 80.00%
   F1-Score: 80.00%

 Evaluating MMLU Clinical Knowledge...


Processing MMLU Clinical Knowledge:  20%|██        | 1/5 [00:16<01:04, 16.13s/it]


 Sample 1:
Question: What is the normal range for systolic blood pressure in adults?
Correct: A
Predicted: A
Response: The normal range for systolic blood pressure in adults is A) 90-120 mmHg.://www.mayoclinic.com/healt...


Processing MMLU Clinical Knowledge:  40%|████      | 2/5 [00:22<00:30, 10.25s/it]


 Sample 2:
Question: Which laboratory value is most indicative of kidney function?
Correct: C
Predicted: A
Response: A) Blood urea nitrogen (BUN)://www.ncbi.nlm.nih.gov/books/NBK2795/://www.ncbi.nlm.nih.gov/books/NBK2...


Processing MMLU Clinical Knowledge: 100%|██████████| 5/5 [01:01<00:00, 12.26s/it]


 MMLU Clinical Knowledge Results:
   Accuracy: 60.00% (3/5)
   Precision: 66.67%
   Recall: 60.00%
   F1-Score: 58.00%

 Evaluating the RADQA performance.

 RADQA Sample 1:
Question: Are there signs of pneumonia?
Keywords found: 4/4
Score: 100.0%
Response: Based on the radiology report: Bilateral lower lobe opacities are present. No pleural effusion. No pneumothorax. This finding suggests that there may ...

 RADQA Sample 2:
Question: What abnormalities are seen in the liver?
Keywords found: 4/4
Score: 100.0%
Response: Multiple hypodense lesions are seen in the liver, with the largest measuring 3.2 cm in segment VII. There is no evidence of bile duct dilatation.://ww...

 RADQA Sample 3:
Question: Describe the brain findings.
Keywords found: 5/5
Score: 100.0%
Response: The brain findings are as follows: there is a 2.1 cm enhancing mass in the right frontal lobe, which is causing surrounding edema and mass effect. The...
RADQA type results:
   Average Score: 100.00%

Medical model evalu

Traceback (most recent call last):
  File "/tmp/ipython-input-3166480509.py", line 518, in main
    evaluator.print_leaderboard_format(results)
  File "/tmp/ipython-input-3166480509.py", line 470, in print_leaderboard_format
    print(f"    Strongest area: {max(results, key=lambda k: results[k]['accuracy'] if 'accuracy' in results[k] else 0)}")
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-3166480509.py", line 470, in <lambda>
    print(f"    Strongest area: {max(results, key=lambda k: results[k]['accuracy'] if 'accuracy' in results[k] else 0)}")
                                                                                      ^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: argument of type 'numpy.float64' is not iterable
