In [5]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re
import string
from collections import Counter

# --- Configuration ---
# NOTE: Qwen2.5-7B-Instruct requires significant VRAM (>= 16GB) to run.
# Consider using quantization (load_in_4bit=True) or a smaller model if resources are limited.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
DATASET_NAME = "hotpot_qa"
SUBSET_NAME = "fullwiki"
SPLIT = "validation" # Use the validation set for testing
NUM_SAMPLES = 100    # Set a small number for quick testing (e.g., 100). Increase for full evaluation.

def load_model_and_tokenizer(model_id):
    """Loads the model and tokenizer, optimizing for available hardware."""
    print(f"Loading model: {model_id}...")

    # Use GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Setup for model loading (using bfloat16 for better performance on modern GPUs)
    kwargs = {
        "torch_dtype": torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8 else torch.float16,
        "device_map": "auto",
        "low_cpu_mem_usage": True
    }

    # Try using 4-bit quantization if GPU is available to save VRAM
    if device == "cuda" and torch.cuda.get_device_properties(0).total_memory / (1024**3) < 24:
        print("Warning: Low VRAM detected. Attempting to load with 4-bit quantization.")
        try:
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(load_in_4bit=True)
            kwargs["quantization_config"] = bnb_config
        except ImportError:
            print("Install 'bitsandbytes' for 4-bit quantization to work on low VRAM.")

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
        model.eval()
        print(f"Model loaded successfully on device: {device}")
        return tokenizer, model, device
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Please ensure you have sufficient hardware (GPU/RAM) and required libraries installed.")
        return None, None, None

def normalize_answer(s):
    """Lowercases, removes punctuation and articles (a, an, the), and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        # Create a translation table to remove punctuation
        translator = str.maketrans('', '', string.punctuation)
        return text.translate(translator)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    """Returns a list of tokens after normalization."""
    if not s:
        return []
    return normalize_answer(s).split()

def compute_f1(a_gold, a_pred):
    """
    Computes F1 score based on token overlap.
    NOTE: Returns a single float value for the F1 score.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    # Handle edge case where one or both answers are empty
    if not gold_toks or not pred_toks:
        # F1 is 1.0 if both are empty (perfect match), 0.0 otherwise.
        return float(gold_toks == pred_toks)

    common = 0
    pred_counter = Counter(pred_toks)

    for gold_tok in gold_toks:
        if pred_counter[gold_tok] > 0:
            common += 1
            pred_counter[gold_tok] -= 1

    prec = common / len(pred_toks)
    rec = common / len(gold_toks)

    f1 = (2 * prec * rec) / (prec + rec) if (prec + rec) > 0 else 0
    return f1

def compute_metrics(gold_answers, pred_answers):
    """Calculates Exact Match (EM) and F1 Score for a list of answers."""
    em_total = 0
    f1_total = 0

    for gold, pred in zip(gold_answers, pred_answers):
        # HotpotQA has multiple possible gold answers; use the best one
        # For simplicity, we just use the first gold answer text.
        gold_text = gold[0]
        pred_text = pred

        # Exact Match Check
        norm_gold = normalize_answer(gold_text)
        norm_pred = normalize_answer(pred_text)
        em = 1.0 if norm_gold == norm_pred else 0.0

        # F1 Check
        f1 = compute_f1(gold_text, pred_text)

        em_total += em
        f1_total += f1

    num_samples = len(gold_answers)
    return {
        "EM": (em_total / num_samples) * 100,
        "F1": (f1_total / num_samples) * 100
    }


def main():
    """Main function to run the HotpotQA evaluation."""
    tokenizer, model, device = load_model_and_tokenizer(MODEL_ID)
    if model is None:
        return

    # 1. Load Dataset
    print(f"\nLoading HotpotQA dataset (split: {SPLIT}, samples: {NUM_SAMPLES})...")
    try:
        dataset = load_dataset(DATASET_NAME, SUBSET_NAME, split=SPLIT)
        # Filter for the first N samples
        dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))
        print(f"Dataset loaded with {len(dataset)} samples.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    gold_answers = []
    pred_answers = []

    print("\nStarting closed-book evaluation...")

    for i, example in tqdm(enumerate(dataset), total=len(dataset)):
        # HotpotQA question format
        question = example['question']

        # The gold answer is a list of possible answers
        # HotpotQA stores the answer as a string or list of strings.
        # We ensure it's a list for consistency with multiple possible answers.
        gold_answer_list = example['answer'] if isinstance(example['answer'], list) else [example['answer']]
        gold_answers.append(gold_answer_list)

        # --- Closed-Book Prompting ---
        # The prompt only contains the question, not the context,
        # forcing the model to rely on its internal knowledge.
        prompt = f"""You are an expert at giving concise answers. Do not give any explanations, only a short answer. For example:
        Question: Which magazine was started first Arthur's Magazine or First for Women?
        Answer: Arthur's Magazine

        Question: Is Children's National Medical Center or MedStar Washington Hospital Center the largest private hospital in Washington, D.C.?
        Answer: MedStar Washington Hospital Center

        Now answer the question:

        Question: {question}
        Answer: """

        # Qwen-specific instruction format (if using the instruct version)
        messages = [
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=50, # Sufficient for a concise QA answer
                do_sample=False,
                num_beams=1,
                pad_token_id=tokenizer.eos_token_id # Important for generation
            )

        # Decode the generated text, skipping the input prompt
        generated_text = tokenizer.decode(output[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()

        pred_answers.append(generated_text)

        if i % (NUM_SAMPLES // 5 or 1) == 0 and i > 0:
            print(f"\n--- Sample {i+1}/{len(dataset)} ---")
            print(f"Q: {question}")
            # Print the first gold answer for readability
            print(f"Gold: {gold_answer_list[0]}")
            print(f"Pred: {generated_text}")

    # 3. Compute Metrics
    print("\n--- Evaluation Complete ---")
    metrics = compute_metrics(gold_answers, pred_answers)

    print(f"Results for {MODEL_ID} on HotpotQA ({len(dataset)} samples):")
    print(f"  Exact Match (EM): {metrics['EM']:.2f}%")
    print(f"  F1 Score (F1): {metrics['F1']:.2f}%")


In [6]:
main()

Loading model: Qwen/Qwen2.5-7B-Instruct...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully on device: cuda

Loading HotpotQA dataset (split: validation, samples: 100)...
Dataset loaded with 100 samples.

Starting closed-book evaluation...


 21%|██        | 21/100 [00:23<02:19,  1.77s/it]


--- Sample 21/100 ---
Q: Which other Mexican Formula One race car driver has held the podium besides the Force India driver born in 1990?
Gold: Pedro Rodríguez
Pred: Answer: As of the latest information available, no other Mexican Formula One race car driver has consistently held the podium besides Sergio Perez, who was born in 1990 and drives for Force India (now known as Aston Martin). However, this


 41%|████      | 41/100 [00:39<00:46,  1.26it/s]


--- Sample 41/100 ---
Q: Which dog's ancestors include Gordon and Irish Setters: the Manchester Terrier or the Scotch Collie?
Gold: Scotch Collie
Pred: Scotch Collie


 61%|██████    | 61/100 [01:04<01:00,  1.55s/it]


--- Sample 61/100 ---
Q: What distinction is held by the former NBA player who was a member of the Charlotte Hornets during their 1992-93 season and was head coach for the WNBA team Charlotte Sting?
Gold: shortest player ever to play in the National Basketball Association
Pred: Former NBA player Larry Brown holds that distinction.


 81%|████████  | 81/100 [01:28<00:16,  1.14it/s]


--- Sample 81/100 ---
Q: What is the county seat of the county where East Lempster, New Hampshire is located?
Gold: Newport
Pred: Lempster


100%|██████████| 100/100 [01:50<00:00,  1.10s/it]


--- Evaluation Complete ---
Results for Qwen/Qwen2.5-7B-Instruct on HotpotQA (100 samples):
  Exact Match (EM): 8.00%
  F1 Score (F1): 17.95%





In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re
import string
from collections import Counter

# --- Configuration ---
# NOTE: Qwen2.5-7B-Instruct requires significant VRAM (>= 16GB) to run.
# Consider using quantization (load_in_4bit=True) or a smaller model if resources are limited.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
DATASET_NAME = "hotpot_qa"
SUBSET_NAME = "fullwiki"
SPLIT = "validation" # Use the validation set for testing
NUM_SAMPLES = 100    # Set a small number for quick testing (e.g., 100). Increase for full evaluation.

def load_model_and_tokenizer(model_id):
    """Loads the model and tokenizer, optimizing for available hardware."""
    print(f"Loading model: {model_id}...")

    # Use GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Setup for model loading (using bfloat16 for better performance on modern GPUs)
    kwargs = {
        "torch_dtype": torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8 else torch.float16,
        "device_map": "auto",
        "low_cpu_mem_usage": True
    }

    # Try using 4-bit quantization if GPU is available to save VRAM
    if device == "cuda" and torch.cuda.get_device_properties(0).total_memory / (1024**3) < 24:
        print("Warning: Low VRAM detected. Attempting to load with 4-bit quantization.")
        try:
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(load_in_4bit=True)
            kwargs["quantization_config"] = bnb_config
        except ImportError:
            print("Install 'bitsandbytes' for 4-bit quantization to work on low VRAM.")

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
        model.eval()
        print(f"Model loaded successfully on device: {device}")
        return tokenizer, model, device
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Please ensure you have sufficient hardware (GPU/RAM) and required libraries installed.")
        return None, None, None

def normalize_answer(s):
    """Lowercases, removes punctuation and articles (a, an, the), and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        # Create a translation table to remove punctuation
        translator = str.maketrans('', '', string.punctuation)
        return text.translate(translator)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    """Returns a list of tokens after normalization."""
    if not s:
        return []
    return normalize_answer(s).split()

def compute_f1(a_gold, a_pred):
    """
    Computes F1 score based on token overlap.
    NOTE: Returns a single float value for the F1 score.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    # Handle edge case where one or both answers are empty
    if not gold_toks or not pred_toks:
        # F1 is 1.0 if both are empty (perfect match), 0.0 otherwise.
        return float(gold_toks == pred_toks)

    common = 0
    pred_counter = Counter(pred_toks)

    for gold_tok in gold_toks:
        if pred_counter[gold_tok] > 0:
            common += 1
            pred_counter[gold_tok] -= 1

    prec = common / len(pred_toks)
    rec = common / len(gold_toks)

    f1 = (2 * prec * rec) / (prec + rec) if (prec + rec) > 0 else 0
    return f1

def compute_metrics(gold_answers, pred_answers):
    """Calculates Exact Match (EM) and F1 Score for a list of answers."""
    em_total = 0
    f1_total = 0

    for gold, pred in zip(gold_answers, pred_answers):
        # HotpotQA has multiple possible gold answers; use the best one
        # For simplicity, we just use the first gold answer text.
        gold_text = gold[0]
        pred_text = pred

        # Exact Match Check
        norm_gold = normalize_answer(gold_text)
        norm_pred = normalize_answer(pred_text)
        em = 1.0 if norm_gold == norm_pred else 0.0

        # F1 Check
        f1 = compute_f1(gold_text, pred_text)

        em_total += em
        f1_total += f1

    num_samples = len(gold_answers)
    return {
        "EM": (em_total / num_samples) * 100,
        "F1": (f1_total / num_samples) * 100
    }


def main():
    """Main function to run the HotpotQA evaluation."""
    tokenizer, model, device = load_model_and_tokenizer(MODEL_ID)
    if model is None:
        return

    # 1. Load Dataset
    print(f"\nLoading HotpotQA dataset (split: {SPLIT}, samples: {NUM_SAMPLES})...")
    try:
        dataset = load_dataset(DATASET_NAME, SUBSET_NAME, split=SPLIT)
        # Filter for the first N samples
        dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))
        print(f"Dataset loaded with {len(dataset)} samples.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    gold_answers = []
    pred_answers = []

    print("\nStarting closed-book evaluation...")

    for i, example in tqdm(enumerate(dataset), total=len(dataset)):
        # HotpotQA question format
        question = example['question']

        # The gold answer is a list of possible answers
        # HotpotQA stores the answer as a string or list of strings.
        # We ensure it's a list for consistency with multiple possible answers.
        gold_answer_list = example['answer'] if isinstance(example['answer'], list) else [example['answer']]
        gold_answers.append(gold_answer_list)

        # --- Closed-Book Prompting ---
        # The prompt only contains the question, not the context,
        # forcing the model to rely on its internal knowledge.
        prompt = f"""You are an expert at giving concise answers. Do not give any explanations, only a short answer. For example:
        Question: Which magazine was started first Arthur's Magazine or First for Women?
        Answer: Arthur's Magazine

        Question: Is Children's National Medical Center or MedStar Washington Hospital Center the largest private hospital in Washington, D.C.?
        Answer: MedStar Washington Hospital Center

        Now answer the question:

        Question: {question}
        Answer: """

        # Qwen-specific instruction format (if using the instruct version)
        messages = [
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=50, # Sufficient for a concise QA answer
                do_sample=False,
                num_beams=1,
                pad_token_id=tokenizer.eos_token_id # Important for generation
            )

        # Decode the generated text, skipping the input prompt
        generated_text = tokenizer.decode(output[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()

        pred_answers.append(generated_text)

        if i % (NUM_SAMPLES // 5 or 1) == 0 and i > 0:
            print(f"\n--- Sample {i+1}/{len(dataset)} ---")
            print(f"Q: {question}")
            # Print the first gold answer for readability
            print(f"Gold: {gold_answer_list[0]}")
            print(f"Pred: {generated_text}")

    # 3. Compute Metrics
    print("\n--- Evaluation Complete ---")
    metrics = compute_metrics(gold_answers, pred_answers)

    print(f"Results for {MODEL_ID} on HotpotQA ({len(dataset)} samples):")
    print(f"  Exact Match (EM): {metrics['EM']:.2f}%")
    print(f"  F1 Score (F1): {metrics['F1']:.2f}%")


In [2]:
main()

Loading model: Qwen/Qwen2.5-7B-Instruct...


`torch_dtype` is deprecated! Use `dtype` instead!


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully on device: cuda

Loading HotpotQA dataset (split: validation, samples: 100)...
Dataset loaded with 100 samples.

Starting closed-book evaluation...


  0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 21%|██        | 21/100 [00:18<01:04,  1.22it/s]


--- Sample 21/100 ---
Q: Which other Mexican Formula One race car driver has held the podium besides the Force India driver born in 1990?
Gold: Pedro Rodríguez
Pred: Stevens Davies


 41%|████      | 41/100 [00:31<00:40,  1.46it/s]


--- Sample 41/100 ---
Q: Which dog's ancestors include Gordon and Irish Setters: the Manchester Terrier or the Scotch Collie?
Gold: Scotch Collie
Pred: Scotch Collie


 61%|██████    | 61/100 [00:46<00:31,  1.26it/s]


--- Sample 61/100 ---
Q: What distinction is held by the former NBA player who was a member of the Charlotte Hornets during their 1992-93 season and was head coach for the WNBA team Charlotte Sting?
Gold: shortest player ever to play in the National Basketball Association
Pred: first head coach of the Charlotte Sting


 81%|████████  | 81/100 [01:02<00:13,  1.45it/s]


--- Sample 81/100 ---
Q: What is the county seat of the county where East Lempster, New Hampshire is located?
Gold: Newport
Pred: Lempster


100%|██████████| 100/100 [01:16<00:00,  1.31it/s]


--- Evaluation Complete ---
Results for Qwen/Qwen2.5-7B-Instruct on HotpotQA (100 samples):
  Exact Match (EM): 18.00%
  F1 Score (F1): 29.46%





In [2]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re
import string
from collections import Counter

# --- Configuration ---
# NOTE: Qwen2.5-7B-Instruct requires significant VRAM (>= 16GB) to run.
# Consider using quantization (load_in_4bit=True) or a smaller model if resources are limited.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
DATASET_NAME = "hotpot_qa"
SUBSET_NAME = "fullwiki"
SPLIT = "validation" # Use the validation set for testing
NUM_SAMPLES = 100    # Set a small number for quick testing (e.g., 100). Increase for full evaluation.

def load_model_and_tokenizer(model_id):
    """Loads the model and tokenizer, optimizing for available hardware."""
    print(f"Loading model: {model_id}...")

    # Use GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Setup for model loading (using bfloat16 for better performance on modern GPUs)
    kwargs = {
        "torch_dtype": torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8 else torch.float16,
        "device_map": "auto",
        "low_cpu_mem_usage": True
    }

    # Try using 4-bit quantization if GPU is available to save VRAM
    if device == "cuda" and torch.cuda.get_device_properties(0).total_memory / (1024**3) < 24:
        print("Warning: Low VRAM detected. Attempting to load with 4-bit quantization.")
        try:
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(load_in_4bit=True)
            kwargs["quantization_config"] = bnb_config
        except ImportError:
            print("Install 'bitsandbytes' for 4-bit quantization to work on low VRAM.")

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
        model.eval()
        print(f"Model loaded successfully on device: {device}")
        return tokenizer, model, device
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Please ensure you have sufficient hardware (GPU/RAM) and required libraries installed.")
        return None, None, None

def normalize_answer(s):
    """
    Official HotpotQA normalization for answers.
    Lowercases, removes punctuation, articles (a, an, the), and extra whitespace.
    """
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score(prediction, ground_truth):
    """Computes F1 score based on token overlap (official HotpotQA version)."""
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    ZERO_METRIC = (0.0, 0.0, 0.0) # F1, Precision, Recall

    # Handling 'yes', 'no', 'noanswer' as non-overlapping tokenization causes issues.
    if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC
    if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()

    # Token matching using Counter
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return ZERO_METRIC

    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1, precision, recall

def exact_match_score(prediction, ground_truth):
    """Computes Exact Match score (official HotpotQA version)."""
    return (normalize_answer(prediction) == normalize_answer(ground_truth))

def compute_metrics(gold_answers, pred_answers):
    """Calculates Exact Match (EM), F1 Score, Precision, and Recall for a list of answers."""
    em_total = 0.0
    f1_total = 0.0
    prec_total = 0.0
    recall_total = 0.0

    for gold_list, pred in zip(gold_answers, pred_answers):
        # HotpotQA official metric uses the MAX score across all gold answers
        best_em = 0.0
        best_f1 = 0.0
        best_prec = 0.0
        best_recall = 0.0

        for gold_text in gold_list:
            # 1. EM Score
            em = exact_match_score(pred, gold_text)
            best_em = max(best_em, float(em))

            # 2. F1 Score (Returns F1, Precision, Recall tuple)
            f1, prec, recall = f1_score(pred, gold_text)

            if f1 > best_f1:
                best_f1 = f1
                best_prec = prec
                best_recall = recall

        em_total += best_em
        f1_total += best_f1
        prec_total += best_prec
        recall_total += best_recall

    num_samples = len(gold_answers)
    return {
        "EM": (em_total / num_samples) * 100,
        "F1": (f1_total / num_samples) * 100,
        "Precision": (prec_total / num_samples) * 100,
        "Recall": (recall_total / num_samples) * 100,
    }

# NOTE: The previous compute_f1 function is removed as it's replaced by the official f1_score.
# NOTE: All supporting fact (sp) and joint metrics logic is removed as this is a closed-book test.


def main():
    """Main function to run the HotpotQA evaluation."""
    tokenizer, model, device = load_model_and_tokenizer(MODEL_ID)
    if model is None:
        return

    # 1. Load Dataset
    print(f"\nLoading HotpotQA dataset (split: {SPLIT}, samples: {NUM_SAMPLES})...")
    try:
        dataset = load_dataset(DATASET_NAME, SUBSET_NAME, split=SPLIT)
        # Filter for the first N samples
        dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))
        print(f"Dataset loaded with {len(dataset)} samples.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    gold_answers = []
    pred_answers = []

    print("\nStarting closed-book evaluation...")

    for i, example in tqdm(enumerate(dataset), total=len(dataset)):
        # HotpotQA question format
        question = example['question']

        # The gold answer is a list of possible answers
        # HotpotQA stores the answer as a string or list of strings.
        # We ensure it's a list for consistency with multiple possible answers.
        gold_answer_list = example['answer'] if isinstance(example['answer'], list) else [example['answer']]
        gold_answers.append(gold_answer_list)

        # --- Closed-Book Prompting ---
        # The prompt only contains the question, not the context,
        # forcing the model to rely on its internal knowledge.
        # prompt = f"Answer the following question concisely:\nQuestion: {question}\nAnswer:"
        prompt = f"""You are an expert at giving concise answers. Do not give any explanations, only a short answer. For example:
        Question: Which magazine was started first Arthur's Magazine or First for Women?
        Answer: Arthur's Magazine

        Question: Is Children's National Medical Center or MedStar Washington Hospital Center the largest private hospital in Washington, D.C.?
        Answer: MedStar Washington Hospital Center

        Now answer the question:

        Question: {question}
        Answer: """

        # Qwen-specific instruction format (if using the instruct version)
        messages = [
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=50, # Sufficient for a concise QA answer
                do_sample=False,
                num_beams=1,
                pad_token_id=tokenizer.eos_token_id # Important for generation
            )

        # Decode the generated text, skipping the input prompt
        generated_text = tokenizer.decode(output[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()

        pred_answers.append(generated_text)

        if i % (NUM_SAMPLES // 5 or 1) == 0 and i > 0:
            print(f"\n--- Sample {i+1}/{len(dataset)} ---")
            print(f"Q: {question}")
            # Print the first gold answer for readability
            print(f"Gold: {gold_answer_list[0]}")
            print(f"Pred: {generated_text}")

    # 3. Compute Metrics
    print("\n--- Evaluation Complete ---")
    metrics = compute_metrics(gold_answers, pred_answers)

    print(f"Results for {MODEL_ID} on HotpotQA ({len(dataset)} samples):")
    print(f"  Exact Match (EM): {metrics['EM']:.2f}%")
    print(f"  F1 Score (F1): {metrics['F1']:.2f}%")
    print(f"  Precision: {metrics['Precision']:.2f}%")
    print(f"  Recall: {metrics['Recall']:.2f}%")


if __name__ == "__main__":
    # Ensure all required libraries are installed:
    # pip install transformers datasets accelerate torch bitsandbytes tqdm
    main()

Loading model: Qwen/Qwen2.5-7B-Instruct...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully on device: cuda

Loading HotpotQA dataset (split: validation, samples: 100)...
Dataset loaded with 100 samples.

Starting closed-book evaluation...


 21%|██        | 21/100 [00:18<01:06,  1.19it/s]


--- Sample 21/100 ---
Q: Which other Mexican Formula One race car driver has held the podium besides the Force India driver born in 1990?
Gold: Pedro Rodríguez
Pred: Stevens Davies


 41%|████      | 41/100 [00:32<00:41,  1.42it/s]


--- Sample 41/100 ---
Q: Which dog's ancestors include Gordon and Irish Setters: the Manchester Terrier or the Scotch Collie?
Gold: Scotch Collie
Pred: Scotch Collie


 61%|██████    | 61/100 [00:47<00:31,  1.25it/s]


--- Sample 61/100 ---
Q: What distinction is held by the former NBA player who was a member of the Charlotte Hornets during their 1992-93 season and was head coach for the WNBA team Charlotte Sting?
Gold: shortest player ever to play in the National Basketball Association
Pred: first head coach of the Charlotte Sting


 81%|████████  | 81/100 [01:03<00:13,  1.45it/s]


--- Sample 81/100 ---
Q: What is the county seat of the county where East Lempster, New Hampshire is located?
Gold: Newport
Pred: Lempster


100%|██████████| 100/100 [01:16<00:00,  1.30it/s]


--- Evaluation Complete ---
Results for Qwen/Qwen2.5-7B-Instruct on HotpotQA (100 samples):
  Exact Match (EM): 18.00%
  F1 Score (F1): 29.46%
  Precision: 30.83%
  Recall: 29.38%





In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re
import string
from collections import Counter

# --- Configuration ---
# NOTE: Qwen2.5-7B-Instruct requires significant VRAM (>= 16GB) to run.
# Consider using quantization (load_in_4bit=True) or a smaller model if resources are limited.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
DATASET_NAME = "hotpot_qa"
SUBSET_NAME = "fullwiki"
SPLIT = "validation" # Use the validation set for testing
NUM_SAMPLES = 10    # Set a small number for quick testing (e.g., 100). Increase for full evaluation.

def load_model_and_tokenizer(model_id):
    """Loads the model and tokenizer, optimizing for available hardware."""
    print(f"Loading model: {model_id}...")

    # Use GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using Device: {device}")

    # Setup for model loading (using bfloat16 for better performance on modern GPUs)
    kwargs = {
        "torch_dtype": torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8 else torch.float16,
        "device_map": "auto",
        "low_cpu_mem_usage": True
    }

    # Try using 4-bit quantization if GPU is available to save VRAM
    if device == "cuda" and torch.cuda.get_device_properties(0).total_memory / (1024**3) < 24:
        print("Warning: Low VRAM detected. Attempting to load with 4-bit quantization.")
        try:
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(load_in_4bit=True)
            kwargs["quantization_config"] = bnb_config
        except ImportError:
            print("Install 'bitsandbytes' for 4-bit quantization to work on low VRAM.")

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
        model.eval()
        print(f"Model loaded successfully on device: {device}")
        return tokenizer, model, device
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Please ensure you have sufficient hardware (GPU/RAM) and required libraries installed.")
        return None, None, None

def normalize_answer(s):
    """
    Official HotpotQA normalization for answers.
    Lowercases, removes punctuation, articles (a, an, the), and extra whitespace.
    """
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score(prediction, ground_truth):
    """Computes F1 score based on token overlap (official HotpotQA version)."""
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    ZERO_METRIC = (0.0, 0.0, 0.0) # F1, Precision, Recall

    # Handling 'yes', 'no', 'noanswer' as non-overlapping tokenization causes issues.
    if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC
    if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()

    # Token matching using Counter
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return ZERO_METRIC

    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1, precision, recall

def exact_match_score(prediction, ground_truth):
    """Computes Exact Match score (official HotpotQA version)."""
    return (normalize_answer(prediction) == normalize_answer(ground_truth))

def compute_metrics(gold_answers, pred_answers):
    """Calculates Exact Match (EM), F1 Score, Precision, and Recall for a list of answers."""
    em_total = 0.0
    f1_total = 0.0
    prec_total = 0.0
    recall_total = 0.0

    for gold_list, pred in zip(gold_answers, pred_answers):
        # HotpotQA official metric uses the MAX score across all gold answers
        best_em = 0.0
        best_f1 = 0.0
        best_prec = 0.0
        best_recall = 0.0

        for gold_text in gold_list:
            # 1. EM Score
            em = exact_match_score(pred, gold_text)
            best_em = max(best_em, float(em))

            # 2. F1 Score (Returns F1, Precision, Recall tuple)
            f1, prec, recall = f1_score(pred, gold_text)

            if f1 > best_f1:
                best_f1 = f1
                best_prec = prec
                best_recall = recall

        em_total += best_em
        f1_total += best_f1
        prec_total += best_prec
        recall_total += best_recall

    num_samples = len(gold_answers)
    return {
        "EM": (em_total / num_samples) * 100,
        "F1": (f1_total / num_samples) * 100,
        "Precision": (prec_total / num_samples) * 100,
        "Recall": (recall_total / num_samples) * 100,
    }

# NOTE: The previous compute_f1 function is removed as it's replaced by the official f1_score.
# NOTE: All supporting fact (sp) and joint metrics logic is removed as this is a closed-book test.


def main():
    """Main function to run the HotpotQA evaluation."""
    tokenizer, model, device = load_model_and_tokenizer(MODEL_ID)
    if model is None:
        return

    # 1. Load Dataset
    print(f"\nLoading HotpotQA dataset (split: {SPLIT}, samples: {NUM_SAMPLES})...")
    try:
        dataset = load_dataset(DATASET_NAME, SUBSET_NAME, split=SPLIT)
        # Filter for the first N samples
        dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))
        print(f"Dataset loaded with {len(dataset)} samples.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    gold_answers = []
    pred_answers = []

    print("\nStarting closed-book evaluation...")

    for i, example in tqdm(enumerate(dataset), total=len(dataset)):
        # HotpotQA question format
        question = example['question']

        # The gold answer is a list of possible answers
        # HotpotQA stores the answer as a string or list of strings.
        # We ensure it's a list for consistency with multiple possible answers.
        gold_answer_list = example['answer'] if isinstance(example['answer'], list) else [example['answer']]
        gold_answers.append(gold_answer_list)

        # --- Closed-Book Prompting ---
        # The prompt only contains the question, not the context,
        # forcing the model to rely on its internal knowledge.
        # prompt = f"Answer the following question concisely:\nQuestion: {question}\nAnswer:"
        prompt = f"""You are an expert at giving concise answers. Do not give any explanations, only a short answer. For example:
        Question: Which magazine was started first Arthur's Magazine or First for Women?
        Answer: Arthur's Magazine

        Question: Is Children's National Medical Center or MedStar Washington Hospital Center the largest private hospital in Washington, D.C.?
        Answer: MedStar Washington Hospital Center

        Now answer the question:

        Question: {question}
        Answer: """

        # Qwen-specific instruction format (if using the instruct version)
        messages = [
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=50, # Sufficient for a concise QA answer
                do_sample=False,
                num_beams=1,
                pad_token_id=tokenizer.eos_token_id # Important for generation
            )

        # Decode the generated text, skipping the input prompt
        generated_text = tokenizer.decode(output[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()

        pred_answers.append(generated_text)

        if i % (NUM_SAMPLES // 5 or 1) == 0 and i > 0:
            print(f"\n--- Sample {i+1}/{len(dataset)} ---")
            print(f"Q: {question}")
            # Print the first gold answer for readability
            print(f"Gold: {gold_answer_list[0]}")
            print(f"Pred: {generated_text}")

    # 3. Compute Metrics
    print("\n--- Evaluation Complete ---")
    metrics = compute_metrics(gold_answers, pred_answers)

    print(f"Results for {MODEL_ID} on HotpotQA ({len(dataset)} samples):")
    print(f"  Exact Match (EM): {metrics['EM']:.2f}%")
    print(f"  F1 Score (F1): {metrics['F1']:.2f}%")
    print(f"  Precision: {metrics['Precision']:.2f}%")
    print(f"  Recall: {metrics['Recall']:.2f}%")


if __name__ == "__main__":
    # Ensure all required libraries are installed:
    # pip install transformers datasets accelerate torch bitsandbytes tqdm
    main()

  from .autonotebook import tqdm as notebook_tqdm


Loading model: Qwen/Qwen2.5-7B-Instruct...


`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 4 files: 100%|██████████| 4/4 [00:55<00:00, 13.96s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.41s/it]


Model loaded successfully on device: cpu

Loading HotpotQA dataset (split: validation, samples: 10)...


Generating train split: 100%|██████████| 90447/90447 [00:02<00:00, 44690.37 examples/s]
Generating validation split: 100%|██████████| 7405/7405 [00:00<00:00, 38545.71 examples/s]
Generating test split: 100%|██████████| 7405/7405 [00:00<00:00, 45341.81 examples/s]


Dataset loaded with 10 samples.

Starting closed-book evaluation...


  0%|          | 0/10 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 30%|███       | 3/10 [02:00<04:49, 41.38s/it]


--- Sample 3/10 ---
Q: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
Gold: Animorphs
Pred: Star Wars: The Rise of Kylo Ren


 50%|█████     | 5/10 [03:15<03:15, 39.17s/it]


--- Sample 5/10 ---
Q: The director of the romantic comedy "Big Stone Gap" is based in what New York city?
Gold: Greenwich Village, New York City
Pred: New York City


 70%|███████   | 7/10 [04:32<01:56, 38.68s/it]


--- Sample 7/10 ---
Q: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?
Gold: Eenasul Fateh
Pred: Tom Peters


 90%|█████████ | 9/10 [05:47<00:37, 37.89s/it]


--- Sample 9/10 ---
Q: Who is older, Annie Morton or Terry Richardson?
Gold: Terry Richardson
Pred: Annie Morton


100%|██████████| 10/10 [06:22<00:00, 38.28s/it]



--- Evaluation Complete ---
Results for Qwen/Qwen2.5-7B-Instruct on HotpotQA (10 samples):
  Exact Match (EM): 30.00%
  F1 Score (F1): 37.50%
  Precision: 40.00%
  Recall: 36.00%


Loading model: Qwen/Qwen2.5-7B-Instruct...
Using Device: cpu


Loading checkpoint shards:  25%|██▌       | 1/4 [00:05<00:17,  5.74s/it]


KeyboardInterrupt: 

In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re
import string
from collections import Counter
torch.cuda.is_available()

  from .autonotebook import tqdm as notebook_tqdm


True