# 02 — BioGPT Prompting on PubMedQA (small subset)

In [None]:
!pip -q install transformers datasets accelerate evaluate pandas numpy

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import numpy as np, pandas as pd
from tqdm.auto import tqdm
from src.utils import PROMPT_TEMPLATE, normalize_label

# Load data.
ds = load_dataset("pubmed_qa", "pqa_labeled")
# The published `pqa_labeled` config exposes only a "train" split, so indexing
# "validation" unconditionally raises KeyError. Use a validation split when the
# loaded dataset provides one and fall back to "train" otherwise.
val = ds["validation"] if "validation" in ds else ds["train"]

# Use base BioGPT (smaller than Large; safer on Colab T4).
model_name = "microsoft/biogpt"
tok = AutoTokenizer.from_pretrained(model_name)
gen = AutoModelForCausalLM.from_pretrained(model_name)
# Text-generation pipeline shared by every prompt; each call generates at most
# 32 new tokens.
# NOTE(review): `gen` is already instantiated when it is passed in, so
# `device_map="auto"` may not move it to the GPU -- confirm the placement
# (e.g. inspect `pipe.device`) before timing a run.
pipe = pipeline("text-generation", model=gen, tokenizer=tok, device_map="auto", max_new_tokens=32)

def ask_biogpt(question, context):
    """Prompt BioGPT with a question and its context and parse the answer.

    Args:
        question: Question text inserted into ``PROMPT_TEMPLATE``.
        context: Supporting text. May be a plain string, a list of passages,
            or a PubMedQA-style dict whose ``"contexts"`` entry holds the
            passages; non-string inputs are joined with spaces.

    Returns:
        A ``(label, raw_answer)`` tuple: ``raw_answer`` is the stripped text
        after the last ``"Answer:"`` in the generated output, and ``label`` is
        ``normalize_label(raw_answer)``.
    """
    # PubMedQA rows store `context` as a dict of lists rather than a string,
    # and slicing a dict fails, so flatten it to text before truncating.
    if isinstance(context, dict):
        context = context.get("contexts", "")
    if not isinstance(context, str):
        context = " ".join(str(part) for part in (context or []))

    # Cap the context at 1500 characters to keep the prompt short.
    prompt = PROMPT_TEMPLATE.format(question=question, context=context[:1500])
    # Greedy decoding (do_sample=False) keeps the output deterministic.
    out = pipe(prompt, do_sample=False)[0]["generated_text"]
    # Keep only the text after the last 'Answer:' marker; presumably the
    # prompt template ends with that marker -- verify against src.utils.
    ans = out.split("Answer:")[-1].strip()
    return normalize_label(ans), ans

# Score only the first N examples (or all of them if fewer exist) so the
# notebook runs quickly.
N = 200
subset = val.select(range(min(N, len(val))))

# Parallel lists: predicted label, gold label and raw answer text per example.
preds, golds, raw = [], [], []

for ex in tqdm(subset):
    pred_label, raw_out = ask_biogpt(ex["question"], ex["context"])
    preds += [pred_label]
    golds += [ex["final_decision"]]
    raw += [raw_out]

# Accuracy is the fraction of positions where prediction equals gold label.
matches = np.asarray(preds) == np.asarray(golds)
acc = matches.mean()
print("BioGPT subset accuracy:", round(float(acc), 4))