# 01 — Load PubMedQA and Train a Fast Baseline (TF–IDF + LogisticRegression)

In [None]:
!pip -q install -U pip
!pip -q install transformers datasets evaluate scikit-learn pandas numpy joblib

In [None]:
from datasets import load_dataset
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load the expert-labeled PubMedQA configuration.
ds = load_dataset("pubmed_qa", "pqa_labeled")

# `pqa_labeled` is published with a single "train" split, so indexing
# ds["validation"] raises KeyError. Use an official validation split when one
# is present; otherwise carve out a seeded, stratified hold-out so the
# yes/no/maybe proportions match between the two parts and reruns reproduce.
if "validation" in ds:
    train_ds, val_ds = ds["train"], ds["validation"]
else:
    labeled = ds["train"]
    train_idx, val_idx = train_test_split(
        np.arange(len(labeled)),
        test_size=0.2,
        random_state=42,
        stratify=list(labeled["final_decision"]),
    )
    # Plain Python ints keep `Dataset.select` independent of numpy index dtypes.
    train_ds = labeled.select(train_idx.tolist())
    val_ds = labeled.select(val_idx.tolist())

def to_df(split):
    """Flatten a PubMedQA split into a two-column DataFrame.

    Parameters
    ----------
    split : iterable of mappings
        Each record must provide the keys 'question', 'context' and
        'final_decision'.

    Returns
    -------
    pandas.DataFrame
        Column "text" holds "Q: <question>", a newline, then "C: <context>";
        column "label" holds the record's 'final_decision' value unchanged.
        An empty split yields an empty frame with both columns present.
    """

    def _context_text(ctx):
        # When 'context' is a nested record carrying a list of abstract
        # paragraphs under "contexts", join those paragraphs; interpolating
        # the record directly would embed a Python dict repr in the features.
        if isinstance(ctx, dict) and "contexts" in ctx:
            return " ".join(ctx["contexts"])
        return str(ctx)

    records = []
    for row in split:  # single pass over the split instead of one per field
        question = row["question"]
        context = _context_text(row["context"])
        records.append(
            {"text": f"Q: {question}\nC: {context}", "label": row["final_decision"]}
        )
    return pd.DataFrame(records, columns=["text", "label"])

# Materialise both splits as DataFrames with "text" / "label" columns.
train_df = to_df(train_ds)
val_df   = to_df(val_ds)

# Fail fast here rather than deep inside the vectoriser if a split is empty.
assert len(train_df) > 0 and len(val_df) > 0, "empty train/validation frame"

# A bare last expression renders the notebook's rich table view; print() would
# fall back to the truncated plain-text repr.
train_df.head()

In [None]:
# Majority-class baseline: always predict the most frequent training label and
# score that constant prediction on the validation frame (reference number for
# the report).
maj = train_df["label"].mode().iloc[0]
maj_acc = val_df["label"].eq(maj).mean()
print("Majority label:", maj, "Validation Acc:", round(maj_acc, 4))

In [None]:
# TF-IDF + Logistic Regression baseline
MODEL_PATH = "tfidf_lr_pubmedqa.joblib"  # single definition of the artifact name

pipe = Pipeline([
    # Unigram + bigram features, vocabulary capped at 50,000 terms.
    ("tfidf", TfidfVectorizer(max_features=50000, ngram_range=(1, 2))),
    # `n_jobs=None` was the default and is omitted.
    ("clf", LogisticRegression(max_iter=200)),
])
pipe.fit(train_df["text"], train_df["label"])

# Score on the held-out validation frame.
pred = pipe.predict(val_df["text"])
print("TFIDF+LR Accuracy:", round(accuracy_score(val_df["label"], pred), 4))
# zero_division=0: a class the model never predicts is reported with
# precision 0 instead of also emitting an UndefinedMetricWarning.
print(classification_report(val_df["label"], pred, zero_division=0))

# Save the fitted pipeline (vectoriser + classifier) for reuse.
joblib.dump(pipe, MODEL_PATH)
print("Saved model:", MODEL_PATH)