# 01 => Load PubMedQA and Train a Fast Baseline (TF–IDF + LogisticRegression)

In [1]:
!pip -q install -U pip
!pip -q install transformers datasets evaluate scikit-learn pandas numpy joblib

In [2]:
# clone once
!rm -rf pubmedqa-llm-bot
!git clone https://github.com/AnnaJazayeri/pubmedqa-llm-bot.git
%cd /content/pubmedqa-llm-bot

# install dependencies
!pip -q install -r requirements.txt

# make sure Python can see the project root so `src` imports work
import sys
if '/content/pubmedqa-llm-bot' not in sys.path:
    sys.path.append('/content/pubmedqa-llm-bot')

# then jump into notebooks folder if you want to open/run them there
%cd notebooks

Cloning into 'pubmedqa-llm-bot'...
remote: Enumerating objects: 28, done.
remote: Counting objects: 100% (28/28), done.
remote: Compressing objects: 100% (25/25), done.
remote: Total 28 (delta 9), reused 9 (delta 1), pack-reused 0 (from 0)
Receiving objects: 100% (28/28), 11.17 KiB | 2.79 MiB/s, done.
Resolving deltas: 100% (9/9), done.
/content/pubmedqa-llm-bot
/content/pubmedqa-llm-bot/notebooks


In [3]:
from datasets import load_dataset
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Download the PubMedQA "pqa_labeled" configuration from the Hugging Face Hub.
# This config only exposes a single 'train' split, so that split holds every example.
ds = load_dataset(path="pubmed_qa", name="pqa_labeled")
all_items = ds["train"]

def to_dataframe(items):
    """Flatten PubMedQA records into a two-column DataFrame.

    Args:
        items: iterable of records. Each record must provide ``question``,
            ``context["contexts"]`` (a list of strings) and ``final_decision``.
            The records are read in a single pass, so one-shot iterators and
            generators work as well as lists.

    Returns:
        pd.DataFrame with one row per record and two columns:
        ``text`` -- "Q: <question>", a newline, then "C: <context>", where the
        context is the record's context passages joined with single spaces;
        ``label`` -- the record's ``final_decision`` value.
    """
    texts, labels = [], []
    for it in items:
        # the context text lives under context["contexts"] as a list; join into one string
        context = " ".join(it["context"]["contexts"])
        # glue question and context together so a basic model can read it as one text
        texts.append(f"Q: {it['question']}\nC: {context}")
        labels.append(it["final_decision"])  # 'yes' / 'no' / 'maybe'

    # return an easy table: one column for text, one for the label
    return pd.DataFrame({"text": texts, "label": labels})

# Flatten every labelled example into a single table, then hold out 20% of the
# rows for validation (fixed seed so the split is reproducible).
# NOTE(review): the split is not stratified by label; consider
# stratify=df["label"] if class balance across the two splits matters.
df = to_dataframe(all_items)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

print(train_df.head(5))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

pqa_labeled/train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

                                                  text label
29   Q: Visceral adipose tissue area measurement at...   yes
535  Q: Production of chemokines by perivascular ad...   yes
695  Q: Is the Androgen Deficiency of Aging Men (AD...    no
557  Q: Epidural analgesia for surgical treatment o...    no
836  Q: Can dogs prime autistic children for therap...   yes


In [4]:
# Majority-class baseline (for report): always predict the most frequent
# training label and measure how often that is right on the validation rows.
maj = train_df["label"].mode()[0]
maj_acc = val_df["label"].eq(maj).mean()
print("Majority label:", maj, "Validation Acc:", round(maj_acc, 4))

Majority label: yes Validation Acc: 0.61


In [5]:
# TF-IDF + Logistic Regression baseline: unigram/bigram TF-IDF features
# (vocabulary capped at 50,000 terms) feeding a logistic-regression classifier.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=50000, ngram_range=(1, 2))),
    ("clf", LogisticRegression(max_iter=200)),
])
pipe.fit(train_df['text'], train_df['label'])

pred = pipe.predict(val_df['text'])
print("TFIDF+LR Accuracy:", round(accuracy_score(val_df['label'], pred), 4))
# zero_division=0 scores a class that is never predicted as 0.0, the same value
# the default setting reports, but without emitting an UndefinedMetricWarning
# for each affected metric.
print(classification_report(val_df['label'], pred, zero_division=0))

# Save model for reuse
joblib.dump(pipe, "tfidf_lr_pubmedqa.joblib")
print("Saved model: tfidf_lr_pubmedqa.joblib")

TFIDF+LR Accuracy: 0.62
              precision    recall  f1-score   support

       maybe       0.00      0.00      0.00        15
          no       0.67      0.06      0.12        63
         yes       0.62      0.98      0.76       122

    accuracy                           0.62       200
   macro avg       0.43      0.35      0.29       200
weighted avg       0.59      0.62      0.50       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved model: tfidf_lr_pubmedqa.joblib
