# 01 => Load PubMedQA and Train a Fast Baseline (TFâ€“IDF + LogisticRegression)

In [2]:
!pip -q install -U pip
!pip -q install transformers datasets evaluate scikit-learn pandas numpy joblib

In [4]:
# clone once
!rm -rf pubmedqa-llm-bot
!git clone https://github.com/AnnaJazayeri/pubmedqa-llm-bot.git
%cd /content/pubmedqa-llm-bot

# install dependencies
!pip -q install -r requirements.txt

# make sure Python can see the project root so `src` imports work
import sys
if '/content/pubmedqa-llm-bot' not in sys.path:
    sys.path.append('/content/pubmedqa-llm-bot')

# then jump into notebooks folder if you want to open/run them there
%cd notebooks

Cloning into 'pubmedqa-llm-bot'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 20 (delta 3), reused 12 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (20/20), 7.80 KiB | 2.60 MiB/s, done.
Resolving deltas: 100% (3/3), done.
/content/pubmedqa-llm-bot
/content/pubmedqa-llm-bot/notebooks


In [5]:
from datasets import load_dataset
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

# pulls the PubMedQA data (this config only has a single 'train' split)
ds = load_dataset("pubmed_qa", "pqa_labeled")
all_items = ds["train"]

def to_dataframe(items):
    # grab fields we care about
    questions = [it["question"] for it in items]
    # the context text lives under context["contexts"] as a list; join into one string
    contexts = [" ".join(it["context"]["contexts"]) for it in items]
    labels = [it["final_decision"] for it in items]  # 'yes' / 'no' / 'maybe'

    # glue question and context together so a basic model can read it as one text
    texts = [f"Q: {q}\nC: {c}" for q, c in zip(questions, contexts)]

    # return an easy table: one column for text, one for the label
    return pd.DataFrame({"text": texts, "label": labels})

# build one table, then split 80/20 into train/validation
df = to_dataframe(all_items)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

print(train_df.head())


SyntaxError: unterminated f-string literal (detected at line 21) (ipython-input-3563362596.py, line 21)

In [None]:
# Majority baseline (for report)
maj = train_df['label'].mode()[0]
maj_acc = (val_df['label'] == maj).mean()
print("Majority label:", maj, "Validation Acc:", round(maj_acc, 4))

In [None]:
# TF-IDF + Logistic Regression baseline
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=50000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=200, n_jobs=None))
])
pipe.fit(train_df['text'], train_df['label'])

pred = pipe.predict(val_df['text'])
print("TFIDF+LR Accuracy:", round(accuracy_score(val_df['label'], pred), 4))
print(classification_report(val_df['label'], pred))

# Save model for reuse
joblib.dump(pipe, "tfidf_lr_pubmedqa.joblib")
print("Saved model: tfidf_lr_pubmedqa.joblib")