In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import os
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score


In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Project folder on Drive; later cells read data from it and copy trained models into it.
drive_path = "/content/drive/MyDrive/BoolQ_Project"

Mounted at /content/drive


In [None]:
# Show the contents of the project folder on the mounted Drive.
ls "/content/drive/MyDrive/BoolQ_Project/"

[0m[01;34mboolq_finetuned_bert_best[0m/            pubmedqa_eval_metrics_tapt6.txt
[01;34mboolq_finetuned_bert_best_current[0m/    pubmedqa_eval_metrics.txt
[01;34mboolq_finetuned_distilbert_best[0m/      pubmedqa_model_comparison.csv
[01;34mboolq_finetuned_roberta_best[0m/         pubmedqa_passages_10k.txt
boolq_test_predictions_bert.csv       pubmedqa_passages_full.txt
boolq_test_predictions.csv            pubmedqa_test_metrics_10k.txt
boolq_test_predictions_epoch3.csv     pubmedqa_test_predictions_10k.csv
boolq_test_predictions_roberta.csv    pubmedqa_train.csv
dev_df_clean.csv                      [01;34mroberta_pubmedqa_finetuned_10k[0m/
[01;34mPubMedQA[0m/                             [01;34mroberta_pubmedqa_finetuned_6k[0m/
pubmedqa_dev.csv                      [01;34mroberta_tapt_pubmedqa[0m/
pubmedqa_dev_preds_boolq.csv          [01;34mroberta_tapt_pubmedqa_10k[0m/
pubmedqa_dev_preds_tapt6.csv          test_df_clean.csv
pubmedqa_dev_with_predictions.csv     t

In [None]:
# Read the labeled PubMedQA train/dev splits and cast the `answer` column to
# integer class ids in the same expression.
train_df = pd.read_csv(f"{drive_path}/pubmedqa_train.csv").astype({"answer": int})
dev_df = pd.read_csv(f"{drive_path}/pubmedqa_dev.csv").astype({"answer": int})

In [None]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def preprocess(df):
    """Convert a pandas frame into a Hugging Face ``Dataset``.

    The ``question``, ``passage`` and ``answer`` columns are renamed to
    ``QUESTION``, ``CONTEXTS`` and ``label`` before conversion.
    """
    column_map = {"question": "QUESTION", "passage": "CONTEXTS", "answer": "label"}
    renamed = df.rename(columns=column_map)
    return Dataset.from_pandas(renamed)

# The dev frame is stored under the "test" key; it is used as the evaluation split.
dataset = {
    split_name: preprocess(frame)
    for split_name, frame in (("train", train_df), ("test", dev_df))
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def tokenize_fn(batch):
    """Tokenize a batch of (question, context) pairs with padding and truncation enabled."""
    questions = batch["QUESTION"]
    contexts = batch["CONTEXTS"]
    return tokenizer(questions, contexts, padding="max_length", truncation=True)

# Tokenize every split, drop the raw text columns, and expose the rest as torch tensors.
tokenized = {}
for split, split_ds in dataset.items():
    encoded = split_ds.map(tokenize_fn, batched=True)
    encoded = encoded.remove_columns(["QUESTION", "CONTEXTS"])
    tokenized[split] = encoded.with_format("torch")

Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [None]:
# Two-class sequence classifier initialised from the pretrained roberta-base weights.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from an evaluation result.

    Args:
        eval_pred: a pair ``(logits, labels)``.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (sklearn's ``accuracy_score``
        and ``f1_score`` with default arguments).
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along dim 1.
    predicted = torch.tensor(logits).argmax(dim=1).numpy()
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted),
    }

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Baseline fine-tuning configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the best dev-set F1 when training ends.
training_args = TrainingArguments(
    output_dir="/content/roberta_pubmedqa_baseline",
    # `evaluation_strategy` is the deprecated spelling of this argument.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Use the same learning rate as the manual TAPT fine-tuning loop (AdamW, lr=2e-5)
    # so the baseline and TAPT runs are comparable. With the Trainer default the
    # logged accuracy/F1 were identical for all three epochs, which suggests the
    # classifier collapsed to a single class -- TODO confirm on a re-run.
    learning_rate=2e-5,
    logging_dir="/content/logs",
    logging_steps=50,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    # The "test" entry holds the tokenized dev split.
    eval_dataset=tokenized["test"],
    # `tokenizer=` is deprecated on Trainer (it emitted a FutureWarning here);
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run baseline fine-tuning with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4289,0.425953,0.848333,0.917944
2,0.4514,0.428224,0.848333,0.917944
3,0.438,0.433405,0.848333,0.917944


TrainOutput(global_step=1800, training_loss=0.41276571803622775, metrics={'train_runtime': 1494.9281, 'train_samples_per_second': 9.633, 'train_steps_per_second': 1.204, 'total_flos': 3788799197184000.0, 'train_loss': 0.41276571803622775, 'epoch': 3.0})

In [None]:
import subprocess

# Save the baseline classifier and tokenizer locally, then copy the folder to Drive.
model_path = "/content/roberta_pubmedqa_baseline"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Same `cp -r` as before, but run with check=True so a failed copy raises
# CalledProcessError instead of being followed by a misleading success message.
# NOTE: model_path is also the Trainer output_dir, so any checkpoint
# sub-folders written during training are copied along with the final model.
subprocess.run(["cp", "-r", model_path, f"{drive_path}/"], check=True)
print("✅ Baseline model saved to Drive")

✅ Baseline model saved to Drive


## TAPT-15K model: masked-language-model pretraining on 15K PubMedQA passages

In [None]:
base_path = drive_path + "/PubMedQA/"

def _load_pubmedqa_json(filename):
    """Parse one JSON file located under ``base_path`` and return its contents."""
    with open(base_path + filename) as handle:
        return json.load(handle)

# The three PubMedQA JSON files found in the project's PubMedQA folder.
pqaa_data = _load_pubmedqa_json("ori_pqaa.json")
pqau_data = _load_pubmedqa_json("ori_pqau.json")
pqal_data = _load_pubmedqa_json("ori_pqal.json")

In [None]:
import random

# Draw the keys with a dedicated, seeded generator so the 15K passage corpus
# (and therefore the TAPT run) is the same on every Restart & Run All.
SAMPLE_SEED = 42
_sampler = random.Random(SAMPLE_SEED)

# 9K keys from pqaa_data + 5K from pqau_data, sampled without replacement.
pqaa_keys = _sampler.sample(list(pqaa_data.keys()), 9000)
pqau_keys = _sampler.sample(list(pqau_data.keys()), 5000)
pqal_keys = list(pqal_data.keys())[:1000]  # first 1K from labeled

In [None]:
# Extract contexts
def extract_passages(data, keys):
    """Return one passage string per key whose entry has a 'CONTEXTS' field.

    Each passage is the entry's 'CONTEXTS' items joined with single spaces.
    Entries without a 'CONTEXTS' field are skipped.
    """
    joined = []
    for key in keys:
        entry = data[key]
        if 'CONTEXTS' in entry:
            joined.append(' '.join(entry['CONTEXTS']))
    return joined
# Concatenate the passages from the three sources, in this order.
passages = (
    extract_passages(pqaa_data, pqaa_keys)
    + extract_passages(pqau_data, pqau_keys)
    + extract_passages(pqal_data, pqal_keys)
)

# Save to file, one passage per line. The file is later read line by line, so
# any line break embedded in a passage is replaced by a space to keep each
# passage on exactly one line. The encoding is explicit so the result does not
# depend on the platform default.
with open("/content/pubmedqa_passages_15k.txt", "w", encoding="utf-8") as f:
    for p in passages:
        single_line = p.replace("\r", " ").replace("\n", " ").strip()
        f.write(single_line + "\n")
print(f"✅ Total passages written: {len(passages)}")

✅ Total passages written: 15000


In [None]:
import torch, random, json
import pandas as pd
from transformers import RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from tqdm import tqdm

# Load tokenizer (from the BoolQ fine-tuned RoBERTa checkpoint on Drive)
roberta_tokenizer = RobertaTokenizer.from_pretrained(f"{drive_path}/boolq_finetuned_roberta_best")

# Load dataset and tokenize
dataset = load_dataset("text", data_files="/content/pubmedqa_passages_15k.txt")

def tokenize(examples):
    """Tokenize a batch of passages, truncating/padding each one to 512 tokens."""
    return roberta_tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Tokenize in batches and drop the raw "text" column.
tokenized = dataset.map(tokenize, remove_columns=["text"], batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [None]:
# Load model: masked-LM head on top of the BoolQ fine-tuned RoBERTa checkpoint.
roberta_model = RobertaForMaskedLM.from_pretrained(drive_path + "/boolq_finetuned_roberta_best")
# Fall back to the CPU when no GPU is attached instead of failing on a
# hard-coded "cuda" (same device selection as the later fine-tuning cell).
roberta_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Data collator for MLM: masks tokens with probability 0.15 when building batches.
collator = DataCollatorForLanguageModeling(
    tokenizer=roberta_tokenizer,
    mlm=True,
    mlm_probability=0.15
)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at /content/drive/MyDrive/BoolQ_Project/boolq_finetuned_roberta_best and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# TAPT (masked-LM) training configuration: 3 epochs, checkpoint every 500 steps
# (keeping only the 2 most recent), log the loss every 100 steps.
args = TrainingArguments(
    output_dir="/content/roberta_tapt_pubmedqa_15k",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True,
    report_to="none"
)

trainer = Trainer(
    model=roberta_model,
    args=args,
    train_dataset=tokenized["train"],
    # `tokenizer=` is deprecated on Trainer (it emitted a FutureWarning here);
    # `processing_class=` is its replacement.
    processing_class=roberta_tokenizer,
    data_collator=collator,
)


  trainer = Trainer(


In [None]:
# Run the masked-language-model (TAPT) training configured above.
trainer.train()

Step,Training Loss
100,6.4509
200,3.7571
300,2.8205
400,2.4614
500,2.3021
600,2.1215
700,2.0308
800,2.0164
900,1.9295
1000,1.8912


TrainOutput(global_step=5625, training_loss=1.741540781656901, metrics={'train_runtime': 6044.7991, 'train_samples_per_second': 7.444, 'train_steps_per_second': 0.931, 'total_flos': 1.184694584832e+16, 'train_loss': 1.741540781656901, 'epoch': 3.0})

In [None]:
import subprocess

# Save the TAPT (masked-LM) model and its tokenizer locally, then copy to Drive.
save_path = "/content/roberta_tapt_pubmedqa_15k"
roberta_model.save_pretrained(save_path)
roberta_tokenizer.save_pretrained(save_path)

# Same `cp -r` as before, but run with check=True so a failed copy raises
# CalledProcessError instead of being followed by a misleading success message.
subprocess.run(["cp", "-r", save_path, f"{drive_path}/"], check=True)
print("✅ TAPT-15K model saved!")


✅ TAPT-15K model saved!


## Fine-tune the TAPT-15K RoBERTa model on the labeled PubMedQA data

In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_scheduler
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import torch
import pandas as pd
from torch.optim import AdamW

In [None]:
# Load model from TAPT-15K: a 2-label classification head is placed on top of
# the TAPT checkpoint, and the tokenizer is read from the same folder.
tapt_checkpoint = "/content/roberta_tapt_pubmedqa_15k"
model = RobertaForSequenceClassification.from_pretrained(tapt_checkpoint, num_labels=2)
tokenizer = RobertaTokenizer.from_pretrained(tapt_checkpoint)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Load labeled data and cast the `answer` column to integer class ids.
train_df = pd.read_csv(f"{drive_path}/pubmedqa_train.csv").astype({"answer": int})
dev_df = pd.read_csv(f"{drive_path}/pubmedqa_dev.csv").astype({"answer": int})


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/roberta_tapt_pubmedqa_15k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Dataset class
class PubMedQADataset(Dataset):
    """Map-style dataset that tokenizes one (question, passage) row on access.

    Args:
        df: frame with ``question``, ``passage`` and ``answer`` columns.
        tokenizer: callable used to encode the question/passage pair.
        max_len: length every example is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # Number of rows in the underlying frame.
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the encoded fields for row ``idx`` plus a ``labels`` tensor."""
        record = self.df.iloc[idx]
        encoded = self.tokenizer(
            record['question'],
            record['passage'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        # return_tensors="pt" yields a leading batch axis of size 1; drop it so
        # the DataLoader can stack examples itself.
        item = {}
        for field, tensor in encoded.items():
            item[field] = tensor.squeeze(0)
        item['labels'] = torch.tensor(record['answer'], dtype=torch.long)
        return item

# Datasets and loaders: training batches are drawn in random order, dev batches
# in their original order; both use a batch size of 8.
train_dataset = PubMedQADataset(train_df, tokenizer)
dev_dataset = PubMedQADataset(dev_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)

# Training setup: AdamW with a linear schedule and no warmup, sized for 3 passes
# over the training loader.
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = 3 * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
import subprocess

# Fine-tuning loop
# Each epoch: one pass over train_loader with gradient clipping, then a full
# evaluation on dev_loader. Whenever dev F1 improves, the model and tokenizer
# are saved locally and the folder is copied to Drive.
NUM_EPOCHS = 3  # keep in sync with the epoch count used to size the LR schedule
FINETUNED_DIR = "/content/roberta_pubmedqa_finetuned_15k"

# Start below any reachable F1 so the first epoch always produces a checkpoint,
# even if its F1 is 0.
best_f1 = float("-inf")
for epoch in range(NUM_EPOCHS):
    print(f"\n🔥 Fine-Tuning TAPT-15K | Epoch {epoch+1}/{NUM_EPOCHS}")
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        lr_scheduler.step()

    print(f"📉 Avg Train Loss: {total_loss/len(train_loader):.4f}")

    # Evaluation
    model.eval()
    dev_preds, dev_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            dev_preds.extend(preds.cpu().tolist())
            # Labels are only needed for scoring, so they never leave the CPU.
            dev_labels.extend(batch['labels'].tolist())

    acc = accuracy_score(dev_labels, dev_preds)
    f1 = f1_score(dev_labels, dev_preds)
    print(f"✅ Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        model.save_pretrained(FINETUNED_DIR)
        tokenizer.save_pretrained(FINETUNED_DIR)
        # check=True: a failed copy raises instead of being reported as saved.
        subprocess.run(["cp", "-r", FINETUNED_DIR, f"{drive_path}/"], check=True)
        print(f"💾 Saved best model (F1 = {f1:.4f}) to Drive ✅")


🔥 Fine-Tuning TAPT-15K | Epoch 1/3


Training:   0%|          | 2/600 [00:01<05:58,  1.67it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   0%|          | 3/600 [00:01<06:32,  1.52it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   1%|          | 6/600 [00:04<06:58,  1.42it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   2%|▏         | 12/600 [00:08<07:10,  1.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence 

📉 Avg Train Loss: 0.3824


Evaluating:   1%|          | 1/150 [00:00<01:45,  1.41it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   1%|▏         | 2/150 [00:00<01:03,  2.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   7%|▋         | 10/150 [00:02<00:33,  4.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy.

✅ Accuracy: 0.8683 | F1 Score: 0.9273
💾 Saved best model (F1 = 0.9273) to Drive ✅

🔥 Fine-Tuning TAPT-15K | Epoch 2/3


Training:   0%|          | 2/600 [00:01<05:47,  1.72it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   1%|          | 5/600 [00:03<06:58,  1.42it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   2%|▏         | 12/600 [00:08<07:16,  1.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   4%|▍         | 23/600 [00:16<07:15,  1.33it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence

📉 Avg Train Loss: 0.2912


Evaluating:   1%|          | 1/150 [00:00<01:45,  1.41it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   1%|▏         | 2/150 [00:00<01:03,  2.33it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   7%|▋         | 10/150 [00:02<00:32,  4.31it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy.

✅ Accuracy: 0.8700 | F1 Score: 0.9213

🔥 Fine-Tuning TAPT-15K | Epoch 3/3


Training:   0%|          | 0/600 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   1%|          | 4/600 [00:02<06:47,  1.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   2%|▏         | 12/600 [00:08<07:20,  1.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   2%|▏         | 13/600 [00:09<07:19,  1.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

📉 Avg Train Loss: 0.2076


Evaluating:   1%|          | 1/150 [00:00<01:46,  1.40it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   1%|▏         | 2/150 [00:00<01:03,  2.33it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   7%|▋         | 10/150 [00:02<00:34,  4.09it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy.

✅ Accuracy: 0.8892 | F1 Score: 0.9343
💾 Saved best model (F1 = 0.9343) to Drive ✅


In [None]:
import json
import os

save_path = "/content/roberta_pubmedqa_finetuned_15k"

# The fine-tuning loop already wrote the best-F1 classifier (`model`) and its
# tokenizer to this directory. `roberta_model` is the RobertaForMaskedLM used
# for TAPT, so calling `roberta_model.save_pretrained(save_path)` here would
# overwrite the classifier checkpoint with masked-LM weights. Nothing is
# re-saved; instead, check that the directory still holds a classifier.
with open(os.path.join(save_path, "config.json")) as f:
    saved_architectures = json.load(f).get("architectures") or []
if "RobertaForSequenceClassification" not in saved_architectures:
    raise RuntimeError(
        f"{save_path} does not contain a sequence-classification checkpoint "
        f"(architectures={saved_architectures}); re-run the fine-tuning loop."
    )
print(f"✅ Classifier checkpoint intact at {save_path}: {saved_architectures}")


('/content/roberta_pubmedqa_finetuned_15k/tokenizer_config.json',
 '/content/roberta_pubmedqa_finetuned_15k/special_tokens_map.json',
 '/content/roberta_pubmedqa_finetuned_15k/vocab.json',
 '/content/roberta_pubmedqa_finetuned_15k/merges.txt',
 '/content/roberta_pubmedqa_finetuned_15k/added_tokens.json')

In [None]:
import subprocess

# Copy the fine-tuned classifier folder to Drive. check=True makes a failed
# copy raise CalledProcessError instead of printing the success message anyway.
subprocess.run(["cp", "-r", "/content/roberta_pubmedqa_finetuned_15k", f"{drive_path}/"], check=True)
print("✅ TAPT-15K fine-tuned model saved to Drive!")


✅ TAPT-15K fine-tuned model saved to Drive!


In [None]:
import os
# Print the file names present in the Drive copy of the fine-tuned model folder.
print(os.listdir(f"{drive_path}/roberta_pubmedqa_finetuned_15k"))


['config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.json', 'merges.txt']


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Define the path
drive_path = "/content/drive/MyDrive/BoolQ_Project"
model_dir = f"{drive_path}/roberta_pubmedqa_finetuned_15k"

# Check if files exist: report a missing folder first, otherwise list its entries.
import os
if not os.path.exists(model_dir):
    print("❌ Folder not found. Model may not be saved.")
else:
    print(f"✅ Folder exists: {model_dir}")
    print("📂 Files inside:")
    for entry in os.listdir(model_dir):
        print(" -", entry)

Mounted at /content/drive
✅ Folder exists: /content/drive/MyDrive/BoolQ_Project/roberta_pubmedqa_finetuned_15k
📂 Files inside:
 - config.json
 - model.safetensors
 - tokenizer_config.json
 - special_tokens_map.json
 - merges.txt
 - vocab.json


In [None]:
# Shell listing of the fine-tuned model folder on Drive.
!ls "/content/drive/MyDrive/BoolQ_Project/roberta_pubmedqa_finetuned_15k"

config.json  model.safetensors	      tokenizer_config.json
merges.txt   special_tokens_map.json  vocab.json
