In [None]:
# Mount Google Drive at /content/drive; later cells read data from and copy
# artifacts to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the PubMedQA folder on Drive to see which JSON files are available.
!ls "/content/drive/My Drive/Colab Notebooks/BoolQ_Project/PubMedQA/"

ori_pqaa.json  ori_pqal.json  ori_pqau.json  test_ground_truth.json


In [None]:
# Import necessary libraries
import json
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Drive locations: the project folder and the PubMedQA data folder inside it.
drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project/"
base_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project/PubMedQA/"

Using device: cuda


In [None]:
# Read the three PubMedQA JSON files from Drive and report their sizes.
def _load_pubmedqa_file(filename):
    """Parse one JSON file located directly under ``base_path``."""
    with open(base_path + filename, 'r') as handle:
        return json.load(handle)

pqal_data = _load_pubmedqa_file('ori_pqal.json')
pqaa_data = _load_pubmedqa_file('ori_pqaa.json')
pqau_data = _load_pubmedqa_file('ori_pqau.json')

# Number of top-level entries in each file.
pqal_count, pqaa_count, pqau_count = len(pqal_data), len(pqaa_data), len(pqau_data)

print(f"Number of instances in ori_pqal.json: {pqal_count}")
print(f"Number of instances in ori_pqaa.json: {pqaa_count}")
print(f"Number of instances in ori_pqau.json: {pqau_count}")


Number of instances in ori_pqal.json: 1000
Number of instances in ori_pqaa.json: 211269
Number of instances in ori_pqau.json: 61249


In [None]:
# Convert to DataFrame
def pubmedqa_to_dataframe(data):
    """Flatten a PubMedQA mapping into a DataFrame with one row per record.

    Args:
        data: Mapping from PubMed ID to a record dict. Every record must
            provide 'QUESTION', 'CONTEXTS' (an iterable of strings) and
            'final_decision'; a missing key raises KeyError.

    Returns:
        DataFrame with columns 'pubmed_id', 'question', 'passage' and
        'answer'. 'passage' is the record's contexts joined by single spaces;
        'answer' is the raw 'final_decision' value.
    """
    records = [
        {
            'pubmed_id': record_id,
            'question': record['QUESTION'],
            'passage': ' '.join(record['CONTEXTS']),
            'answer': record['final_decision'],
        }
        for record_id, record in data.items()
    ]
    return pd.DataFrame(records)

# Create DataFrames
pqal_df = pubmedqa_to_dataframe(pqal_data)
pqaa_df = pubmedqa_to_dataframe(pqaa_data)

# Sample a subset of pqaa_df (5,000 instances)
pqaa_subset_df = pqaa_df.sample(n=5000, random_state=42)

# Combine the labeled and sampled artificially generated data
combined_df = pd.concat([pqal_df, pqaa_subset_df], ignore_index=True)

# Clean the answer column (map to binary for simplicity, treating 'maybe' as 'no')
combined_df['answer'] = combined_df['answer'].apply(lambda x: True if x == 'yes' else False)

# Split into train and dev sets (80% train, 20% dev)
pubmedqa_train_df, pubmedqa_dev_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Save the DataFrames
pubmedqa_train_df.to_csv('/content/pubmedqa_train.csv', index=False)
pubmedqa_dev_df.to_csv('/content/pubmedqa_dev.csv', index=False)

# Save to Google Drive
!cp /content/pubmedqa_train.csv "{drive_path}/"
!cp /content/pubmedqa_dev.csv "{drive_path}/"

print("PubMedQA DataFrames created and saved:")
print("Train shape:", pubmedqa_train_df.shape)
print("Dev shape:", pubmedqa_dev_df.shape)
print("Sample train data:\n", pubmedqa_train_df.head())

PubMedQA DataFrames created and saved:
Train shape: (4800, 4)
Dev shape: (1200, 4)
Sample train data:
      pubmed_id                                           question  \
3897  20122254  Do adherens junctions connect stress fibres be...   
5628  14501610  Does factor V Leiden mutation accelerate the o...   
1756  15596562  Does nuclear factor-kappaB repress hypoxia-ind...   
2346  21311167  Is microvascular endothelial dysfunction in ob...   
2996  24405788  Is subdivision of arthropod cap-n-collar expre...   

                                                passage  answer  
3897  Endothelial cell-cell junctions maintain endot...    True  
5628  Smoking is consistently associated with a youn...    True  
1756  Oxygen deprivation for prolonged periods of ti...    True  
2346  Endothelial dysfunction has recently been demo...    True  
2996  The monophyly of Mandibulata - the division of...    True  


In [None]:
# Reload the BoolQ fine-tuned checkpoint with a masked-LM head so it can be
# further pretrained on PubMedQA text (TAPT).
from transformers import RobertaTokenizer, RobertaForMaskedLM

boolq_checkpoint_dir = '/content/drive/My Drive/Colab Notebooks/BoolQ_Project/boolq_finetuned_roberta_best'

# Tokenizer stored in the checkpoint folder.
roberta_tokenizer = RobertaTokenizer.from_pretrained(boolq_checkpoint_dir)

# Same folder, loaded through the MLM class. The load warning reports that the
# lm_head.* weights are not in the checkpoint and are newly initialised.
roberta_model = RobertaForMaskedLM.from_pretrained(boolq_checkpoint_dir)
roberta_model.to(device)
print("RoBERTa model (for TAPT) loaded successfully.")


Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at /content/drive/My Drive/Colab Notebooks/BoolQ_Project/boolq_finetuned_roberta_best and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa model (for TAPT) loaded successfully.


In [None]:
# ✅ Extract passages from PubMedQA combined_df and write them one per line.
# The file is consumed line by line afterwards, so each passage must occupy
# exactly one line:
#   * line breaks inside a passage are replaced by a space (otherwise a single
#     passage would be read back as several examples);
#   * whitespace-only passages are skipped (they would become empty examples);
#   * the encoding is pinned to UTF-8 instead of the platform default.
passages = combined_df['passage'].tolist()
with open('/content/pubmedqa_passages.txt', 'w', encoding='utf-8') as f:
    for passage in passages:
        single_line = ' '.join(passage.splitlines())
        if single_line.strip():
            f.write(single_line + '\n')
print("PubMedQA passages saved for TAPT.")


PubMedQA passages saved for TAPT.


In [None]:
# Turn the passage file into a tokenized Hugging Face dataset for MLM training.
from datasets import load_dataset

# The 'text' loader exposes the file contents under a 'text' column.
dataset = load_dataset('text', data_files='/content/pubmedqa_passages.txt')

def tokenize_function(examples):
    """Tokenize a batch of passages, truncated and padded to 512 tokens each."""
    batch_text = examples['text']
    return roberta_tokenizer(
        batch_text,
        max_length=512,
        padding='max_length',
        truncation=True,
    )

# Batched map; the raw 'text' column is dropped so only tokenizer outputs remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=['text'],
    batched=True,
)
print("Passages tokenized.")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Passages tokenized.


In [None]:
# Task-adaptive pretraining (TAPT): continue masked-LM training of the loaded
# model on the tokenized PubMedQA passages via the Hugging Face Trainer.
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# MLM collator configured with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=roberta_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Trainer settings, grouped by purpose.
tapt_settings = dict(
    # Output location (existing contents may be overwritten).
    output_dir="/content/roberta_tapt_pubmedqa",
    overwrite_output_dir=True,
    # Schedule.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Checkpointing: every 500 steps, keep at most two.
    save_steps=500,
    save_total_limit=2,
    # Logging: every 100 steps, no external reporting integration.
    logging_steps=100,
    report_to="none",
    prediction_loss_only=True,
)
training_args = TrainingArguments(**tapt_settings)

trainer = Trainer(
    model=roberta_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    tokenizer=roberta_tokenizer,
)

# Run TAPT; as the last expression of the cell, the TrainOutput is displayed.
trainer.train()

  trainer = Trainer(


Step,Training Loss
100,6.3812
200,3.6775
300,2.783
400,2.4235
500,2.2544
600,2.1279
700,2.027
800,1.9182
900,1.9079
1000,1.8507


TrainOutput(global_step=2250, training_loss=2.1431409166124133, metrics={'train_runtime': 2431.0414, 'train_samples_per_second': 7.404, 'train_steps_per_second': 0.926, 'total_flos': 4738778339328000.0, 'train_loss': 2.1431409166124133, 'epoch': 3.0})

In [None]:
# ✅ Save the adapted model (after TAPT)
roberta_model.save_pretrained("/content/roberta_tapt_pubmedqa")
roberta_tokenizer.save_pretrained("/content/roberta_tapt_pubmedqa")

# Copy to Drive
!cp -r /content/roberta_tapt_pubmedqa "{drive_path}/"
print("TAPT-adapted RoBERTa model saved to Drive.")

TAPT-adapted RoBERTa model saved to Drive.


In [None]:
import json
import random

# Drive locations used by this cell.
drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project/"
base_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project/PubMedQA/"

# Parse the three PubMedQA JSON files found under base_path.
def _read_pubmedqa_json(filename):
    """Return the parsed contents of ``base_path + filename``."""
    with open(base_path + filename, 'r') as handle:
        return json.load(handle)

pqal_data = _read_pubmedqa_json('ori_pqal.json')
pqaa_data = _read_pubmedqa_json('ori_pqaa.json')
pqau_data = _read_pubmedqa_json('ori_pqau.json')

# Sample 7K from pqaa and 2K from pqau.
# The draw uses a dedicated generator seeded with 42 so the TAPT corpus is the
# same after every kernel restart (the module-level random.sample is unseeded
# here), and so the result does not depend on other users of the global RNG.
_tapt_sampler = random.Random(42)
sampled_pqaa_keys = _tapt_sampler.sample(list(pqaa_data.keys()), 7000)
sampled_pqau_keys = _tapt_sampler.sample(list(pqau_data.keys()), 2000)

# Helper to extract passages
def extract_passages(data_dict, keys=None):
    """Join each selected record's 'CONTEXTS' entries into one passage string.

    Args:
        data_dict: Mapping from record key to a record dict.
        keys: Optional iterable of keys to extract, in the order given.
            ``None`` (the default) selects every key of ``data_dict``. An
            explicitly empty iterable selects nothing and yields ``[]``.

    Returns:
        List of passages, one per selected record, each being the record's
        'CONTEXTS' strings joined by single spaces. Records whose 'CONTEXTS'
        is missing or empty are skipped.

    Raises:
        KeyError: If a requested key is not present in ``data_dict``.
    """
    # Test against None rather than truthiness: an empty selection must not
    # silently fall back to "all records".
    selected_keys = data_dict.keys() if keys is None else keys
    passages = []
    for key in selected_keys:
        contexts = data_dict[key].get('CONTEXTS', [])
        if contexts:
            passages.append(' '.join(contexts))
    return passages

# Gather passages: every pqal record, plus the sampled pqaa and pqau records.
pqal_passages = extract_passages(pqal_data)
pqaa_sampled_passages = extract_passages(pqaa_data, sampled_pqaa_keys)
pqau_sampled_passages = extract_passages(pqau_data, sampled_pqau_keys)

# Single corpus, in the order pqal -> pqaa sample -> pqau sample.
tapt10k_passages = [
    *pqal_passages,
    *pqaa_sampled_passages,
    *pqau_sampled_passages,
]
print(f"✅ Total passages prepared for TAPT (~10K): {len(tapt10k_passages)}")


✅ Total passages prepared for TAPT (~10K): 10000


In [None]:
# Save to file
tapt10k_file = '/content/pubmedqa_passages_10k.txt'
with open(tapt10k_file, 'w') as f:
    for passage in tapt10k_passages:
        f.write(passage + '\n')

# Backup to Drive
!cp {tapt10k_file} "{drive_path}/"
print("✅ Passage file saved to Drive as 'pubmedqa_passages_10k.txt'")

✅ Passage file saved to Drive as 'pubmedqa_passages_10k.txt'


In [None]:
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Start the 10K run from the BoolQ fine-tuned checkpoint, loaded through the
# masked-LM class, under separate *_10k names.
boolq_checkpoint_dir_10k = '/content/drive/My Drive/Colab Notebooks/BoolQ_Project/boolq_finetuned_roberta_best'
roberta_tokenizer_10k = RobertaTokenizer.from_pretrained(boolq_checkpoint_dir_10k)
roberta_model_10k = RobertaForMaskedLM.from_pretrained(boolq_checkpoint_dir_10k)
roberta_model_10k.to(device)

# Read the 10K passage file as a text dataset.
dataset_10k = load_dataset('text', data_files=tapt10k_file)

def tokenize_10k(examples):
    """Tokenize a batch of passages, truncated and padded to 512 tokens each."""
    batch_text = examples['text']
    return roberta_tokenizer_10k(
        batch_text,
        max_length=512,
        padding='max_length',
        truncation=True,
    )

# Batched map; the raw 'text' column is dropped so only tokenizer outputs remain.
tokenized_dataset_10k = dataset_10k.map(
    tokenize_10k,
    remove_columns=['text'],
    batched=True,
)
print("✅ Tokenization of 10K TAPT dataset complete.")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at /content/drive/My Drive/Colab Notebooks/BoolQ_Project/boolq_finetuned_roberta_best and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

✅ Tokenization of 10K TAPT dataset complete.


In [None]:
# Data collator for MLM
data_collator_10k = DataCollatorForLanguageModeling(
    tokenizer=roberta_tokenizer_10k, mlm=True, mlm_probability=0.15
)

# Training arguments
training_args_10k = TrainingArguments(
    output_dir="/content/roberta_tapt_pubmedqa_10k",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True,
    report_to="none"
)

# Trainer
trainer_10k = Trainer(
    model=roberta_model_10k,
    args=training_args_10k,
    train_dataset=tokenized_dataset_10k['train'],
    tokenizer=roberta_tokenizer_10k,
    data_collator=data_collator_10k,
)

# Start TAPT
trainer_10k.train()

# Save model and tokenizer
roberta_model_10k.save_pretrained("/content/roberta_tapt_pubmedqa_10k")
roberta_tokenizer_10k.save_pretrained("/content/roberta_tapt_pubmedqa_10k")

# Backup to Drive
!cp -r /content/roberta_tapt_pubmedqa_10k "{drive_path}/"
print("✅ TAPT-10K model saved as 'roberta_tapt_pubmedqa_10k'")


  trainer_10k = Trainer(


Step,Training Loss
100,6.4613
200,3.822
300,2.9461
400,2.4642
500,2.2912
600,2.1631
700,2.0252
800,2.0096
900,1.9234
1000,1.8926


✅ TAPT-10K model saved as 'roberta_tapt_pubmedqa_10k'


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Read the train and dev splits back from the CSV files on the local Colab disk.
train_df, dev_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/pubmedqa_train.csv', '/content/pubmedqa_dev.csv')
)


In [None]:
# Display the training split as the cell's output.
train_df

Unnamed: 0,pubmed_id,question,passage,answer
0,20122254,Do adherens junctions connect stress fibres be...,Endothelial cell-cell junctions maintain endot...,True
1,14501610,Does factor V Leiden mutation accelerate the o...,Smoking is consistently associated with a youn...,True
2,15596562,Does nuclear factor-kappaB repress hypoxia-ind...,Oxygen deprivation for prolonged periods of ti...,True
3,21311167,Is microvascular endothelial dysfunction in ob...,Endothelial dysfunction has recently been demo...,True
4,24405788,Is subdivision of arthropod cap-n-collar expre...,The monophyly of Mandibulata - the division of...,True
...,...,...,...,...
4795,22589267,Is mitochondrial DNA copy number in peripheral...,It has been suggested that mitochondrial dysfu...,True
4796,26309624,Does the decreased expression of miR-625 predi...,Previous study has detected the expression of ...,True
4797,25913510,Does simultaneous Removal of Third Molars Duri...,An increasing number of patients who are 30 ye...,False
4798,17090189,Does morphologic examination of sequential bon...,Nonmyeloablative stem cell transplantation (NM...,True


In [None]:
# Display the dev split as the cell's output.
dev_df

Unnamed: 0,pubmed_id,question,passage,answer
0,15927999,Does infliximab activate replication of lympho...,The reactivation of human lymphotropic herpesv...,False
1,19583679,Are polymorphisms in the neurokinin-2 receptor...,Treatment with angiotensin-converting enzyme (...,True
2,26485091,Does the use of atypical antipsychotics as adj...,Several atypical antipsychotics (AAPs) are use...,False
3,22911222,Does androgen deprivation induce senescence ch...,The treatment of non-localized prostate cancer...,True
4,22558117,Do a small molecule SMAC mimic LBW242 potentia...,Ovarian cancer remains a leading cause of deat...,True
...,...,...,...,...
1195,24001463,Is robot-assisted radical prostatectomy a safe...,We present our departmental experience with ro...,True
1196,24499054,Are lower serum levels of total cholesterol as...,"Lower serum total (TC), high-density lipoprote...",True
1197,16442976,Do comparison of electric stimulation methods ...,To compare the effect of 3 methods of electric...,True
1198,10706075,Is proliferation of dendritic cell progenitors...,A unique long term culture (LTC) system has be...,False


In [None]:
# Cast the 'answer' column of both splits to int so it can serve as a class id
# (boolean True/False become 1/0; already-integer columns are unchanged).
for labeled_df in (train_df, dev_df):
    labeled_df['answer'] = labeled_df['answer'].astype(int)

# Dataset class
class PubMedQADataset(Dataset):
    """Dataset of tokenized (question, passage) pairs with integer labels.

    Rows come from a DataFrame with 'question', 'passage' and 'answer'
    columns. Tokenization happens lazily, one row per ``__getitem__`` call.
    """

    def __init__(self, dataframe, tokenizer, max_len=512):
        """Store the source frame, the tokenizer and the target sequence length.

        Args:
            dataframe: Frame with 'question', 'passage' and 'answer' columns.
            tokenizer: Callable invoked as ``tokenizer(question, passage, ...)``
                that returns a mapping of tensors.
            max_len: Value passed to the tokenizer as ``max_length``.
        """
        self.data, self.tokenizer, self.max_len = dataframe, tokenizer, max_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and attach its label.

        Returns:
            Dict holding every tensor produced by the tokenizer with
            ``squeeze(0)`` applied, plus 'labels': a ``torch.long`` scalar
            built from the row's 'answer' value.
        """
        row = self.data.iloc[index]
        encoded = self.tokenizer(
            row['question'],
            row['passage'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt",
        )
        item = {}
        for field, tensor in encoded.items():
            # The tokenizer was asked for "pt" tensors; drop dimension 0.
            item[field] = tensor.squeeze(0)
        item['labels'] = torch.tensor(row['answer'], dtype=torch.long)
        return item


In [None]:
# Re-open the locally saved TAPT-10K folder as a two-label sequence classifier.
# The *_10k names are rebound here from the MLM objects to the classifier ones.
tapt10k_checkpoint_dir = '/content/roberta_tapt_pubmedqa_10k'
roberta_tokenizer_10k = RobertaTokenizer.from_pretrained(tapt10k_checkpoint_dir)
roberta_model_10k = RobertaForSequenceClassification.from_pretrained(
    tapt10k_checkpoint_dir,
    num_labels=2,
)
# Last expression of the cell: moves the model to `device` and shows its repr.
roberta_model_10k.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/roberta_tapt_pubmedqa_10k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Wrap both splits with the TAPT-10K tokenizer.
train_dataset, dev_dataset = (
    PubMedQADataset(split_df, roberta_tokenizer_10k)
    for split_df in (train_df, dev_df)
)

# Batches of 8: random sampling for training, sequential order for evaluation.
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    sampler=RandomSampler(train_dataset),
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=8,
    sampler=SequentialSampler(dev_dataset),
)

In [None]:
import torch
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

optimizer = AdamW(roberta_model_10k.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

best_f1 = 0
for epoch in range(3):
    print(f"\nEpoch {epoch + 1}/3")
    roberta_model_10k.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = roberta_model_10k(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(roberta_model_10k.parameters(), 1.0)
        optimizer.step()
        lr_scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Avg Train Loss: {avg_train_loss:.4f}")

    # Evaluation
    roberta_model_10k.eval()
    dev_preds, dev_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = roberta_model_10k(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            dev_preds.extend(preds.cpu().numpy())
            dev_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(dev_labels, dev_preds)
    f1 = f1_score(dev_labels, dev_preds)
    print(f"Dev Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")

    # Save best
    if f1 > best_f1:
        best_f1 = f1
        roberta_model_10k.save_pretrained('/content/roberta_pubmedqa_finetuned_10k')
        roberta_tokenizer_10k.save_pretrained('/content/roberta_pubmedqa_finetuned_10k')
        !cp -r /content/roberta_pubmedqa_finetuned_10k "{drive_path}/"
        print(f"Best model saved with F1 = {f1:.4f}")



Epoch 1/3


Training:   0%|          | 0/600 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   0%|          | 2/600 [00:01<06:48,  1.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   1%|▏         | 8/600 [00:06<07:43,  1.28it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   2%|▏         | 10/600 [00:07<07:44,  1.27it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs wi

Avg Train Loss: 0.3727


Evaluating:   1%|          | 1/150 [00:00<01:48,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   1%|▏         | 2/150 [00:00<01:05,  2.27it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   7%|▋         | 10/150 [00:02<00:34,  4.08it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy.

Dev Accuracy: 0.8792 | F1 Score: 0.9325
Best model saved with F1 = 0.9325

Epoch 2/3


Training:   0%|          | 0/600 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   0%|          | 3/600 [00:01<06:37,  1.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   1%|          | 5/600 [00:03<07:11,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   1%|▏         | 8/600 [00:05<07:28,  1.32it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs wit

Avg Train Loss: 0.3087


Evaluating:   1%|          | 1/150 [00:00<01:47,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   1%|▏         | 2/150 [00:00<01:04,  2.28it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   7%|▋         | 10/150 [00:02<00:33,  4.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy.

Dev Accuracy: 0.8867 | F1 Score: 0.9333
Best model saved with F1 = 0.9333

Epoch 3/3


Training:   0%|          | 2/600 [00:01<05:59,  1.66it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   1%|          | 4/600 [00:02<06:58,  1.43it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   1%|          | 5/600 [00:03<07:09,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training:   2%|▏         | 10/600 [00:07<07:27,  1.32it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence 

Avg Train Loss: 0.2137


Evaluating:   1%|          | 1/150 [00:00<01:48,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   1%|▏         | 2/150 [00:00<01:04,  2.28it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   7%|▋         | 10/150 [00:02<00:33,  4.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy.

Dev Accuracy: 0.8983 | F1 Score: 0.9408
Best model saved with F1 = 0.9408


In [None]:
# Save final model and tokenizer
roberta_model_10k.save_pretrained('/content/roberta_pubmedqa_finetuned_10k')
roberta_tokenizer_10k.save_pretrained('/content/roberta_pubmedqa_finetuned_10k')

# Copy to Google Drive
!cp -r /content/roberta_pubmedqa_finetuned_10k "{drive_path}/"
print("✅ Fine-tuned PubMedQA model saved to Drive.")

✅ Fine-tuned PubMedQA model saved to Drive.


In [None]:
with open('/content/pubmedqa_eval_metrics.txt', 'w') as f:
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")

!cp /content/pubmedqa_eval_metrics.txt "{drive_path}/"
print("📊 Eval metrics saved.")


📊 Eval metrics saved.
