In [286]:
### Load the Dataset and Model
model_checkpoint = "bert-base-uncased"

## Loading the dataset
from datasets import load_dataset, ClassLabel, Metric
from evaluate import load
dataset = load_dataset("pubmed_qa", name="pqa_labeled")

features = dataset['train'].features.copy()
features['final_decision'] = ClassLabel(3, ["yes","no", "maybe"])
dataset['train'] = dataset['train'].cast(features)
dataset = dataset.rename_column('final_decision','label')
metric: Metric = load("f1")

## Loading the model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Found cached dataset pubmed_qa (/Users/ellington/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)
100%|██████████| 1/1 [00:00<00:00, 120.82it/s]
Loading cached processed dataset at /Users/ellington/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924/cache-8bf389753f3146a2.arrow
loading configuration file config.json from cache at /Users/ellington/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_em

In [287]:
# demonstrating that our dataset has no null values

import pandas as pd

ds_df = pd.DataFrame({
    'question': dataset['train']['question'],
    'context' : dataset['train']['context']
})
ds_df = ds_df.convert_dtypes(infer_objects=True)
ds_na = ds_df[ds_df.isnull().any(axis=1)]
ds_na

Unnamed: 0,question,context


In [380]:
def preprocess_with_long_answer(examples):
    return tokenizer(
        examples["question"],
        examples["long_answer"],
        truncation=True,
        padding=True,
    )

def preprocess_with_context(examples):
    question = examples['question']
    context = examples['context.contexts']
    
    # Combine context sentences into a single string
    context_strs = [' '.join(context_str) for context_str in context]
    
    # Tokenize inputs with overlap
    return tokenizer(
        question,
        context_strs,
        padding='max_length',
        truncation=True,
        max_length=512,
        stride=256,
        return_tensors='pt'
    )


encoded_reasoning_required = dataset.flatten().map(preprocess_with_context, batched=True)
encoded_reasoning_free = dataset.map(preprocess_with_long_answer, batched=True)

encoded_dataset = encoded_reasoning_required
# encoded_dataset = encoded_reasoning_free

Loading cached processed dataset at /Users/ellington/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924/cache-ce9a785483dcb7af.arrow


In [366]:
from datasets import DatasetDict
train_valid = encoded_dataset['train'].train_test_split(test_size=.5)

train_test = train_valid['train'].train_test_split(test_size=.1)
train_test_valid_dataset = DatasetDict({
    'train':train_test['train'],
    'test':train_test['test'],
    'validation':train_valid['test']
})
# train_test_valid_dataset = train_test_valid_dataset.remove_columns(('context.contexts', 'context.labels', 'context.meshes', 'context.reasoning_required_pred', 'context.reasoning_free_pred', 'long_answer', 'pubid', 'question'))
train_test_valid_dataset

Loading cached split indices for dataset at /Users/ellington/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924/cache-dbe64cf34e928c5f.arrow and /Users/ellington/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924/cache-036d862f973f5849.arrow
Loading cached split indices for dataset at /Users/ellington/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924/cache-049cdee58ce41ff8.arrow and /Users/ellington/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924/cache-6c1dbcc4aae5adbb.arrow


DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'long_answer', 'label', 'context.contexts', 'context.labels', 'context.meshes', 'context.reasoning_required_pred', 'context.reasoning_free_pred', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['pubid', 'question', 'long_answer', 'label', 'context.contexts', 'context.labels', 'context.meshes', 'context.reasoning_required_pred', 'context.reasoning_free_pred', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
    validation: Dataset({
        features: ['pubid', 'question', 'long_answer', 'label', 'context.contexts', 'context.labels', 'context.meshes', 'context.reasoning_required_pred', 'context.reasoning_free_pred', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})

In [367]:
## Fine-tuning the model
from transformers import TrainingArguments, Trainer, IntervalStrategy
import numpy as np
# To instantiate a `Trainer`, we will need to define two more things.
# The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments),
# which is a class that contains all the attributes to customize the
# training. It requires one folder name, which will be used to save
# the checkpoints of the model, and all other arguments are optional:

metric_name = "f1"
i = -2 if model_checkpoint.endswith('/') else -1
model_name = model_checkpoint.split("/")[i]
batch_size = 32

args = TrainingArguments(
    f"{model_name}-finetuned-pqa-l",
    evaluation_strategy = IntervalStrategy.EPOCH,
    do_eval=True,
    save_strategy = IntervalStrategy.EPOCH,
    logging_strategy=IntervalStrategy.EPOCH,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=False,
    metric_for_best_model="f1",
    push_to_hub=False,
)

# Here we set the evaluation to be done at the end of each epoch, tweak the
# learning rate, use the `batch_size` defined at the top of the script and
# customize the number of epochs for training, as well as the weight decay.
# Since the best model might not be the one at the end of training, we ask the
# `Trainer` to load the best model it saved (according to `metric_name`) at the
# end of training.
# The last thing to define for our `Trainer` is how to compute the metrics from
# the predictions. We need to define a function for this, which will just use
# the `metric` we loaded earlier, the only preprocessing we have to do is to
# take the argmax of our predicted logits
from transformers import EvalPrediction

def compute_metrics(eval_pred: EvalPrediction):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels, average='micro')


# Then we just need to pass all of this along with our datasets to the `Trainer`:
trainer = Trainer(
    model,
    args,
    train_dataset=train_test_valid_dataset["train"],
    eval_dataset=train_test_valid_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# for batch in trainer.get_eval_dataloader(train_test_valid_dataset["validation"]):
    # print(batch)
    # break

print(trainer.evaluate(train_test_valid_dataset["test"]))
# We can now finetune our model by just calling the `train` method:
# trainer.train()

# We can check with the `evaluate` method that our `Trainer` did
# reload the best model properly (if it was not the last one):
# trainer.evaluate()

# Testing and printing results
# print(trainer.predict(test_dataset=train_test_valid_dataset["valid"]).metrics)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: question, pubid, long_answer, context.reasoning_required_pred, context.labels, context.contexts, context.reasoning_free_pred, context.meshes. If question, pubid, long_answer, context.reasoning_required_pred, context.labels, context.contexts, context.reasoning_free_pred, context.meshes are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 50
  Batch size = 32


ValueError: too many values to unpack (expected 2)