In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

dataset = load_dataset("ImperialCollegeLondon/health_fact")
tokenizer = AutoTokenizer.from_pretrained("nbroad/bigbird-base-health-fact")

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 9832
    })
    test: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 1235
    })
    validation: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 1225
    })
})

In [None]:
def preprocess_function(examples):
    inputs = [claim + " " + explanation for claim, explanation in zip(examples["claim"], examples["explanation"])]
    return tokenizer(
        inputs, 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )

def add_labels(examples):
    return {"labels": examples["label"]}

In [None]:
tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_validation = dataset["validation"].map(preprocess_function, batched=True)
tokenized_test = dataset["test"].map(preprocess_function, batched=True)


tokenized_train = tokenized_train.map(add_labels, batched=True)
tokenized_validation = tokenized_validation.map(add_labels, batched=True)
tokenized_test = tokenized_test.map(add_labels, batched=True)

Map: 100%|██████████| 9832/9832 [00:01<00:00, 8826.12 examples/s]
Map: 100%|██████████| 1225/1225 [00:00<00:00, 8585.46 examples/s]
Map: 100%|██████████| 1235/1235 [00:00<00:00, 8415.96 examples/s]
Map: 100%|██████████| 9832/9832 [00:00<00:00, 359266.43 examples/s]
Map: 100%|██████████| 1225/1225 [00:00<00:00, 176279.63 examples/s]
Map: 100%|██████████| 1235/1235 [00:00<00:00, 185018.59 examples/s]


In [17]:
tokenized_train[2]

{'claim_id': '11358',
 'claim': 'SBRT Offers Prostate Cancer Patients High Cancer Control and Low Toxicity in Fewer Treatments',
 'date_published': 'September 28, 2016',
 'explanation': 'This news release describes five-year outcomes for 309 men with early-stage prostate cancer who received stereotactic body radiation therapy (SBRT), which delivers targeted doses of radiation cheaper and faster than the prevailing radiation therapy treatment for prostate cancer. The study measured the rates of severe injury to surrounding tissues and disease-free survival. The news release said it’s the first large, multi-institutional study of this technology in prostate cancer with long-term follow-up, involving patients at 21 community, regional, and academic hospitals across the U.S. The release does a good job of quantifying the evidence but could have helped readers better understand the implications of the data by giving cost and safety comparisons with other radiation treatment, and by discussi

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "nbroad/bigbird-base-health-fact",
    num_labels=4
)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "nbroad/bigbird-base-health-fact",
    num_labels=4 
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(
  0%|          | 0/308 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
  0%|          | 1/308 [00:39<3:19:48, 39.05s/it]

KeyboardInterrupt: 

In [None]:
trainer.save_model("./fine_tuned_bigbird_health_fact")
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print(test_results)
