# Prepare Environment

In [None]:
# -*- coding: utf-8 -*-
"""
@author: Yu Jihan
"""

import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AdamW, get_scheduler
!pip install datasets
from datasets import load_dataset, load_metric


# Pick the compute device: first CUDA GPU when one is visible, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Device available:', device)

# Loading a GLUE Dataset (configured in this notebook: CoLA, SST-2, MRPC, STS-B; QQP, MNLI, QNLI, RTE are not yet set up)

In [21]:
# --- Run configuration -------------------------------------------------------
# GLUE tasks this notebook is set up to handle.
GLUE_tasks = ['cola', 'sst2', 'mrpc', 'stsb']
task = 'cola'
# Fail fast when `task` is not one of the supported tasks, instead of loading
# a dataset that the later per-task column lookup cannot handle.
assert task in GLUE_tasks, f"Unsupported GLUE task {task!r}; expected one of {GLUE_tasks}"

checkpoint = 'bert-base-uncased'   # pre-trained model / tokenizer identifier
batch_size = 32
learning_rate = 3e-5
epoch = 5

# --- Data and metric ---------------------------------------------------------
dataset = load_dataset('glue', task)
metric = load_metric('glue', task)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})
{'accuracy': 1.0, 'f1': 1.0}


In [22]:
# Second handle on the same GLUE task: `raw_dataset` feeds the tokenisation
# step, `raw_metric` is a separate metric instance for quick checks.
raw_dataset = load_dataset(path='glue', name=task)
raw_metric = load_metric(path='glue', config_name=task)

# Show the column schema of the training split
raw_dataset['train'].features

{'matthews_correlation': 1.0}


{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['unacceptable', 'acceptable'], id=None),
 'idx': Value(dtype='int32', id=None)}

# Tokenizer and Data Collator

In [9]:
# Fast tokenizer matching the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

# Collator that pads each batch dynamically, using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Column names holding the text for each task: (first sentence, second sentence).
# The second entry is None for tasks with a single sentence per example.
_SINGLE_SENTENCE = ("sentence", None)
_SENTENCE_PAIR = ("sentence1", "sentence2")

task_to_keys = {
    "cola": _SINGLE_SENTENCE,
    "mrpc": _SENTENCE_PAIR,
    "sst2": _SINGLE_SENTENCE,
    "stsb": _SENTENCE_PAIR,
}

In [11]:
# Resolve the text column(s) for the selected task and preview the first
# training example.
sentence1_key, sentence2_key = task_to_keys[task]
first_example = dataset['train'][0]
if sentence2_key is not None:
    print(f"Sentence 1: {first_example[sentence1_key]}")
    print(f"Sentence 2: {first_example[sentence2_key]}")
else:
    print(f"Sentence: {first_example[sentence1_key]}")

Sentence 1: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
Sentence 2: Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .


In [12]:
# Tokenisation callback used with `Dataset.map`
def tokenize_function(examples):
    """Tokenise the text column(s) of `examples` with truncation enabled.

    Reads the notebook-level `sentence1_key` / `sentence2_key`: when the
    second key is None only the first column is tokenised, otherwise both
    columns are passed to the tokenizer together.
    """
    text_columns = (examples[sentence1_key],)
    if sentence2_key is not None:
        text_columns += (examples[sentence2_key],)
    return tokenizer(*text_columns, truncation=True)

In [14]:
# Tokenise every split, drop the columns the model does not consume, rename
# the target column to `labels`, and have the dataset return torch tensors.
text_columns = [key for key in (sentence1_key, sentence2_key) if key is not None]
tokenized_datasets = (
    raw_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["idx", *text_columns])
    .rename_column("label", "labels")
    .with_format("torch")
)
tokenized_datasets["train"].column_names


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [16]:
def _build_loader(split_name, shuffle=False, loader_batch_size=8):
    """Wrap one tokenised split in a DataLoader that pads per batch."""
    return DataLoader(tokenized_datasets[split_name],
                      shuffle=shuffle,
                      batch_size=loader_batch_size,
                      collate_fn=data_collator)


# NOTE(review): these loaders use a batch size of 8 while the configuration
# cell defines batch_size = 32 - confirm which value is intended.
train_data = _build_loader("train", shuffle=True)   # only training is shuffled
validation_data = _build_loader("validation")
test_data = _build_loader("test")

In [17]:
# Sanity check of the preprocessing: print the tensor shapes of one training batch
for batch in train_data:
    for name, tensor in batch.items():
        print('{:>20} : {}'.format(name, tensor.shape))
    break  # one batch is enough

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


              labels : torch.Size([8])
           input_ids : torch.Size([8, 80])
      token_type_ids : torch.Size([8, 80])
      attention_mask : torch.Size([8, 80])


# Fine-tuning BERT

In [18]:
# One output for STS-B (presumably treated as regression - confirm), two
# labels for every other task.
if task == "stsb":
    num_labels = 1
else:
    num_labels = 2

# Load the pre-trained checkpoint with a sequence-classification head of that size
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Smoke-test the metric object on a tiny, perfectly predicted example.
# Real usage is the same call with the model's predictions and the gold labels.
predictions = [0, 1]
references = [0, 1]
results = raw_metric.compute(references=references, predictions=predictions)
print(results)

{'accuracy': 1.0, 'f1': 1.0}


In [57]:
# Metric used to pick the best checkpoint; tasks not listed fall back to accuracy.
_metric_by_task = {
    "stsb": "spearmanr",
    "cola": "matthews_correlation",
    "mrpc": "f1",
}
metric_name = _metric_by_task.get(task, "accuracy")
model_name = checkpoint.split("/")[-1]  # drop any "org/" prefix from the checkpoint id

# NOTE(review): this cell was last saved with an ImportError - presumably a
# missing optional dependency of TrainingArguments; confirm the environment.
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    # Take the learning rate from the configuration cell so the two cannot drift apart.
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): kept as a literal; the notebook-level `epoch` may have been
    # rebound by the time this cell runs - confirm before wiring it in here.
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
)

ImportError: ignored

In [56]:
# Number of passes over the training set for the manual loop, and the total
# number of optimizer steps the linear schedule is spread over.
EPOCHS = 3
NUM_TRAINING_STEPS = EPOCHS * len(train_data)
print(NUM_TRAINING_STEPS)

# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour
# of it. `eps` and `weight_decay` are set to the defaults of the deprecated
# class so the optimisation setup stays close to the previous one.
# Learning rates of interest: {3e-5, 3e-4, 3e-3}; the value comes from the
# configuration cell.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=learning_rate,
                              eps=1e-6,
                              weight_decay=0.0)
lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=NUM_TRAINING_STEPS
                            )

model.to(device)
device

3207




device(type='cpu')

In [None]:
# Manual fine-tuning loop: one optimizer + scheduler step per mini-batch.
progress_bar = tqdm(range(NUM_TRAINING_STEPS))

model.train()
# The pass counter is deliberately unnamed: calling it `epoch` would overwrite
# the notebook-level `epoch` hyperparameter.
for _ in range(EPOCHS):
    for batch in train_data:
        # Move every tensor of the batch to the model's device
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3207 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Evaluate on the validation split with a fresh metric instance for the
# configured task.
metric = load_metric("glue", task)

model.eval()
# `validation_data` is the validation DataLoader built earlier in the notebook.
for batch in validation_data:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**batch)
    logits = outputs.logits
    # NOTE(review): argmax assumes a classification head; a single-output
    # (num_labels == 1) model would need the raw logits instead - confirm.
    preds = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=preds, references=batch['labels'])
metric.compute()

# Prediction

In [None]:
# Make predictions on the test split; `preds` collects one tensor of
# predicted class ids per batch.
preds = []
model.eval()
for batch in test_data:
    # Feed only the input tensors. The labels are left out, so no loss is
    # computed and no placeholder labels have to be fabricated.
    # (Presumably the test split carries no usable labels - confirm.)
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    yhat = torch.argmax(logits, dim=-1)
    preds.append(yhat)

In [None]:
# Display the collected predictions: a list with one tensor of predicted class ids per test batch
preds