# Prepare Environment

In [1]:
# Make necessary imports

# for array operations
import numpy as np
# PyTorch framework
import torch
# plotting
from matplotlib import pyplot as plt
# reproducibility
import random
# to watch progress
from tqdm.auto import tqdm

# HuggingFace ecosystem
# tokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding
# model
from transformers import AutoModelForSequenceClassification
# optimizer, lr-scheduler
from transformers import AdamW, get_scheduler
# dataset
!pip install datasets
from datasets import load_dataset, load_metric

# One shared seed for every random number generator used in this notebook
SEED = 42
# seed NumPy, PyTorch and the stdlib generator, in that order
for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    seed_fn(SEED)

# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Device available:', device)

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Device available: cpu


# Loading a GLUE Dataset: CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE

In [2]:
# GLUE configuration names matching the tasks listed in the section header
GLUE_tasks = ['cola', 'sst2', 'mrpc', 'stsb', 'qqp', 'mnli', 'qnli', 'rte']
# the task whose dataset and metric are loaded below
task = 'cola'
BERT_model = 'bert-large-uncased' # BERT_LARGE model
# NOTE(review): the DataLoader cell hard-codes batch_size=8 and never reads
# this value — confirm which batch size is intended
batch_size = 32

# load every split of the selected task together with its GLUE metric
dataset = load_dataset('glue', task)
metric = load_metric('glue', task)

# show the available splits, their columns and row counts
print(dataset)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/377k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

  metric = load_metric('glue', task)


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})


In [17]:
# Reuse the training split that was already loaded for `task` instead of
# loading the dataset a second time under a hard-coded task name, so this
# cell keeps following whatever `task` is configured.
train_dataset = dataset['train']

# column names and types of the training split
train_dataset.features

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})


{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['unacceptable', 'acceptable'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [4]:
# Pull the raw text column and the label column out of the training split
sentences, labels = (train_dataset[column] for column in ('sentence', 'label'))

# Tokenizer and Data Collator

In [5]:
# Tokenizer that belongs to the chosen BERT checkpoint (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(
    BERT_model,
    use_fast=True,
)

# Collator that pads every batch on the fly instead of padding the whole dataset up front
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
# tokenization callback handed to `dataset.map`
def Tokenize_function(example):
    """Tokenize the 'sentence' field of `example` with the notebook's tokenizer.

    `example` is indexed with the key 'sentence'; the value is passed to the
    module-level `tokenizer` with truncation enabled and no padding (padding is
    left to the data collator). Returns whatever the tokenizer call returns.
    """
    text = example['sentence']
    encoded = tokenizer(text, truncation=True)
    return encoded

In [7]:
# Apply the tokenizer to every split, passing examples to the callback in batches
tokenized_data = dataset.map(
    function=Tokenize_function,
    batched=True,
)

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [8]:
# Each batch is later unpacked straight into the model call, so drop the
# columns that are not model inputs and rename 'label' to 'labels'.
# Both steps are guarded so the cell can be re-run safely: without the guards
# a second execution fails because the columns are already removed/renamed.
present_columns = tokenized_data['train'].column_names
unused_columns = [col for col in ('idx', 'sentence') if col in present_columns]
if unused_columns:
    tokenized_data = tokenized_data.remove_columns(unused_columns)
if 'label' in present_columns:
    tokenized_data = tokenized_data.rename_column('label', 'labels')
# return PyTorch tensors when the splits are indexed
tokenized_data.set_format('pt')
tokenized_data["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [9]:
def _make_loader(split, shuffle=False):
    """Wrap one tokenized split in a DataLoader that collates with `data_collator`."""
    # NOTE(review): the config cell defines batch_size = 32, but 8 is
    # hard-coded here — confirm which value is intended
    return torch.utils.data.DataLoader(
        tokenized_data[split],
        shuffle=shuffle,
        batch_size=8,
        collate_fn=data_collator,
    )

# only the training split is shuffled; validation and test keep dataset order
train_data = _make_loader('train', shuffle=True)
val_data = _make_loader('validation')
test_data = _make_loader('test')

In [10]:
# Sanity-check the preprocessing: fetch a single collated training batch and
# print the shape of every tensor in it. `batch` is kept around because the
# model smoke test further down reuses it.
batch = next(iter(train_data))
for field, tensor in batch.items():
    print('{:>20} : {}'.format(field, tensor.shape))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


              labels : torch.Size([8])
           input_ids : torch.Size([8, 14])
      token_type_ids : torch.Size([8, 14])
      attention_mask : torch.Size([8, 14])


# Fine-tuning BERT

In [13]:
# Load the pre-trained BERT checkpoint with a sequence-classification head
# that has two output classes
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_model,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Smoke-test one forward pass on the sample batch. No backward pass follows,
# so run it without autograd instead of keeping the graph of a BERT-large
# forward pass alive in `outputs`.
with torch.no_grad():
    outputs = model(**batch)
# the loss is returned because the batch carries 'labels'; logits are (batch, num_labels)
print(outputs.loss, outputs.logits.shape)

tensor(0.7186, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [15]:
EPOCHS = 3
# one optimizer/scheduler step per training batch, for every epoch
NUM_TRAINING_STEPS = EPOCHS * len(train_data)
print(NUM_TRAINING_STEPS)

# Move the model to its target device *before* constructing the optimizer,
# the order recommended by the PyTorch optimizer documentation.
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are spelled out to match the defaults of the transformers
# implementation (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=3e-5, # args.learning_rate = {3e−5 , 3e−4 , 3e−3}
                              eps=1e-6,
                              weight_decay=0.0
                             )
# linear decay of the learning rate to zero over all training steps, no warm-up
lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=NUM_TRAINING_STEPS
                            )

device

3207




device(type='cpu')

In [16]:
# Fine-tune for EPOCHS passes over the training loader.
model.train()
# The bar is only ever advanced by hand, so give it a total rather than an
# iterable, and manage it with `with` so it is closed even when the (long)
# run is interrupted.
with tqdm(total=NUM_TRAINING_STEPS) as progress_bar:
    for epoch in range(EPOCHS):
        for batch in train_data:
            # move every tensor of the batch to the model's device
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # clear gradients so they do not accumulate into the next step
            optimizer.zero_grad()
            progress_bar.update(1)

  0%|          | 0/3207 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Load the metric for the configured task (previously hard-coded to "cola");
# reloading gives this evaluation run its own metric object.
metric = load_metric("glue", task)

# Score the model on the validation split.
model.eval()
for batch in val_data:
    batch = {k: v.to(device) for k, v in batch.items()}
    # inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    # predicted class = index of the largest logit
    preds = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=preds, references=batch['labels'])
# aggregate over all accumulated batches; displayed as the cell result
metric.compute()

# Prediction

In [None]:
# Predict a class for every example of the test split; `preds` collects one
# tensor of predicted class indices per batch.
preds = []
model.eval()
for batch in test_data:
    # The test labels play no part in prediction (the previous version
    # overwrote them with dummy ones only so the forward pass could compute a
    # loss). Leave them out so the model just returns logits.
    batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    # predicted class = index of the largest logit
    yhat = torch.argmax(logits, dim=-1)
    preds.append(yhat)

In [None]:
# display the list of per-batch prediction tensors collected from the test loader
preds