#Install libraries

In [2]:
!pip install transformers
!pip install torch torchvision
!pip install pandas
!pip install numpy
!pip install datasets
!pip install pytorch_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, http

Import the required libraries

In [3]:
import torch
from torch.utils.data import (TensorDataset, DataLoader,
                              RandomSampler, SequentialSampler)

from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import BertForSequenceClassification
from pytorch_transformers import AdamW, WarmupLinearSchedule

from distutils.version import LooseVersion as LV

from sklearn.model_selection import train_test_split

import io

import pandas as pd
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


sns.set()

if torch.cuda.is_available():
    device = torch.device('cuda')
    devicename = '['+torch.cuda.get_device_name(0)+']'
else:
    device = torch.device('cpu')
    devicename = ""
    
print('Using PyTorch version:', torch.__version__,
      'Device:', device, devicename)
assert(LV(torch.__version__) >= LV("1.0.0"))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using PyTorch version: 1.13.1+cu116 Device: cuda [Tesla T4]


Download the IMDb and SST-2 datasets and extract them.

In [4]:
# Load the IMDB dataset
imdb_dataset = load_dataset("imdb")


# Load the SST-2 dataset
sst2_dataset = load_dataset("glue", "sst2")

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
print(imdb_dataset.column_names)

{'train': ['text', 'label'], 'test': ['text', 'label'], 'unsupervised': ['text', 'label']}


Load the IMDb dataset using pandas, and preprocess the text data by removing HTML tags, non-alphanumeric characters, and stop words.

In [6]:
# Load the IMDb dataset
train_df = pd.DataFrame(imdb_dataset['train'])
test_df = pd.DataFrame(imdb_dataset['test'])

print('\nIMDB data loaded:')
print('train:', train_df.shape)
print('test:', test_df.shape)


IMDB data loaded:
train: (25000, 2)
test: (25000, 2)


In [7]:
# Let's view some random training reviews:
train_df.sample(10)
print(train_df.dtypes)

text     object
label     int64
dtype: object


In [8]:
# Preprocess the text data
train_df ['text'] = train_df ['text'].str.replace('<.*?>', '', regex=True) # remove HTML tags
train_df ['text'] = train_df ['text'].str.replace('[^a-zA-Z0-9\s]', '', regex=True) # remove non-alphanumeric characters
stop_words = set(stopwords.words('english'))
train_df ['text'] = train_df ['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) # remove stop words


test_df ['text'] = test_df ['text'].str.replace('<.*?>', '', regex=True) # remove HTML tags
train_df ['text'] = test_df ['text'].str.replace('[^a-zA-Z0-9\s]', '', regex=True) # remove non-alphanumeric characters
stop_words = set(stopwords.words('english'))
test_df ['text'] = test_df ['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) # remove stop words

Load the SST-2 dataset using pandas, and preprocess the text data in the same way as the IMDb dataset

In [None]:
# Load the SST-2 dataset
sst2_data = pd.concat([pd.DataFrame(sst2_dataset['train']), pd.DataFrame(sst2_dataset['test'])])
sst2_data = sst2_data[['sentence', 'label']]
sst2_data = sst2_data.rename(columns={'sentence': 'text', 'label':'category'})
sst2_data = sst2_data.reset_index(drop=True)

print(sst2_data.columns)
print(sst2_data.head(5))
len(sst2_data)

Index(['text', 'category'], dtype='object')
                                                text  category
0       hide new secretions from the parental units          0
1               contains no wit , only labored gags          0
2  that loves its characters and communicates som...         1
3  remains utterly satisfied to remain the same t...         0
4  on the worst revenge-of-the-nerds clichés the ...         0


69170

In [None]:
# Preprocess the text data
sst2_data['text'] = sst2_data['text'].str.replace('<.*?>', '', regex=True) # remove HTML tags
sst2_data['text'] = sst2_data['text'].str.replace('[^a-zA-Z0-9\s]', '', regex=True) # remove non-alphanumeric characters
sst2_data['text'] = sst2_data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) # remove stop words


The token `[CLS]` is a special token required by BERT at the beginning of the sentence.

In [9]:
sentences_train = train_df.text.values
sentences_train = ["[CLS] " + s for s in sentences_train]

sentences_test = test_df.text.values
sentences_test = ["[CLS] " + s for s in sentences_test]

labels_train = train_df.label.values
labels_test  = test_df.label.values

print ("\nThe first training sentence:")
print(sentences_train[0], 'LABEL:', labels_train[0])



The first training sentence:
[CLS] I love scifi and am willing to put up with a lot Scifi moviesTV are usually underfunded underappreciated and misunderstood I tried to like this I really did but it is to good TV scifi as Babylon 5 is to Star Trek the original Silly prosthetics cheap cardboard sets stilted dialogues CG that doesnt match the background and painfully onedimensional characters cannot be overcome with a scifi setting Im sure there are those of you out there who think Babylon 5 is good scifi TV Its not Its clichd and uninspiring While US viewers might like emotion and character development scifi is a genre that does not take itself seriously cf Star Trek It may treat important issues yet not as a serious philosophy Its really difficult to care about the characters here as they are not simply foolish just missing a spark of life Their actions and reactions are wooden and predictable often painful to watch The makers of Earth KNOW its rubbish as they have to always say Gene 

Next we use the BERT tokenizer to convert the sentences into tokens
that match the data BERT was trained on.


In [10]:
BERTMODEL = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(BERTMODEL,
                                          do_lower_case=True)

tokenized_train = [tokenizer.tokenize(s) for s in sentences_train]
tokenized_test  = [tokenizer.tokenize(s) for s in sentences_test]

print ("\nThe full tokenized first training sentence:")
print (tokenized_train[0])


  0%|          | 0/231508 [00:00<?, ?B/s][A
  7%|▋         | 16384/231508 [00:00<00:02, 76421.12B/s][A
 22%|██▏       | 51200/231508 [00:00<00:01, 124635.70B/s][A
100%|██████████| 231508/231508 [00:00<00:00, 346547.60B/s]



The full tokenized first training sentence:
['[CLS]', 'i', 'love', 'sci', '##fi', 'and', 'am', 'willing', 'to', 'put', 'up', 'with', 'a', 'lot', 'sci', '##fi', 'movies', '##tv', 'are', 'usually', 'under', '##fu', '##nded', 'under', '##app', '##re', '##cia', '##ted', 'and', 'misunderstood', 'i', 'tried', 'to', 'like', 'this', 'i', 'really', 'did', 'but', 'it', 'is', 'to', 'good', 'tv', 'sci', '##fi', 'as', 'babylon', '5', 'is', 'to', 'star', 'trek', 'the', 'original', 'silly', 'pro', '##st', '##hetic', '##s', 'cheap', 'cardboard', 'sets', 'stil', '##ted', 'dialogues', 'c', '##g', 'that', 'doesn', '##t', 'match', 'the', 'background', 'and', 'painfully', 'one', '##dim', '##ens', '##ional', 'characters', 'cannot', 'be', 'overcome', 'with', 'a', 'sci', '##fi', 'setting', 'im', 'sure', 'there', 'are', 'those', 'of', 'you', 'out', 'there', 'who', 'think', 'babylon', '5', 'is', 'good', 'sci', '##fi', 'tv', 'its', 'not', 'its', 'cl', '##ich', '##d', 'and', 'un', '##ins', '##pi', '##ring', 'whi


Now we set the maximum sequence lengths for our training and test
sentences as `MAX_LEN_TRAIN` and `MAX_LEN_TEST`. The maximum length
supported by the used BERT model is 512.

The token `[SEP]` is another special token required by BERT at the
end of the sentence.

In [11]:
MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512

tokenized_train = [t[:(MAX_LEN_TRAIN-1)]+['SEP'] for t in tokenized_train]
tokenized_test  = [t[:(MAX_LEN_TEST-1)]+['SEP'] for t in tokenized_test]

print ("\nThe truncated tokenized first training sentence:")
print (tokenized_train[0])


The truncated tokenized first training sentence:
['[CLS]', 'i', 'love', 'sci', '##fi', 'and', 'am', 'willing', 'to', 'put', 'up', 'with', 'a', 'lot', 'sci', '##fi', 'movies', '##tv', 'are', 'usually', 'under', '##fu', '##nded', 'under', '##app', '##re', '##cia', '##ted', 'and', 'misunderstood', 'i', 'tried', 'to', 'like', 'this', 'i', 'really', 'did', 'but', 'it', 'is', 'to', 'good', 'tv', 'sci', '##fi', 'as', 'babylon', '5', 'is', 'to', 'star', 'trek', 'the', 'original', 'silly', 'pro', '##st', '##hetic', '##s', 'cheap', 'cardboard', 'sets', 'stil', '##ted', 'dialogues', 'c', '##g', 'that', 'doesn', '##t', 'match', 'the', 'background', 'and', 'painfully', 'one', '##dim', '##ens', '##ional', 'characters', 'cannot', 'be', 'overcome', 'with', 'a', 'sci', '##fi', 'setting', 'im', 'sure', 'there', 'are', 'those', 'of', 'you', 'out', 'there', 'who', 'think', 'babylon', '5', 'is', 'good', 'sci', '##fi', 'tv', 'its', 'not', 'its', 'cl', '##ich', '##d', 'and', 'un', '##ins', '##pi', '##ring',


Next we use the BERT tokenizer to convert each token into an integer
index in the BERT vocabulary. We also pad any shorter sequences to
`MAX_LEN_TRAIN` or `MAX_LEN_TEST` indices with trailing zeros.

In [12]:
ids_train = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_train]
ids_train = np.array([np.pad(i, (0, MAX_LEN_TRAIN-len(i)),
                             mode='constant') for i in ids_train])

ids_test = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_test]
ids_test = np.array([np.pad(i, (0, MAX_LEN_TEST-len(i)),
                            mode='constant') for i in ids_test])

print ("\nThe indices of the first training sentence:")
print (ids_train[0])


The indices of the first training sentence:
[  101  1045  2293 16596  8873  1998  2572  5627  2000  2404  2039  2007
  1037  2843 16596  8873  5691  9189  2024  2788  2104 11263 25848  2104
 29098  2890  7405  3064  1998 28947  1045  2699  2000  2066  2023  1045
  2428  2106  2021  2009  2003  2000  2204  2694 16596  8873  2004 17690
  1019  2003  2000  2732 10313  1996  2434 10021  4013  3367 20086  2015
 10036 19747  4520 25931  3064 22580  1039  2290  2008  2987  2102  2674
  1996  4281  1998 16267  2028 22172  6132 19301  3494  3685  2022  9462
  2007  1037 16596  8873  4292 10047  2469  2045  2024  2216  1997  2017
  2041  2045  2040  2228 17690  1019  2003  2204 16596  8873  2694  2049
  2025  2049 18856  7033  2094  1998  4895  7076  8197  4892  2096  2149
  7193  2453  2066  7603  1998  2839  2458   100]


BERT also requires *attention masks*, with 1 for each real token in
the sequences and 0 for the padding:

In [13]:
amasks_train, amasks_test = [], []

for seq in ids_train:
  seq_mask = [float(i>0) for i in seq]
  amasks_train.append(seq_mask)

for seq in ids_test:
  seq_mask = [float(i>0) for i in seq]
  amasks_test.append(seq_mask)

We use scikit-learn's train_test_split() to use 10% of our training
data as a validation set, and then convert all data into
torch.tensors.

In [14]:
(train_inputs, validation_inputs,
 train_labels, validation_labels) = train_test_split(ids_train, labels_train,
                                                     random_state=42,
                                                     test_size=0.1)
(train_masks, validation_masks,
 _, _) = train_test_split(amasks_train, ids_train,
                          random_state=42, test_size=0.1)

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks  = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks  = torch.tensor(validation_masks)
test_inputs = torch.tensor(ids_test)
test_labels = torch.tensor(labels_test)
test_masks  = torch.tensor(amasks_test)


Next we create PyTorch *DataLoader*s for all data sets.
For fine-tuning BERT on a specific task, the authors recommend a
batch size of 16 or 32.

In [15]:
BATCH_SIZE = 32

print('\nDatasets:')
print('Train: ', end="")
train_data = TensorDataset(train_inputs, train_masks,
                           train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
                              batch_size=BATCH_SIZE)
print(len(train_data), 'reviews')

print('Validation: ', end="")
validation_data = TensorDataset(validation_inputs, validation_masks,
                                validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=BATCH_SIZE)
print(len(validation_data), 'reviews')

print('Test: ', end="")
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler,
                             batch_size=BATCH_SIZE)
print(len(test_data), 'reviews')


Datasets:
Train: 22500 reviews
Validation: 2500 reviews
Test: 25000 reviews


BERT MODEL INITIALIZATION

We now load a pretrained BERT model with a single linear
classification layer added on top.


In [16]:
model = BertForSequenceClassification.from_pretrained(BERTMODEL,
                                                      num_labels=2)

model.cuda()
print('\nPretrained BERT model "{}" loaded'.format(BERTMODEL))

100%|██████████| 433/433 [00:00<00:00, 90584.75B/s]
100%|██████████| 440473133/440473133 [00:38<00:00, 11553603.76B/s]



Pretrained BERT model "bert-base-uncased" loaded



We set the remaining hyperparameters needed for fine-tuning the
pretrained model: 
 * EPOCHS: the number of training epochs in fine-tuning
   (recommended values between 2 and 4) 
 * WEIGHT_DECAY: weight decay for the Adam optimizer 
 * LR: learning rate for the Adam optimizer 
   (2e-5 to 5e-5 recommended) 
 * WARMUP_STEPS: number of warmup steps to (linearly) reach the
   set learning rate

 We also need to grab the training parameters from the pretrained
 model.

In [17]:
EPOCHS = 4
WEIGHT_DECAY = 0.01
LR = 2e-5
WARMUP_STEPS =int(0.2*len(train_dataloader))

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': WEIGHT_DECAY},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS,
                                 t_total=len(train_dataloader)*EPOCHS)

LEARNING

Let's now define functions to train() and evaluate() the model:

In [18]:
def train(epoch, loss_vector=None, log_interval=200):
  # Set model to training mode
  model.train()

  # Loop over each batch from the training set
  for step, batch in enumerate(train_dataloader):

    # Copy data to GPU if needed
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Zero gradient buffers
    optimizer.zero_grad()

    # Forward pass
    outputs = model(b_input_ids, token_type_ids=None,
                    attention_mask=b_input_mask, labels=b_labels)

    loss = outputs[0]
    if loss_vector is not None:
        loss_vector.append(loss.item())

    # Backward pass
    loss.backward()

    # Update weights
    scheduler.step()
    optimizer.step()

    if step % log_interval == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, step * len(b_input_ids),
                len(train_dataloader.dataset),
                100. * step / len(train_dataloader), loss))

def evaluate(loader):
  model.eval()

  n_correct, n_all = 0, 0

  for batch in loader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)
      logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    predictions = np.argmax(logits, axis=1)

    labels = b_labels.to('cpu').numpy()
    n_correct += np.sum(predictions == labels)
    n_all += len(labels)

  print('Accuracy: [{}/{}] {:.4f}'.format(n_correct, n_all,
                                          n_correct/n_all))

Now we are ready to train our model using the train()
function. After each epoch, we evaluate the model using the
validation set and evaluate().

In [None]:
train_lossv = []
for epoch in range(1, EPOCHS + 1):
    print()
    train(epoch, train_lossv)
    print('\nValidation set:')
    evaluate(validation_dataloader)




	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1420.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)



Validation set:
Accuracy: [2205/2500] 0.8820


Validation set:
Accuracy: [2231/2500] 0.8924



Let's take a look at our training loss over all batches:

In [None]:
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_lossv, label='original')
plt.plot(np.convolve(train_lossv, np.ones(101), 'same') / 101,
         label='averaged')
plt.legend(loc='best')
plt.savefig("training-loss.png")

Inference

For a better measure of the quality of the model, let's see the
model accuracy for the test reviews.

In [None]:
print('\nTest set:')
evaluate(test_dataloader)

# eof

Once training is complete, we can evaluate the model on the SST2 dataset:

Finally, we'll save the hidden layers and trained values of the model:

In [None]:
# Save hidden layers and trained values
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'hidden_layers': model.encoder.layer[-1].output_hidden_states,
    'trained_values': model.pooler.dense.weight
}, 'path/to/save/model.pt')
