#Install libraries

In [1]:
!pip install transformers
!pip install torch torchvision
!pip install pandas
!pip install numpy
!pip install datasets
!pip install pytorch_transformers
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import the required libraries

In [4]:
import torch
from torch.utils.data import (TensorDataset, DataLoader,
                              RandomSampler, SequentialSampler)

from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import BertForSequenceClassification
from pytorch_transformers import AdamW, WarmupLinearSchedule

from distutils.version import LooseVersion as LV

from sklearn.model_selection import train_test_split

import io

import pandas as pd
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset

import tensorflow_datasets as tfds

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


sns.set()

if torch.cuda.is_available():
    device = torch.device('cuda')
    devicename = '['+torch.cuda.get_device_name(0)+']'
else:
    device = torch.device('cpu')
    devicename = ""
    
print('Using PyTorch version:', torch.__version__,
      'Device:', device, devicename)
assert(LV(torch.__version__) >= LV("1.0.0"))


Using PyTorch version: 1.13.1+cu116 Device: cpu 


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Download the IMDb and SST-2 datasets and extract them.

In [5]:
# Load the IMDB dataset
imdb_dataset = load_dataset("imdb")


# Load the SST-2 dataset
sst2_dataset = load_dataset("glue", "sst2")



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Load the IMDb dataset using pandas, and preprocess the text data by removing HTML tags, non-alphanumeric characters, and stop words.

In [37]:
print(imdb_dataset.column_names)

{'train': ['text', 'label'], 'test': ['text', 'label'], 'unsupervised': ['text', 'label']}


In [38]:
# Load the IMDb dataset
imdb_train_df = pd.DataFrame(imdb_dataset['train'])
imdb_train_df = imdb_train_df.reset_index(drop=True)
imdb_test_df = pd.DataFrame(imdb_dataset['test'])
imdb_test_df = imdb_test_df.reset_index(drop=True)

print('\nIMDB data loaded:')
print('train:', imdb_train_df.shape)
print('test:', imdb_test_df.shape)
print(imdb_train_df['label'].unique())
print(imdb_test_df['label'].unique())


IMDB data loaded:
train: (25000, 2)
test: (25000, 2)
[0 1]
[0 1]


In [39]:
# Let's view some random reviews:
print(imdb_train_df.sample(5))
print(imdb_test_df.sample(5))

                                                    text  label
1676   It Could Have Been A Marvelous Story Based On ...      0
15892  As with all environmentally aware films from t...      1
17981  This is one of the best of the genre. I saw it...      1
10490  This is one of those films that I could only s...      0
22783  Outragously entertaining period piece set in t...      1
                                                    text  label
23595  This film is worthwhile despite what you may h...      1
6000   This movie could have been oh so much better. ...      0
6183   like in so many movies of the past, you would ...      0
22590  Maybe I'm reading into this too much, but I wo...      1
5293   I was duped into watching this by the many fri...      0


In [40]:
# Preprocess the text data
imdb_train_df ['text'] = imdb_train_df ['text'].str.replace('<.*?>', '', regex=True) # remove HTML tags
imdb_train_df ['text'] = imdb_train_df ['text'].str.replace('[^a-zA-Z0-9\s]', '', regex=True) # remove non-alphanumeric characters
stop_words = set(stopwords.words('english'))
imdb_train_df ['text'] = imdb_train_df ['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) # remove stop words


imdb_test_df ['text'] = imdb_test_df ['text'].str.replace('<.*?>', '', regex=True) # remove HTML tags
imdb_train_df ['text'] = imdb_test_df ['text'].str.replace('[^a-zA-Z0-9\s]', '', regex=True) # remove non-alphanumeric characters
stop_words = set(stopwords.words('english'))
imdb_test_df ['text'] = imdb_test_df ['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) # remove stop words

Load the SST-2 dataset using pandas, and preprocess the text data in the same way as the IMDb dataset

In [41]:
print(sst2_dataset.column_names)

{'train': ['sentence', 'label', 'idx'], 'validation': ['sentence', 'label', 'idx'], 'test': ['sentence', 'label', 'idx']}


In [46]:
# Load the SST-2 dataset
sst2_train_df = pd.DataFrame(sst2_dataset['train'])[['sentence', 'label']]
sst2_train_df = sst2_train_df.rename(columns={'sentence': 'text'})
sst2_train_df = sst2_train_df.reset_index(drop=True)

sst2_test_df = pd.DataFrame(sst2_dataset['validation'])[['sentence', 'label']]
sst2_test_df = sst2_test_df.rename(columns={'sentence': 'text'})
sst2_test_df = sst2_test_df.reset_index(drop=True)

#sst2_test_df =pd.concat([pd.DataFrame(sst2_dataset['test'])[['sentence', 'label']], pd.DataFrame(sst2_dataset['validation'])[['sentence', 'label']]])

print('\nSST2 data loaded:')
print('train:', sst2_train_df.shape)
print('test:', sst2_test_df.shape)
print(sst2_train_df['label'].unique())
print(sst2_test_df['label'].unique())


SST2 data loaded:
train: (67349, 2)
test: (872, 2)
[0 1]
[1 0]


Unbalanced set. Replace default split by train_test_split

In [50]:
# Load your dataframe
df = pd.concat([sst2_train_df,sst2_test_df])

# Define your features and target variable
X = df.drop("label", axis=1)
y = df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the train and test sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

sst2_train_df = pd.concat([X_train,y_train], axis=1)
sst2_test_df = pd.concat([X_test,y_test], axis=1)

print('\nSST2 data re splitted:')
print('train:', sst2_train_df.shape)
print('test:', sst2_test_df.shape)
print(sst2_train_df['label'].unique())
print(sst2_test_df['label'].unique())


X_train shape: (54576, 1)
X_test shape: (13645, 1)
y_train shape: (54576,)
y_test shape: (13645,)

SST2 data re splitted:
train: (54576, 2)
test: (13645, 2)
[1 0]
[1 0]


In [52]:
# Let's view some random reviews:
print(sst2_train_df.sample(5))
print(sst2_test_df.sample(5))

                                                    text  label
3689                        is far from disappointing ,       1
1910   not quite as miraculous as its dreamworks make...      1
45776                 fails to spark this leaden comedy       0
34412  are jarring and deeply out of place in what co...      0
41125                                   often funny way       1
                                                    text  label
25722                   is the central flaw of the film       0
52315         is a likable story , told with competence       1
36042  robinson 's web of suspense matches the page-t...      1
38595  we 're touched by the film 's conviction that ...      1
13970        flat-out amusing , sometimes endearing and       1


In [53]:
# Preprocess the text data
sst2_train_df['text'] = sst2_train_df['text'].str.replace('<.*?>', '', regex=True) # remove HTML tags
sst2_train_df['text'] = sst2_train_df['text'].str.replace('[^a-zA-Z0-9\s]', '', regex=True) # remove non-alphanumeric characters
stop_words = set(stopwords.words('english'))
sst2_train_df['text'] = sst2_train_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) # remove stop words


sst2_test_df['text'] = sst2_test_df['text'].str.replace('<.*?>', '', regex=True) # remove HTML tags
sst2_test_df['text'] = sst2_test_df['text'].str.replace('[^a-zA-Z0-9\s]', '', regex=True) # remove non-alphanumeric characters
stop_words = set(stopwords.words('english'))
sst2_test_df['text'] = sst2_test_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) # remove stop words


Training and testing on SST2, OOD detection on IMDB

In [55]:
train_df = sst2_train_df
test_df = sst2_test_df
ood_df = pd.concat([imdb_train_df,imdb_test_df])
ood_df = ood_df.reset_index(drop=True)

In [56]:
# Let's view some random reviews:
print(train_df.sample(5))
print(test_df.sample(5))
print(ood_df.sample(5))

                                                    text  label
50861                     quietly introspective portrait      1
7668            skip film buy philip glass soundtrack cd      0
1626                   seem long two year affair subject      0
2230                          unsung heroes 20th century      1
11261  thirteen conversations one thing generosity op...      1
                                                    text  label
4364      begrudge anyone receiving whatever consolation      0
41834        obvious copy one best films ever made could      0
36204                                         love power      1
51557                                    though much fun      1
55263  subtitled costume drama set remote african emp...      1
                                                    text  label
37258  It's long time ago I saw movie still one worst...      0
43201  As kid I loved song "Never smile crocodile", I...      1
388    Lets see What annoyed me most The

The token `[CLS]` is a special token required by BERT at the beginning of the sentence.

In [57]:
sentences_train = train_df.text.values
sentences_train = ["[CLS] " + s for s in sentences_train]

sentences_test = test_df.text.values
sentences_test = ["[CLS] " + s for s in sentences_test]

sentences_ood = ood_df.text.values
sentences_ood = ["[CLS] " + s for s in sentences_ood]


labels_train = train_df.label.values
labels_test  = test_df.label.values
labels_ood  = ood_df.label.values

print ("\nThe first training sentence:")
print(sentences_train[0], 'LABEL:', labels_train[0])



The first training sentence:
[CLS] wildly alive LABEL: 1


Next we use the BERT tokenizer to convert the sentences into tokens
that match the data BERT was trained on.


In [58]:
BERTMODEL = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(BERTMODEL,
                                          do_lower_case=True)

tokenized_train = [tokenizer.tokenize(s) for s in sentences_train]
tokenized_test  = [tokenizer.tokenize(s) for s in sentences_test]
tokenized_ood  = [tokenizer.tokenize(s) for s in sentences_ood]

print ("\nThe full tokenized first training sentence:")
print (tokenized_train[0])

print ("\nThe full tokenized first test sentence:")
print (tokenized_test[0])

print ("\nThe full tokenized first OOD sentence:")
print (tokenized_ood[0])

100%|██████████| 231508/231508 [00:00<00:00, 1224484.43B/s]



The full tokenized first training sentence:
['[CLS]', 'wildly', 'alive']



Now we set the maximum sequence lengths for our training and test
sentences as `MAX_LEN_TRAIN` and `MAX_LEN_TEST`. The maximum length
supported by the used BERT model is 512.

The token `[SEP]` is another special token required by BERT at the
end of the sentence.

#TO DO OOD detection !!

In [None]:
MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512

tokenized_train = [t[:(MAX_LEN_TRAIN-1)]+['SEP'] for t in tokenized_train]
tokenized_test  = [t[:(MAX_LEN_TEST-1)]+['SEP'] for t in tokenized_test]

print ("\nThe truncated tokenized first training sentence:")
print (tokenized_train[0])


The truncated tokenized first training sentence:
['[CLS]', 'i', 'love', 'sci', '##fi', 'and', 'am', 'willing', 'to', 'put', 'up', 'with', 'a', 'lot', 'sci', '##fi', 'movies', '##tv', 'are', 'usually', 'under', '##fu', '##nded', 'under', '##app', '##re', '##cia', '##ted', 'and', 'misunderstood', 'i', 'tried', 'to', 'like', 'this', 'i', 'really', 'did', 'but', 'it', 'is', 'to', 'good', 'tv', 'sci', '##fi', 'as', 'babylon', '5', 'is', 'to', 'star', 'trek', 'the', 'original', 'silly', 'pro', '##st', '##hetic', '##s', 'cheap', 'cardboard', 'sets', 'stil', '##ted', 'dialogues', 'c', '##g', 'that', 'doesn', '##t', 'match', 'the', 'background', 'and', 'painfully', 'one', '##dim', '##ens', '##ional', 'characters', 'cannot', 'be', 'overcome', 'with', 'a', 'sci', '##fi', 'setting', 'im', 'sure', 'there', 'are', 'those', 'of', 'you', 'out', 'there', 'who', 'think', 'babylon', '5', 'is', 'good', 'sci', '##fi', 'tv', 'its', 'not', 'its', 'cl', '##ich', '##d', 'and', 'un', '##ins', '##pi', '##ring',


Next we use the BERT tokenizer to convert each token into an integer
index in the BERT vocabulary. We also pad any shorter sequences to
`MAX_LEN_TRAIN` or `MAX_LEN_TEST` indices with trailing zeros.

In [None]:
ids_train = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_train]
ids_train = np.array([np.pad(i, (0, MAX_LEN_TRAIN-len(i)),
                             mode='constant') for i in ids_train])

ids_test = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_test]
ids_test = np.array([np.pad(i, (0, MAX_LEN_TEST-len(i)),
                            mode='constant') for i in ids_test])

print ("\nThe indices of the first training sentence:")
print (ids_train[0])


The indices of the first training sentence:
[  101  1045  2293 16596  8873  1998  2572  5627  2000  2404  2039  2007
  1037  2843 16596  8873  5691  9189  2024  2788  2104 11263 25848  2104
 29098  2890  7405  3064  1998 28947  1045  2699  2000  2066  2023  1045
  2428  2106  2021  2009  2003  2000  2204  2694 16596  8873  2004 17690
  1019  2003  2000  2732 10313  1996  2434 10021  4013  3367 20086  2015
 10036 19747  4520 25931  3064 22580  1039  2290  2008  2987  2102  2674
  1996  4281  1998 16267  2028 22172  6132 19301  3494  3685  2022  9462
  2007  1037 16596  8873  4292 10047  2469  2045  2024  2216  1997  2017
  2041  2045  2040  2228 17690  1019  2003  2204 16596  8873  2694  2049
  2025  2049 18856  7033  2094  1998  4895  7076  8197  4892  2096  2149
  7193  2453  2066  7603  1998  2839  2458   100]


BERT also requires *attention masks*, with 1 for each real token in
the sequences and 0 for the padding:

In [None]:
amasks_train, amasks_test = [], []

for seq in ids_train:
  seq_mask = [float(i>0) for i in seq]
  amasks_train.append(seq_mask)

for seq in ids_test:
  seq_mask = [float(i>0) for i in seq]
  amasks_test.append(seq_mask)

We use scikit-learn's train_test_split() to use 10% of our training
data as a validation set, and then convert all data into
torch.tensors.

In [None]:
(train_inputs, validation_inputs,
 train_labels, validation_labels) = train_test_split(ids_train, labels_train,
                                                     random_state=42,
                                                     test_size=0.1)
(train_masks, validation_masks,
 _, _) = train_test_split(amasks_train, ids_train,
                          random_state=42, test_size=0.1)

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks  = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks  = torch.tensor(validation_masks)
test_inputs = torch.tensor(ids_test)
test_labels = torch.tensor(labels_test)
test_masks  = torch.tensor(amasks_test)


Next we create PyTorch *DataLoader*s for all data sets.
For fine-tuning BERT on a specific task, the authors recommend a
batch size of 16 or 32.

In [None]:
BATCH_SIZE = 32

print('\nDatasets:')
print('Train: ', end="")
train_data = TensorDataset(train_inputs, train_masks,
                           train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
                              batch_size=BATCH_SIZE)
print(len(train_data), 'reviews')

print('Validation: ', end="")
validation_data = TensorDataset(validation_inputs, validation_masks,
                                validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=BATCH_SIZE)
print(len(validation_data), 'reviews')

print('Test: ', end="")
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler,
                             batch_size=BATCH_SIZE)
print(len(test_data), 'reviews')


Datasets:
Train: 22500 reviews
Validation: 2500 reviews
Test: 25000 reviews


BERT MODEL INITIALIZATION

We now load a pretrained BERT model with a single linear
classification layer added on top.


In [None]:
model = BertForSequenceClassification.from_pretrained(BERTMODEL,
                                                      num_labels=2)

model.cuda()
print('\nPretrained BERT model "{}" loaded'.format(BERTMODEL))

100%|██████████| 433/433 [00:00<00:00, 90584.75B/s]
100%|██████████| 440473133/440473133 [00:38<00:00, 11553603.76B/s]



Pretrained BERT model "bert-base-uncased" loaded



We set the remaining hyperparameters needed for fine-tuning the
pretrained model: 
 * EPOCHS: the number of training epochs in fine-tuning
   (recommended values between 2 and 4) 
 * WEIGHT_DECAY: weight decay for the Adam optimizer 
 * LR: learning rate for the Adam optimizer 
   (2e-5 to 5e-5 recommended) 
 * WARMUP_STEPS: number of warmup steps to (linearly) reach the
   set learning rate

 We also need to grab the training parameters from the pretrained
 model.

In [None]:
EPOCHS = 4
WEIGHT_DECAY = 0.01
LR = 2e-5
WARMUP_STEPS =int(0.2*len(train_dataloader))

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': WEIGHT_DECAY},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS,
                                 t_total=len(train_dataloader)*EPOCHS)

LEARNING

Let's now define functions to train() and evaluate() the model:

In [None]:
def train(epoch, loss_vector=None, log_interval=200):
  # Set model to training mode
  model.train()

  # Loop over each batch from the training set
  for step, batch in enumerate(train_dataloader):

    # Copy data to GPU if needed
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Zero gradient buffers
    optimizer.zero_grad()

    # Forward pass
    outputs = model(b_input_ids, token_type_ids=None,
                    attention_mask=b_input_mask, labels=b_labels)

    loss = outputs[0]
    if loss_vector is not None:
        loss_vector.append(loss.item())

    # Backward pass
    loss.backward()

    # Update weights
    scheduler.step()
    optimizer.step()

    if step % log_interval == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, step * len(b_input_ids),
                len(train_dataloader.dataset),
                100. * step / len(train_dataloader), loss))

def evaluate(loader):
  model.eval()

  n_correct, n_all = 0, 0

  for batch in loader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)
      logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    predictions = np.argmax(logits, axis=1)

    labels = b_labels.to('cpu').numpy()
    n_correct += np.sum(predictions == labels)
    n_all += len(labels)

  print('Accuracy: [{}/{}] {:.4f}'.format(n_correct, n_all,
                                          n_correct/n_all))

Now we are ready to train our model using the train()
function. After each epoch, we evaluate the model using the
validation set and evaluate().

In [None]:
train_lossv = []
for epoch in range(1, EPOCHS + 1):
    print()
    train(epoch, train_lossv)
    print('\nValidation set:')
    evaluate(validation_dataloader)



Validation set:
Accuracy: [2245/2500] 0.8980


Validation set:
Accuracy: [2245/2500] 0.8980


Validation set:
Accuracy: [2245/2500] 0.8980


Validation set:
Accuracy: [2245/2500] 0.8980


Let's take a look at our training loss over all batches:

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_lossv, label='original')
plt.plot(np.convolve(train_lossv, np.ones(101), 'same') / 101,
         label='averaged')
plt.legend(loc='best')
plt.savefig("training-loss.png")
plt.show()


Inference

For a better measure of the quality of the model, let's see the
model accuracy for the test reviews.

In [None]:
print('\nTest set:')
evaluate(test_dataloader)

# eof


Test set:
Accuracy: [22972/25000] 0.9189


Once training is complete, we can evaluate the model on the SST2 dataset:

Finally, we'll save the hidden layers and trained values of the model:
#TO DO

In [None]:
# Save hidden layers and trained values
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'hidden_layers': model.encoder.layer[-1].output_hidden_states,
    'trained_values': model.pooler.dense.weight
}, 'path/to/save/model.pt')
