https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb#scrollTo=xtkcmIY9t6AF

## Installation of libraries and imports

In [1]:
!pip install transformers
!pip install datasets



In [2]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
import glob
import xml.etree.ElementTree as ET
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Check that we are using 100% of GPU memory footprint support libraries/code
# from https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip -q install gputil
!pip -q install psutil
!pip -q install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 12.1 GB  | Proc size: 812.0 MB
GPU RAM Free: 11441MB | Used: 0MB | Util   0% | Total 11441MB



In case GPU utilisation (Util) is not at 0%, you can uncomment and run the following line to kill all processes to get the full GPU afterwards. Make sure to comment out the line again to not constantly crash the notebook on purpose.

In [None]:
# !kill -9 -1

## Loading the dataset

In [4]:
# Clone the dataset repository from github
!git clone https://github.com/CodyRichter/Automatic-Short-Answer-Grading

fatal: destination path 'Automatic-Short-Answer-Grading' already exists and is not an empty directory.


In [5]:
import json

with open('/content/Automatic-Short-Answer-Grading/dataset/train.json', 'r') as tf:
  training_data = json.load(tf)

with open('/content/Automatic-Short-Answer-Grading/dataset/test-unseen-answers.json', 'r') as tf:
  test_unseen_answer_data = json.load(tf)

with open('/content/Automatic-Short-Answer-Grading/dataset/test-unseen-questions.json', 'r') as tf:
  test_unseen_question_data = json.load(tf)

with open('/content/Automatic-Short-Answer-Grading/dataset/test-unseen-domains.json', 'r') as tf:
  test_unseen_domain_data = json.load(tf)

print('Number of Training Data Responses', len(training_data))
print('Number of Test Data (New Answer) Responses', len(test_unseen_answer_data))
print('Number of Test Data (New Question) Responses', len(test_unseen_question_data))
print('Number of Test Data (New Domain) Responses', len(test_unseen_domain_data))

Number of Training Data Responses 16265
Number of Test Data (New Answer) Responses 540
Number of Test Data (New Question) Responses 733
Number of Test Data (New Domain) Responses 4562


In [6]:
from torch.utils.data import Dataset, DataLoader

class ShortAnswerGradingDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Note: I handle the parsing in the data loading from XML section
        # Returns a dict for each item with the following keys: 'question', 'ref', 'response', 'score' all of type 'str'
        return self.dataset[idx]

In [7]:
training_dataset = ShortAnswerGradingDataset(training_data)
test_dataset_unseen_answers = ShortAnswerGradingDataset(test_unseen_answer_data)
test_dataset_unseen_questions = ShortAnswerGradingDataset(test_unseen_question_data)
test_dataset_unseen_domains = ShortAnswerGradingDataset(test_unseen_domain_data)

from sklearn.model_selection import train_test_split

training_dataset, validation_dataset = train_test_split(training_dataset, test_size=0.1, random_state=0)

validation_parent_ids = set()
validation_ids_to_remove = set()
validation_original_ids = set()

# Step 1: Get IDs of Original Responses and mark augmented ones for deletion
for validation_item in validation_dataset:
  if validation_item['aug']:
    validation_parent_ids.add(validation_item['aug_metadata']['parent_id'])
    validation_ids_to_remove.add(validation_item['id'])
  else:
    validation_original_ids.add(validation_item['id'])

train_ids_to_remove = set()

# Step 2: Obtain Original Respones for validation set and mark augmented
#         dataset items for removal if the original is in the validation set
for train_item in training_dataset:

  # If the original is in the validation set, remove from the training set
  if train_item['aug'] and train_item['aug_metadata']['parent_id'] in validation_original_ids:
    train_ids_to_remove.add(train_item['id'])

  # If the original is in the training set, add it to the validation set
  # and then mark it for deletion from the training set
  if not train_item['aug'] and train_item['id'] in validation_parent_ids:
    validation_dataset.append(train_item)
    train_ids_to_remove.add(train_item['id'])

# Step 3: Perform removal operations
validation_dataset[:] = [x for x in validation_dataset if x['id'] not in validation_ids_to_remove]
training_dataset[:] = [x for x in training_dataset if x['id'] not in train_ids_to_remove]

print('Number of Training Samples', len(training_dataset))
print('Number of Validation Samples', len(validation_dataset))
print('Number of Test Data (New Answer) Responses', len(test_unseen_answer_data))
print('Number of Test Data (New Question) Responses', len(test_unseen_question_data))
print('Number of Test Data (New Domain) Responses', len(test_unseen_domain_data))

Number of Training Samples 12805
Number of Validation Samples 1380
Number of Test Data (New Answer) Responses 540
Number of Test Data (New Question) Responses 733
Number of Test Data (New Domain) Responses 4562


In [11]:
# use the unseen question dataset for testing
test_dataset = test_unseen_question_data

In [12]:
# for training_item in training_dataset:
#   print(training_item)

# for test_item in test_dataset:
#   print(test_item)
print(training_dataset[0])
print(len(training_dataset))
print(len(test_dataset))

{'question': "Pam and her family were planning a hike. Pam found 2 maps of the same area. Her friend recommended she use the topographic map when they went to the lake. Why would Pam's friend recommend using the topographic map for the hike?", 'ref': 'She recommended the topographic map because the map shows the elevations along the trail. Pam would know where the trail was the steepest.', 'response': 'Because both maps show the shapes of landforms, but a topographic map shows elevation and dip.', 'score': 'incorrect', 'aug': True, 'id': 18221, 'aug_metadata': {'parent_id': 3816, 'translation_seq': ['en', 'es', 'en']}}
12805
733


In [16]:
# Concate the reference answer and student answer to creat new input for both train and test set
train_ref = []
train_res = []
train_score = []
test_ref = []
test_res = []
test_score = []
valid_ref = []
valid_res = []
valid_score = []

for training_item in training_data:
  train_ref.append(training_item["ref"])
  train_res.append(training_item["response"])
  train_score.append(0 if training_item["score"]=='incorrect' else 1)

for test_item in test_dataset:
  test_ref.append(test_item["ref"])
  test_res.append(test_item["response"])
  test_score.append(0 if test_item["score"]=='incorrect' else 1)

for valid_item in validation_dataset:
  valid_ref.append(valid_item["ref"])
  valid_res.append(valid_item["response"])
  valid_score.append(0 if valid_item["score"]=='incorrect' else 1)

train = {'idx': list(range(len(training_data))), 'label': train_score, 'sentence1': train_ref, 'sentence2': train_res}
valid = {'idx': list(range(len(validation_dataset))), 'label': valid_score, 'sentence1': valid_ref, 'sentence2': valid_res}
test = {'idx': list(range(len(test_dataset))), 'label': test_score, 'sentence1': test_ref, 'sentence2': test_res}

# Transform data into pandas dataframes
df_train = pd.DataFrame(train)
df_valid = pd.DataFrame(valid)
df_test = pd.DataFrame(test)

In [17]:
print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

(16265, 4)
(1380, 4)
(733, 4)


In [18]:
df_train.head()

Unnamed: 0,idx,label,sentence1,sentence2
0,0,0,The water splashed because the fork was vibrat...,Hitting the fork and dipping it into the water.
1,1,0,The water splashed because the fork was vibrat...,Strike the fork and plunge it into the water.
2,2,0,The water splashed because the fork was vibrat...,Hit the fork and dip it in the water.
3,3,0,The water splashed because the fork was vibrat...,Hit the fork and immerse it in water.
4,4,0,The water splashed because the fork was vibrat...,Hit with a fork and submerge it in water.


## Classes and functions

In [19]:
class CustomDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model='bert-base-uncased'):

        self.data = data  # pandas dataframe
        #Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  

        self.maxlen = maxlen
        self.with_labels = with_labels 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting sentence1 and sentence2 at the specified index in the data frame
        sent1 = str(self.data.loc[index, 'sentence1'])
        sent2 = str(self.data.loc[index, 'sentence2'])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent1, sent2, 
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.maxlen,  
                                      return_tensors='pt')  # Return torch.Tensor objects
        
        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.data.loc[index, 'label']
            return token_ids, attn_masks, token_type_ids, label  
        else:
            return token_ids, attn_masks, token_type_ids

In [20]:
class SentencePairClassifier(nn.Module):

    def __init__(self, bert_model="bert-base-uncased", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model)

        #  Fix the hidden-state size of the encoder outputs (If you want to add other pre-trained models here, search for the encoder output size)
        if bert_model == "albert-base-v2":  # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M parameters
            hidden_size = 768
        elif bert_model == 'allenai/scibert_scivocab_uncased':
            hidden_size = 768

        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Classification layer
        self.cls_layer = nn.Linear(hidden_size, 1)

        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations
        model_output = self.bert_layer(input_ids, attn_masks, token_type_ids)

        # Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
        # Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
        # objective during pre-training.

        logits = self.cls_layer(self.dropout(model_output.pooler_output))

        return logits

In [21]:
def set_seed(seed):
    """ Set all seeds to make results reproducible """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    

def evaluate_loss(net, device, criterion, dataloader):
    net.eval()

    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1

    return mean_loss / count

In [22]:
print("Creation of the models' folder...")
!mkdir models

Creation of the models' folder...


Link for mixed precision training, gradient scaling and gradient accumulation  : https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples

If you would like to learn more about Training Neural Nets on Larger Batches, I suggest reading this post of Thomas Wolf :
https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255

In [23]:
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):

    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):

            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)

                # Computing loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()


            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0


        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model='models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model.replace('/', '_'), lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()

## Parameters

In [24]:
bert_model = "bert-base-uncased"  # 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'bert-base-uncased', ...
freeze_bert = True  # if True, freeze the encoder weights and only update the classification layer weights
maxlen = 128  # maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
bs = 20  # batch size
iters_to_accumulate = 2  # the gradient accumulation adds gradients over an effective batch of size : bs * iters_to_accumulate. If set to "1", you get the usual batch size
lr = 5e-4  # learning rate
epochs = 6  # number of training epochs

## Training and validation

Link for the AdamW optimizer and the learning rate scheduler :
https://huggingface.co/transformers/main_classes/optimizer_schedules.html

In [25]:
#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_valid, maxlen, bert_model)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

Reading training data...


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Reading validation data...


  cpuset_checked))


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 20%|█▉        | 162/814 [01:44<07:05,  1.53it/s]


Iteration 162/814 of epoch 1 complete. Loss : 0.34457021408978805 


 40%|███▉      | 324/814 [03:26<05:09,  1.58it/s]


Iteration 324/814 of epoch 1 complete. Loss : 0.3363598070486828 


 60%|█████▉    | 486/814 [05:09<03:27,  1.58it/s]


Iteration 486/814 of epoch 1 complete. Loss : 0.3497038917776979 


 80%|███████▉  | 648/814 [06:52<01:45,  1.57it/s]


Iteration 648/814 of epoch 1 complete. Loss : 0.33578661728052445 


100%|█████████▉| 810/814 [08:35<00:02,  1.58it/s]


Iteration 810/814 of epoch 1 complete. Loss : 0.32750997398002646 


100%|██████████| 814/814 [08:37<00:00,  1.57it/s]
100%|██████████| 69/69 [00:43<00:00,  1.58it/s]



Epoch 1 complete! Validation Loss : 0.6457539243974547
Best validation loss improved from inf to 0.6457539243974547



 20%|█▉        | 162/814 [01:43<06:55,  1.57it/s]


Iteration 162/814 of epoch 2 complete. Loss : 0.3271299749612808 


 40%|███▉      | 324/814 [03:26<05:11,  1.57it/s]


Iteration 324/814 of epoch 2 complete. Loss : 0.3288746094906036 


 60%|█████▉    | 486/814 [05:09<03:27,  1.58it/s]


Iteration 486/814 of epoch 2 complete. Loss : 0.34727350291278625 


 80%|███████▉  | 648/814 [06:51<01:44,  1.58it/s]


Iteration 648/814 of epoch 2 complete. Loss : 0.33325429499885184 


100%|█████████▉| 810/814 [08:34<00:02,  1.58it/s]


Iteration 810/814 of epoch 2 complete. Loss : 0.3245023613174756 


100%|██████████| 814/814 [08:36<00:00,  1.58it/s]
100%|██████████| 69/69 [00:43<00:00,  1.58it/s]



Epoch 2 complete! Validation Loss : 0.6447747747103373
Best validation loss improved from 0.6457539243974547 to 0.6447747747103373



 20%|█▉        | 162/814 [01:43<06:52,  1.58it/s]


Iteration 162/814 of epoch 3 complete. Loss : 0.3230298173464375 


 40%|███▉      | 324/814 [03:25<05:10,  1.58it/s]


Iteration 324/814 of epoch 3 complete. Loss : 0.32308817098354115 


 60%|█████▉    | 486/814 [05:08<03:27,  1.58it/s]


Iteration 486/814 of epoch 3 complete. Loss : 0.3429439869927771 


 80%|███████▉  | 648/814 [06:51<01:44,  1.59it/s]


Iteration 648/814 of epoch 3 complete. Loss : 0.3303739782652737 


100%|█████████▉| 810/814 [08:33<00:02,  1.57it/s]


Iteration 810/814 of epoch 3 complete. Loss : 0.32133904559376797 


100%|██████████| 814/814 [08:35<00:00,  1.58it/s]
100%|██████████| 69/69 [00:43<00:00,  1.58it/s]



Epoch 3 complete! Validation Loss : 0.649357159068619


 20%|█▉        | 162/814 [01:43<06:51,  1.59it/s]


Iteration 162/814 of epoch 4 complete. Loss : 0.31714567703045443 


 40%|███▉      | 324/814 [03:25<05:11,  1.57it/s]


Iteration 324/814 of epoch 4 complete. Loss : 0.32014736551561473 


 60%|█████▉    | 486/814 [05:08<03:27,  1.58it/s]


Iteration 486/814 of epoch 4 complete. Loss : 0.340092367320149 


 80%|███████▉  | 648/814 [06:51<01:45,  1.57it/s]


Iteration 648/814 of epoch 4 complete. Loss : 0.33096000818926613 


100%|█████████▉| 810/814 [08:34<00:02,  1.59it/s]


Iteration 810/814 of epoch 4 complete. Loss : 0.32088476171096164 


100%|██████████| 814/814 [08:36<00:00,  1.58it/s]
100%|██████████| 69/69 [00:43<00:00,  1.58it/s]



Epoch 4 complete! Validation Loss : 0.650379238785177


 20%|█▉        | 162/814 [01:43<06:54,  1.57it/s]


Iteration 162/814 of epoch 5 complete. Loss : 0.31090507803507794 


 40%|███▉      | 324/814 [03:25<05:12,  1.57it/s]


Iteration 324/814 of epoch 5 complete. Loss : 0.32056083456601625 


 60%|█████▉    | 486/814 [05:08<03:27,  1.58it/s]


Iteration 486/814 of epoch 5 complete. Loss : 0.3361194944124163 


 80%|███████▉  | 648/814 [06:51<01:45,  1.57it/s]


Iteration 648/814 of epoch 5 complete. Loss : 0.3293644031624735 


100%|█████████▉| 810/814 [08:33<00:02,  1.57it/s]


Iteration 810/814 of epoch 5 complete. Loss : 0.3208422498018653 


100%|██████████| 814/814 [08:36<00:00,  1.58it/s]
100%|██████████| 69/69 [00:43<00:00,  1.58it/s]



Epoch 5 complete! Validation Loss : 0.643730424452519
Best validation loss improved from 0.6447747747103373 to 0.643730424452519



 20%|█▉        | 162/814 [01:43<06:53,  1.58it/s]


Iteration 162/814 of epoch 6 complete. Loss : 0.30954350048193224 


 40%|███▉      | 324/814 [03:25<05:10,  1.58it/s]


Iteration 324/814 of epoch 6 complete. Loss : 0.3264164560370975 


 60%|█████▉    | 486/814 [05:08<03:26,  1.59it/s]


Iteration 486/814 of epoch 6 complete. Loss : 0.32792718966066103 


 80%|███████▉  | 648/814 [06:51<01:47,  1.55it/s]


Iteration 648/814 of epoch 6 complete. Loss : 0.3274201743396712 


100%|█████████▉| 810/814 [08:34<00:02,  1.59it/s]


Iteration 810/814 of epoch 6 complete. Loss : 0.3193566015473119 


100%|██████████| 814/814 [08:36<00:00,  1.57it/s]
100%|██████████| 69/69 [00:43<00:00,  1.58it/s]



Epoch 6 complete! Validation Loss : 0.6352263546508291
Best validation loss improved from 0.643730424452519 to 0.6352263546508291

The model has been saved in models/bert-base-uncased_lr_0.0005_val_loss_0.63523_ep_6.pt


You can download the model saved in the folder "models" by browsing the files on the left of the colab notebook

In [None]:
# If you encounter a CUDA out of memory error: 
# - uncomment the kill command, run the "kill" command (and comment it)
# - reduce the batch size
# - then run all cells from the begining 

# !kill -9 -1

## Prediction

In [26]:
print("Creation of the results' folder...")
!mkdir results

Creation of the results' folder...


In [27]:
def get_probs_from_logits(logits):
    """
    Converts a tensor of logits into an array of probabilities by applying the sigmoid function
    """
    probs = torch.sigmoid(logits.unsqueeze(-1))
    return probs.detach().cpu().numpy()

def test_prediction(net, device, dataloader, with_labels=True, result_file="results/output.txt"):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file
    """
    net.eval()
    w = open(result_file, 'w')
    probs_all = []

    with torch.no_grad():
        if with_labels:
            for seq, attn_masks, token_type_ids, _ in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
        else:
            for seq, attn_masks, token_type_ids in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

    w.writelines(str(prob)+'\n' for prob in probs_all)
    w.close()

I'm sharing below an ALBERT pre-trained model (45 Mo) so you can reproduce my results on the MRPC validation set (**91.19** as F1 score and **87.5** as accuracy). It's just in case but if all the code run as expected, you should get after the model training the correct model in the *models* folder

You can download it and upload it (~ 3 minutes) in the *models* folder by browsing the files on the left of the colab notebook :

https://drive.google.com/file/d/1AcRLGvALAH3BVSiDVjY_b8CggJgVfksp/view?usp=sharing

In [28]:
path_to_model = '/content/models/bert-base-uncased_lr_0.0005_val_loss_0.63523_ep_6.pt'  
# path_to_model = '/content/models/...'  # You can add here your trained model

path_to_output_file = 'results/output.txt'

print("Reading test data...")
test_set = CustomDataset(df_test, maxlen, bert_model)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=5)

model = SentencePairClassifier(bert_model)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

print()
print("Loading the weights of the model...")
model.load_state_dict(torch.load(path_to_model))
model.to(device)

print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set the with_labels parameter to False if your want to get predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))

Reading test data...


  cpuset_checked))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Loading the weights of the model...
Predicting on test data...


100%|██████████| 37/37 [00:23<00:00,  1.56it/s]


Predictions are available in : results/output.txt





You can download the predictions saved in the folder "results" by browsing the files on the left of the colab notebook

## Evaluation

In [29]:
path_to_output_file = 'results/output.txt'  # path to the file with prediction probabilities

labels_test = df_test['label']  # true labels

probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # prediction probabilities

threshold = 0.5   # you can adjust this threshold for your own dataset
preds_test=(probs_test>=threshold).astype('uint8') # predicted labels using the above fixed threshold

metric = load_metric("glue", "mrpc")

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [30]:
# Compute the accuracy and F1 scores
metric._compute(predictions=preds_test, references=labels_test)

{'accuracy': 0.5798090040927695, 'f1': 0.5157232704402516}

In [None]:
# Result -----

# bert_model = "bert-base-uncased" 
# freeze_bert = True
# maxlen = 128 
# bs = 20
# iters_to_accumulate = 2 
# lr = 1e-3  
# epochs = 4
# test dataset: Unseen questions
# accuracy = 0.68

# bert_model = "bert-base-uncased" 
# freeze_bert = True
# maxlen = 128 
# bs = 20
# iters_to_accumulate = 2 
# lr = 5e-4  
# epochs = 6
# test dataset: Unseen questions
# accuracy = 0.52


# Augmented data
#------------------
# bert_model = "bert-base-uncased" 
# freeze_bert = True
# maxlen = 128 
# bs = 20 
# iters_to_accumulate = 2 
# lr = 5e-4  
# epochs = 10 
# test dataset: Unseen questions
# accuracy = 0.64
# f1 0.617

# bert_model = "bert-base-uncased" 
# freeze_bert = True 
# maxlen = 128 
# bs = 20 
# iters_to_accumulate = 2 
# lr = 5e-4  
# epochs = 15 
# test dataset: Unseen questions
# accuracy = 0.66
# f1 0.65

# bert_model = "bert-base-uncased" 
# freeze_bert = True
# maxlen = 128 
# bs = 20
# iters_to_accumulate = 2 
# lr = 5e-4  
# epochs = 6
# test dataset: Unseen questions
# accuracy = 0.65
# f1 = 0.615


# -------------------------
# new new augmented dataset

# bert_model = "bert-base-uncased" 
# freeze_bert = True
# maxlen = 128 
# bs = 20
# iters_to_accumulate = 2 
# lr = 5e-4  
# epochs = 6
# test dataset: Unseen questions
# accuracy = 0.5798
# f1 = 0.515