Fine-Tuning BERT for Text Classification

### Import torch e transformers

In [None]:
!pip install transformers



In [None]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
#from nltk.corpus import stopwords
import nltk
import re
from tabulate import tabulate
from tqdm import trange

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

In [None]:
#verificare la disponibilità di cuda
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Preprocessing data

In [None]:
file_path_train = '/content/drive/MyDrive/BERT/train.csv'
file_path_test = '/content/drive/MyDrive/BERT/test.csv'

In [None]:
def etichetta_numerica(label):
  if label=='neg':
    return 0
  else:
    return 1

In [None]:
# Carica il file CSV
df_train = pd.read_csv(file_path_train)
df_train = df_train.rename(columns={'sentiment': 'label'})
df_train['label'] = df_train['label'].apply(etichetta_numerica)

# Visualizza le prime righe del DataFrame
print(df_train.head())


                                                text  label
0  Now, I won't deny that when I purchased this o...      0
1  The saddest thing about this "tribute" is that...      0
2  Last night I decided to watch the prequel or s...      0
3  I have to admit that i liked the first half of...      0
4  I was not impressed about this film especially...      0


In [None]:
# Carica il file CSV
df_test = pd.read_csv(file_path_test)
df_test = df_test.rename(columns={'sentiment': 'label'})
df_test['label'] = df_test['label'].apply(etichetta_numerica)

# Visualizza le prime righe del DataFrame
print(df_test.head())


                                                text  label
0  My daughter liked it but I was aghast, that a ...      0
1  I... No words. No words can describe this. I w...      0
2  this film is basically a poor take on the old ...      0
3  This is a terrible movie, and I'm not even sur...      0
4  First of all this movie is a piece of reality ...      1


In [None]:
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

Unnamed: 0,text,label
0,"Now, I won't deny that when I purchased this o...",0
1,"The saddest thing about this ""tribute"" is that...",0
2,Last night I decided to watch the prequel or s...,0
3,I have to admit that i liked the first half of...,0
4,I was not impressed about this film especially...,0


In [None]:
# Conta il numero di valori '1' e '0' nella colonna 'label'
conteggio_etichette = df['label'].value_counts()

# Visualizza il conteggio
print("Numero di '1':", conteggio_etichette[1])
print("Numero di '0':", conteggio_etichette[0])

Numero di '1': 25000
Numero di '0': 25000


## Remove stpwords

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
sw = stopwords.words('english')

def clean_text(text):

    text = text.lower()

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs
    #text = re.sub(r"http", "",text)

    html=re.compile(r'<.*?>')

    text = html.sub(r'',text) #Removing html tags

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations

    text = [word.lower() for word in text.split() if word.lower() not in sw]

    text = " ".join(text) #removing stopwords

    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis

    return text



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df['text'] = df['text'].apply(lambda x: clean_text(x))

## Preprocess input

In [None]:
reviews = df.text.values
labels = df.label.values

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
index=0
print(' Original: ', reviews[index])


#print('Tokenized: ', tokenizer.tokenize(tweets[index])) # Print the sentence split into tokens.

#print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweets[index]))) # Print the sentence mapped to token ids.

table = np.array([tokenizer.tokenize(reviews[index]),
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(reviews[index]))]).T
print(tabulate(table,headers = ['Tokens', 'Token IDs'],tablefmt = 'fancy_grid'))



 Original:  now, deny purchased ebay, high expectations incredible print work master comedy enjoy however, soon disappointed apologies enjoyed it, found compleat al difficult watch got smiles, sure, majority funny came music videos got dvd rest basically filler could tell al greatest video achievement honor goes uhf honestly, doubt ever make jump dvd, ultra hardcore al fan everything, buy tape ebay pay much
╒══════════════╤═════════════╕
│ Tokens       │   Token IDs │
╞══════════════╪═════════════╡
│ now          │        2085 │
├──────────────┼─────────────┤
│ ,            │        1010 │
├──────────────┼─────────────┤
│ deny         │        9772 │
├──────────────┼─────────────┤
│ purchased    │        4156 │
├──────────────┼─────────────┤
│ e            │        1041 │
├──────────────┼─────────────┤
│ ##bay        │       15907 │
├──────────────┼─────────────┤
│ ,            │        1010 │
├──────────────┼─────────────┤
│ high         │        2152 │
├──────────────┼─────────────┤


### Add special tokens

In [None]:
max_len = 0

# For every sentence...
for sent in reviews:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    # Sequenze troncate a 512 poichè è la massima lunghezza gestibile dal tokenizer
    input_ids = tokenizer.encode(sent[:512], add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  206


In [None]:
input_ids = []
attention_masks = []

# For every tweet...
for review in reviews:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        review,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        truncation=True,
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)





In [None]:
# Print sentence 0, now as a list of IDs.
i=random.randint(0,len(reviews)-1)
print('Original: ', reviews[i])
print('Input IDs:', input_ids[i])

Original:  seen film quite time, caught long ago nicely transferred criterion dvd le cercle rouge film owes lot movies, keeps reminding us rififi , asphalt jungle , among others, deal capers take center stage movie reproduce great detail unfortunately, one knows old adage crime pay, start, men involved doomed onset br br jean pierre melville director words fill pictures lot dialog, case yet, talky , style proved le dolous , le samurai , masterpiece, bob le flambeur , among others mr melville sense style comes across everything film, working cinematographer, henri decae, takes us along ride streets paris shows vibrant city mainly night bleak winter france score eric demarsan emphasizes jazzy music accompanies action br br although film shows alain delon, corey, center action, however, smart inspector mattei real hero movie played great bourvil, man shows lot patience figured beginning catch vogel, process gets involved investigation jewel heist knows escaped man tailing looms large behi

In [None]:
def print_rand_sentence_encoding():
  '''Displays tokens, token IDs and attention mask of a random text sample'''
  index = random.randint(0, len(reviews) - 1)
  tokens = tokenizer.tokenize(tokenizer.decode(input_ids[index]))
  token_ids = [i.numpy() for i in input_ids[index]]
  attention = [i.numpy() for i in attention_masks[index]]

  table = np.array([tokens, token_ids, attention]).T
  print(reviews[index])
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

one time best episodes officer sean cooper murdered patrol car back dying convict state penitentiary reveals stole block heroin car shooting case reopened presumption corrupted policeman br br investigation police officer human reveals war veteran involved forbidden love type love considered shameful something least keep hidden time br br type love personally support, still policeman human killed sound track excellent keeps watching dvr , selective use black white mixed color emphasize one object give particular feeling scene especially appealing shall watching one repeat
╒═══════════════╤═════════════╤══════════════════╕
│ Tokens        │   Token IDs │   Attention Mask │
╞═══════════════╪═════════════╪══════════════════╡
│ [CLS]         │         101 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ one           │        2028 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ time          │        2051 │                1 │
├────────────

## Train Validation Split

In [None]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)


# Calculate the number of samples to include in each set.
# train 0.7
train_size = int(0.7 * len(dataset))
val_test_size = len(dataset)  - train_size
# validation 0.15
# test 0.15

val_size =  int(0.5 * val_test_size)
test_size = val_test_size - val_size
print(val_test_size)
print(val_size)

# Divide the dataset by randomly selecting samples.
train_dataset, val_test_dataset = random_split(dataset, [train_size, val_test_size])
val_dataset, test_dataset = random_split(val_test_dataset, [val_size, test_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(test_size))

15000
7500
35,000 training samples
7,500 validation samples
7,500 test samples


Train parameters
The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks:

- Batch size: 16, 32

- Learning rate (Adam): 5e-5, 3e-5, 2e-5

- Number of epochs: 2, 3, 4

In [None]:
# The DataLoader needs to know our batch size for training, so we specify it
# here. For fine-tuning BERT on a specific task, the authors recommend a batch
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
test_dataloader = DataLoader(
            test_dataset, # The test samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

## Some Prediction Metrics

In [None]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

## Create BERT model

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# if device == "cuda:0":
# # Tell pytorch to run this model on the GPU.
#     model = model.cuda()
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



## Fine tuning the model

In [None]:
# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids,
                              token_type_ids = None,
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')


Epoch:  25%|██▌       | 1/4 [22:17<1:06:53, 1338.00s/it]


	 - Train loss: 0.2971
	 - Validation Accuracy: 0.9043
	 - Validation Precision: 0.9164
	 - Validation Recall: 0.8921
	 - Validation Specificity: 0.9177



Epoch:  50%|█████     | 2/4 [44:38<44:38, 1339.27s/it]  


	 - Train loss: 0.1794
	 - Validation Accuracy: 0.9030
	 - Validation Precision: 0.9388
	 - Validation Recall: 0.8623
	 - Validation Specificity: 0.9433



Epoch:  75%|███████▌  | 3/4 [1:06:59<22:20, 1340.12s/it]


	 - Train loss: 0.0966
	 - Validation Accuracy: 0.9038
	 - Validation Precision: 0.8815
	 - Validation Recall: 0.9361
	 - Validation Specificity: 0.8723



Epoch: 100%|██████████| 4/4 [1:29:21<00:00, 1340.41s/it]


	 - Train loss: 0.0504
	 - Validation Accuracy: 0.9027
	 - Validation Precision: 0.8837
	 - Validation Recall: 0.9308
	 - Validation Specificity: 0.8742






## Test

In [None]:
# Tracking variables
val_accuracy = []
val_precision = []
val_recall = []
val_specificity = []

for batch in test_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
    # Forward pass
    eval_output = model(b_input_ids,
                        token_type_ids = None,
                        attention_mask = b_input_mask)
  logits = eval_output.logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  # Calculate validation metrics
  b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
  val_accuracy.append(b_accuracy)
  # Update precision only when (tp + fp) !=0; ignore nan
  if b_precision != 'nan': val_precision.append(b_precision)
  # Update recall only when (tp + fn) !=0; ignore nan
  if b_recall != 'nan': val_recall.append(b_recall)
  # Update specificity only when (tn + fp) !=0; ignore nan
  if b_specificity != 'nan': val_specificity.append(b_specificity)

print('\t - Test Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
print('\t - Test Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Test Precision: NaN')
print('\t - Test Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Test Recall: NaN')
print('\t - Test Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Test Specificity: NaN')

	 - Test Accuracy: 0.9093
	 - Test Precision: 0.8906
	 - Test Recall: 0.9340
	 - Test Specificity: 0.8845

