In [2]:
### Import libraries ###

import pandas as pd # for data manipulation and analysis
import torch # tools and utilities for building and training neural networks
import numpy as np # for numerical computing in Python
from transformers import BertTokenizer, BertForSequenceClassification # for tokenizing text and pre-trained BERT model fine-tuned for sequence classification tasks
from torch.optim import AdamW as TorchAdamW # optimizer for weight decay, which is commonly used for training neural networks
from sklearn.model_selection import train_test_split # function to split datasets into training and testing sets
from sklearn.metrics import accuracy_score # function to compute the accuracy of a classification model by comparing the predicted labels to the true labels

In [3]:
### Import the dataset ###
df = pd.read_csv('https://raw.githubusercontent.com/EvgeniaViskovatykh/BERT-fine-tune-Humor-Detection/main/dataset.csv')

### Show the dataset ###
df.head()

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


In [None]:
### Dataset preparation ###

# to convert boolean values in 'humor' column into strings ('True' and 'False')
df['humor'] = df['humor'].astype(str)

# to add 'score' where to 1 or 0 based on 'True' or 'False' values
df['score'] = df['humor'].map({'True': 1, 'False': 0})

# to drop the 'humor' column from dataset
df.drop(columns=['humor'], inplace=True)

# visualized the new dataset
df.head()

Unnamed: 0,text,score
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",0
1,Watch: darvish gave hitter whiplash with slow ...,0
2,What do you call a turtle without its shell? d...,1
3,5 reasons the 2016 election feels so personal,0
4,"Pasco police shot mexican migrant from behind,...",0


In [None]:
# to extract jokes from the 'text' column and labels from the 'score' column

texts = df['text'].to_list()
labels = df['score'].to_list()

In [None]:
### For training were extracted 35k samples from dataset and splited on train 80%, validation 10% and test 10% parts ###

train_texts, test_texts, train_labels, test_labels = train_test_split(texts[:35000], labels[:35000], test_size=.1, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.1, random_state=42)

In [None]:
### Define tokenizer from pre-trained BERT model for sequence classification and specify num_labels = 2 to perform binary classification ###

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
### to tokenize the input texts (jokes) using the BERT tokenizer ###
# to set truncation = True to truncate long sequences and padding = True to pad sequences to the maximum length #

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [None]:
### to convert the tokenized encodings (input_ids and attention_mask) and labels into PyTorch tensors ###
# and create PyTorch datasets (TensorDataset) for training, validation, and test sets #

train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']),
                                               torch.tensor(train_encodings['attention_mask']),
                                               torch.tensor(train_labels))
val_dataset = torch.utils.data.TensorDataset(torch.tensor(val_encodings['input_ids']),
                                             torch.tensor(val_encodings['attention_mask']),
                                             torch.tensor(val_labels))
test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']),
                                              torch.tensor(test_encodings['attention_mask']),
                                              torch.tensor(test_labels))

In [None]:
# to define data loaders (DataLoader) to load data in batches for training, validation, and test sets #
# to specify a batch size of 16 and set shuffle = True for the training set to shuffle the data during training #

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
### to put the model in training mode ###
# and freeze all BERT model layers except the last layer from training #
for param in model.bert.parameters():
    param.requires_grad = False

# allow gradients to flow only through the last layer (classification head) #
for param in model.classifier.parameters():
    param.requires_grad = True

In [None]:
### Fine-tune BERT model ###

# to set up the optimizer for fine-tuning the BERT model with a learning rate 'lr' of 1e-5 #
optimizer = TorchAdamW(model.classifier.parameters(), lr=1e-5)
model.train()

# and loop through 3 epochs
for epoch in range(3):

  # to iterate over the batches in the training data loader (train_loader),
  # perform forward pass, calculate loss, perform backward pass, and update model parameters using optimizer
    for batch in train_loader: # initiates a loop over batches in the train_loader

        # each batch is unpacked into three variables: input_ids, attention_mask, and labels
        # input IDs, attention mask, and labels for a batch of data, respectively.
        input_ids, attention_mask, labels = batch

        # to clear the gradients of all optimized tensors before performing backpropagation to avoid accumulating gradients from previous iterations
        optimizer.zero_grad()

        # to pass the input data to the model to compute the output based on these inputs
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # to retrieve the loss value from the outputs
        loss = outputs.loss
        # to compute the gradients of the loss with respect to all model parameters
        loss.backward()
        # to update the model using the computed gradients and the optimization algorithm
        optimizer.step()

In [None]:
### Put the model in evaluation mode ###

model.eval()

# validation accuracy computing #

val_predictions = []  # initialize list for prediction labels
val_true_labels = []  # initialize list for ground truth labels

with torch.no_grad():  # disable gradient calculation for inference speedup
    for batch in val_loader:  # iterate over batches of data from validation data loader
        input_ids, attention_mask, labels = batch  # unpack elements of the batch
        logits = model(input_ids, attention_mask=attention_mask).logits  # compute predictions
        val_predictions.extend(torch.argmax(logits, dim=1).tolist())  # extend prediction labels
        val_true_labels.extend(labels.tolist())  # extend true labels

# calculate validation accuracy
val_accuracy = accuracy_score(val_true_labels, val_predictions)

# print validation accuracy
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.8282539682539682


In [None]:
# test accuracy computing #

test_predictions = []  # initialize list for prediction labels
test_true_labels = []  # initialize list for ground truth labels

with torch.no_grad():  # disable gradient calculation for inference speedup
    for batch in test_loader:  # iterate over batches in the test data loader
        input_ids, attention_mask, labels = batch  # unpack elements of the batch
        logits = model(input_ids, attention_mask=attention_mask).logits  # compute predictions
        test_predictions.extend(torch.argmax(logits, dim=1).tolist())  # extend prediction labels
        test_true_labels.extend(labels.tolist())  # extend true labels

# calculate test accuracy
test_accuracy = accuracy_score(test_true_labels, test_predictions)

# print test accuracy
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8285714285714286


In [None]:
### Save the model ###

torch.save(model.state_dict(), 'BERT_Humor_35k.pth')

In [None]:
torch.save(model.state_dict(), 'BERT_Humor_35k.model')