## Fine-tune BERT for sequence classification using specific train and validation sets

In [1]:
import os
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
import torch
import time
from tqdm import tqdm
from google.colab import drive

class SequenceClassificationDataset(Dataset):
    """Dataset wrapper that pairs tokenized inputs with their labels.

    `inputs` is indexed by the keys 'input_ids' and 'attention_mask', each of
    which is then indexed by sample position; `labels` is indexed by sample
    position as well. Instances are meant to be consumed by a DataLoader.
    """

    # Fields copied from `inputs` into every sample, in output order.
    _INPUT_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the 'input_ids' entry."""
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """Return the sample at `idx` as a dict with the keys
        'input_ids', 'attention_mask' and 'labels' (a torch.long tensor)."""
        sample = {field: self.inputs[field][idx] for field in self._INPUT_FIELDS}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

class BertFineTuning:
    """Fine-tune a BERT sequence classifier on a train CSV and track it on a validation CSV.

    Constructing an instance mounts Google Drive, loads the tokenizer, both CSV
    files and the pretrained model, and builds the optimizer and DataLoaders.
    Call `train()` to run the epochs and `save_model()` to persist the result.

    Args:
        dataset_path: Directory containing the CSV files.
        train_file: File name of the training CSV inside `dataset_path`.
        validation_file: File name of the validation CSV inside `dataset_path`.
        feature_col: Name of the text column.
        label_col: Name of the label column.
        model_name: Pretrained model identifier/path passed to `from_pretrained`.
        batch_size: Batch size used for both DataLoaders.
        learning_rate: Learning rate for optimizers built from a name.
        num_epochs: Number of training epochs.
        max_len: Maximum sequence length (in tokens) used for truncation.
        optimizer: 'Adam', 'AdamW', or an already constructed optimizer object.
        device: Device string understood by `torch.device`.

    Raises:
        ValueError: If `optimizer` is None or an unsupported optimizer name.
    """

    # Optimizer names that can be built from a plain string.
    _SUPPORTED_OPTIMIZERS = ('Adam', 'AdamW')

    def __init__(self, dataset_path, train_file, validation_file, feature_col, label_col, model_name, batch_size, learning_rate, num_epochs, max_len, optimizer=None, device='cpu'):
        self.dataset_path = dataset_path
        self.train_file = train_file
        self.validation_file = validation_file
        self.feature_col = feature_col
        self.label_col = label_col
        self.model_name = model_name
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.max_len = max_len
        self.optimizer = optimizer
        self.device = torch.device(device)  # Convert device argument to torch.device

        # Fail fast on a bad optimizer spec, before mounting Drive or downloading the model.
        self._check_optimizer()

        drive.mount('/content/gdrive')  # Mount Google Drive

        # Load tokenizer; `model_max_length` is the non-deprecated spelling of `max_len`.
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name, model_max_length=self.max_len)

        # Load datasets
        self.train_df = pd.read_csv(os.path.join(self.dataset_path, self.train_file))
        self.validation_df = pd.read_csv(os.path.join(self.dataset_path, self.validation_file))

        # Label Transformation Guide:
        # ---------------------------
        # Many machine learning models, including those in PyTorch and Hugging Face Transformers, use zero-based indexing for classification tasks.
        # [1] For conventional labels [1, 2, 3, 4, 5], transform them to zero-based indexing [0, 1, 2, 3, 4] by subtracting 1.
        #     This allows training the model using zero-based labels and adjusting predictions accordingly.
        # [2] For zero-based labels (e.g., [0, 1, 2]), skip the label transformation step.
        #     Make sure the prediction phase does not add a +1 offset to predicted labels in that case.
        # [3] In general, preprocess your dataset to transform labels into a zero-based format (e.g., [0, 1, 2]) before model training.

        # Specific Label Transformations:
        # -------------------------------
        # [1] Convert labels [1, 2, 3, 4, 5] to [0, 1, 2, 3, 4] by subtracting 1. Example: [1 → 0, 2 → 1, 3 → 2, 4 → 3, 5 → 4]
        # [2] Convert labels [-1, 0, 1] to [0, 1, 2] by adding 1. Example: [-1 → 0, 0 → 1, 1 → 2]
        # [3] Convert string labels to numerical format before model training, using an explicit string-to-integer mapping.

        # Note on Prediction Phase:
        # --------------------------
        # [*] Reverse any label transformation during prediction to map model outputs back to the original label space.
        #     For example, if 1 was subtracted for training, add 1 to predicted indices to align with the original labels.

        # Example for labels [1..5] (disabled because the current dataset is already zero-based):
        # self.train_df[self.label_col] = self.train_df[self.label_col] - 1
        # self.validation_df[self.label_col] = self.validation_df[self.label_col] - 1

        # Number of classes = number of distinct labels seen in the training set.
        self.num_labels = len(self.train_df[self.label_col].unique())

        # Tokenize datasets
        self.tokenized_train = self.tokenize_dataset(self.train_df, self.feature_col, self.label_col)
        self.tokenized_validation = self.tokenize_dataset(self.validation_df, self.feature_col, self.label_col)

        # Model configuration
        self.model_config = BertConfig.from_pretrained(self.model_name, num_labels=self.num_labels)
        self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.model_config)
        self.model.to(self.device)

        # Optimizer (must be built after the model so it can reference its parameters)
        self.optimizer = self._build_optimizer()

        # DataLoaders
        self.train_dataloader = self.create_dataloader(self.tokenized_train)
        self.validation_dataloader = self.create_dataloader(self.tokenized_validation, shuffle=False)

    def _check_optimizer(self):
        """Raise ValueError if `self.optimizer` is None or an unknown optimizer name."""
        if self.optimizer is None:
            raise ValueError("Please provide an optimizer instance.")
        if isinstance(self.optimizer, str) and self.optimizer not in self._SUPPORTED_OPTIMIZERS:
            raise ValueError(
                f"Unsupported optimizer {self.optimizer!r}; expected one of "
                f"{self._SUPPORTED_OPTIMIZERS} or an optimizer instance."
            )

    def _build_optimizer(self):
        """Turn `self.optimizer` into an optimizer object bound to `self.model`.

        A supported name is instantiated with `self.learning_rate`; any
        non-string object is returned unchanged.
        """
        self._check_optimizer()
        if self.optimizer == 'Adam':
            return torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        if self.optimizer == 'AdamW':
            # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
            # weight_decay are pinned to the values the deprecated class defaulted to.
            return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate, eps=1e-6, weight_decay=0.0)
        return self.optimizer

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of `values`, or NaN when it is empty."""
        return sum(values) / len(values) if values else float('nan')

    def tokenize_dataset(self, df, feature_col, label_col):
        """Tokenize `df[feature_col]` and return `(encodings, labels)`.

        Texts are padded to the longest sequence in `df` and truncated to
        `self.max_len` tokens; `labels` is `df[label_col]` as a list.
        """
        encodings = self.tokenizer(list(df[feature_col]),
                                   padding=True,
                                   truncation=True,
                                   max_length=self.max_len,
                                   return_tensors='pt')
        return encodings, list(df[label_col])

    def create_dataloader(self, tokenized_dataset, shuffle=True):
        """Wrap an `(encodings, labels)` pair in a DataLoader with `self.batch_size`."""
        dataset = SequenceClassificationDataset(tokenized_dataset[0], tokenized_dataset[1])
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def _run_inference(self, dataloader):
        """Run one no-grad pass over `dataloader` in evaluation mode.

        Returns:
            `(labels, predictions, losses)`: the true labels, the argmax
            predictions, and the per-batch loss values.
        """
        self.model.eval()  # Set the model to evaluation mode
        all_labels = []
        all_predictions = []
        batch_losses = []

        with torch.no_grad():  # No autograd graph is needed for evaluation
            for batch in dataloader:
                inputs = {key: value.to(self.device) for key, value in batch.items()}
                outputs = self.model(**inputs)
                batch_losses.append(outputs.loss.item())
                predicted = torch.argmax(outputs.logits, dim=1)
                all_labels.extend(inputs["labels"].cpu().numpy())
                all_predictions.extend(predicted.cpu().numpy())

        return all_labels, all_predictions, batch_losses

    def evaluate_model(self, dataloader):
        """Return the accuracy of the model over all batches of `dataloader`."""
        all_labels, all_predictions, _ = self._run_inference(dataloader)
        return accuracy_score(all_labels, all_predictions)

    def train(self):
        """Train for `self.num_epochs` epochs, printing per-epoch training loss,
        validation loss and validation accuracy."""
        for epoch in range(self.num_epochs):
            self.model.train()  # Back to training mode (validation switches to eval mode)
            train_losses = []

            # Iterate over batches in the training data loader, displaying progress using tqdm
            for batch in tqdm(self.train_dataloader, desc=f'Epoch {epoch + 1}/{self.num_epochs}'):
                inputs = {key: value.to(self.device) for key, value in batch.items()}
                outputs = self.model(**inputs)
                loss = outputs.loss
                train_losses.append(loss.item())

                self.optimizer.zero_grad()  # Zero the gradients
                loss.backward()  # Backpropagate the gradients
                self.optimizer.step()  # Update the model parameters

            # Validation: loss and accuracy come from a single no-grad pass.
            labels, predictions, validation_losses = self._run_inference(self.validation_dataloader)
            validation_accuracy = accuracy_score(labels, predictions)

            print(f'Epoch {epoch + 1}/{self.num_epochs} - Training Loss: {self._mean(train_losses):.4f} - Validation Loss: {self._mean(validation_losses):.4f} - Validation Accuracy: {validation_accuracy:.4f}')

    def save_model(self, directory):
        """Save the fine-tuned model and its tokenizer into `directory`."""
        self.model.save_pretrained(directory)
        self.tokenizer.save_pretrained(directory)

# Usage: fine-tune, save the model, and report the elapsed wall-clock time
start_time = time.time()
model = 'bert'
model_name = 'bert-base-uncased'

## Hyperparameters
learning_rate = 2e-5
num_epochs = 3
batch_size = 6

# Maximum sequence length for truncation
# --------------------------------------
# BERT/RoBERTa accept at most 512 tokens per input (very roughly 512x5=2,560 characters).
# Inputs longer than the limit are truncated to it; shorter inputs are padded by the tokenizer.
# Longer texts can be handled by chunking or hierarchical processing (split the text into
#   segments, process each segment separately, combine the results), but these techniques
#   add complexity and have their own drawbacks.
max_len = 512

optimizer = 'Adam'  # 'Adam' or 'AdamW'
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # or force device = 'cpu'

## Paths and filenames
absolute_path = "/content/gdrive/My Drive/Projects/SpamNews/"
dataset_path = f"{absolute_path}Datasets/"
train_file = 'train_set.csv'
validation_file = 'validation_set.csv'
feature_col = 'Text'
label_col = 'Label'

# Directory name of the saved model; it encodes the hyperparameters of this run.
trained_model = (
    f"{model}_optimizer_{optimizer}_lr_{learning_rate}"
    f"_epochs_{num_epochs}_bs_{batch_size}_maxlen_{max_len}"
)

# Fine-Tuning Phase
classifier = BertFineTuning(
    dataset_path=dataset_path,
    train_file=train_file,
    validation_file=validation_file,
    feature_col=feature_col,
    label_col=label_col,
    model_name=model_name,
    batch_size=batch_size,
    learning_rate=learning_rate,
    num_epochs=num_epochs,
    max_len=max_len,
    optimizer=optimizer,
    device=device,
)
classifier.train()
classifier.save_model(f"{absolute_path}TrainedModels/{trained_model}")
print(f"Training time: {time.time() - start_time:.2f} seconds")

Mounted at /content/gdrive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 534/534 [03:51<00:00,  2.30it/s]


Epoch 1/3 - Training Loss: 0.1531 - Validation Loss: 0.1000 - Validation Accuracy: 0.9575


Epoch 2/3: 100%|██████████| 534/534 [03:57<00:00,  2.25it/s]


Epoch 2/3 - Training Loss: 0.0396 - Validation Loss: 0.0470 - Validation Accuracy: 0.9875


Epoch 3/3: 100%|██████████| 534/534 [03:57<00:00,  2.25it/s]


Epoch 3/3 - Training Loss: 0.0294 - Validation Loss: 0.0386 - Validation Accuracy: 0.9900
Training time: 877.48 seconds


## Use the fine-tuned BERT model to make predictions for a specific test set

In [2]:
import pandas as pd
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
from google.colab import drive

class BertPredictions:
    """Load a fine-tuned BERT classifier and use it to label texts.

    Constructing an instance mounts Google Drive and loads the model and
    tokenizer from `model_path`.

    Args:
        model_path: Directory holding the saved model and tokenizer.
        device: Device string understood by `torch.device`.
        max_len: Maximum number of tokens (special tokens included) fed to the model.
    """

    def __init__(self, model_path, device, max_len):
        drive.mount('/content/gdrive')  # Mount Google Drive
        self.model_path = model_path
        self.max_len = max_len
        self.device = torch.device(device)  # Convert device argument to torch.device
        self.model, self.tokenizer = self.load_fine_tuned_bert_model()

    def load_fine_tuned_bert_model(self):
        """Load model and tokenizer from `self.model_path`.

        Returns:
            `(model, tokenizer)`, with the model moved to `self.device` and
            switched to evaluation mode.
        """
        model = BertForSequenceClassification.from_pretrained(self.model_path)
        tokenizer = BertTokenizer.from_pretrained(self.model_path)
        model.to(self.device)
        model.eval()  # Inference only: set once here instead of on every predict() call
        return model, tokenizer

    def predict(self, input):
        """Return the predicted class index (int) for the text `input`.

        The returned index is zero-based. If labels were shifted for training
        (e.g. 1 was subtracted), the caller must shift the result back.
        """
        # A single tokenizer call adds the special tokens once and truncates to
        # max_len. The former encode -> decode -> tokenize -> encode round trip
        # duplicated [CLS]/[SEP] and could cut off the real [SEP].
        encoded = self.tokenizer(input,
                                 truncation=True,
                                 max_length=self.max_len,
                                 return_tensors="pt")
        encoded = {key: value.to(self.device) for key, value in encoded.items()}

        with torch.no_grad():
            logits = self.model(**encoded).logits  # Forward pass through the model

        # Index of the highest logit is the predicted label.
        return torch.argmax(logits, dim=1).item()

    @staticmethod
    def _backup_path(dataset_path, test_file):
        """Return the backup path for `test_file`: '<stem>_original<ext>' in `dataset_path`."""
        stem, ext = os.path.splitext(test_file)
        return os.path.join(dataset_path, f"{stem}_original{ext}")

    def predict_and_save(self, dataset_path, test_file, feature_col, prediction_col):
        """Predict a label for every row of a CSV and write the result back.

        The predictions are stored in `prediction_col`. The input file is
        renamed to its backup name and a new file with the original name is
        written, containing all loaded columns plus the predictions.

        Args:
            dataset_path: Directory containing `test_file`.
            test_file: File name of the CSV to label.
            feature_col: Name of the text column.
            prediction_col: Name of the column that receives the predictions.
        """
        test_path = os.path.join(dataset_path, test_file)
        test_df = pd.read_csv(test_path)

        # Predict everything first, so a failure mid-run leaves the input file untouched.
        predictions = [
            self.predict(content)
            for content in tqdm(test_df[feature_col], total=len(test_df))
        ]
        # Assigning the whole column at once keeps the integer dtype of the labels.
        test_df[prediction_col] = predictions

        # Backup the original file by renaming it, then save the results under the original name.
        os.rename(test_path, self._backup_path(dataset_path, test_file))
        test_df.to_csv(test_path, index=False)

# Usage: label the test set with a previously fine-tuned model
max_len = 512

# Name of the saved model directory; it encodes the training hyperparameters.
str_params = 'bert_optimizer_Adam_lr_2e-05_epochs_3_bs_6_maxlen_512'
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Prefer the GPU when one is available
optimizer = "Adam"  # Optimizer the model was trained with (informational)

## Paths and filenames
path = "/content/gdrive/My Drive/Projects/SpamNews/"
dataset_path = f"{path}Datasets/"
test_file = "test_set.csv"
trained_model = f"{path}TrainedModels/{str_params}"  # The fine-tuned model
feature_col = 'Text'
prediction_col = f"{str_params}_prediction"

# Load the fine-tuned model and tokenizer
prediction = BertPredictions(model_path=trained_model, device=device, max_len=max_len)

# Predict a label for every row of the test set and write the results to CSV
prediction.predict_and_save(
    dataset_path=dataset_path,
    test_file=test_file,
    feature_col=feature_col,
    prediction_col=prediction_col,
)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


100%|██████████| 1000/1000 [00:34<00:00, 29.00it/s]
