In [1]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
os.environ['WANDB_MODE'] = 'disabled'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
data_path = "../dataset/dataset.csv"
df = pd.read_csv(data_path)
print("Data columns:", df.columns)
df.head()

Data columns: Index(['text', 'labels'], dtype='object')


Unnamed: 0,text,labels
0,Question: Does it have a baby picture on it th...,0
1,Question: Does it have a baby picture on it th...,2
2,Question: Does it have a baby picture on it th...,1
3,Question: Does the product image show a laptop...,0
4,Question: Is there an email mentioned in the d...,0


In [4]:
def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.

    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array where each row represents
              an observation, and each column represents the probability of
              that observation belonging to a certain class.

    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score, which is the harmonic mean of precision
              and recall. Macro averaging calculates the metric independently for
              each class and then takes the average.
            - Precision (float): The macro precision, which is the number of true
              positives divided by the sum of true positives and false positives.
            - Recall (float): The macro recall, which is the number of true positives
              divided by the sum of true positives and false negatives.
    """
    # Extract true labels from the input object
    labels = pred.label_ids

    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions.argmax(-1)

    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')

    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)

    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [5]:
labels = df['labels'].unique().tolist()
labels.sort()
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

In [6]:
id2label, label2id

({0: 0, 1: 1, 2: 2}, {0: 0, 1: 1, 2: 2})

In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['labels'], test_size=0.2, random_state=42
)

In [8]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", max_length=512)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, return_tensors="pt")



In [9]:
class CustomDataset(Dataset):
    """
    Custom Dataset class for handling tokenized text data and corresponding labels.
    Inherits from torch.utils.data.Dataset.
    """
    def __init__(self, encodings, labels):
        """
        Initializes the DataLoader class with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized input text data
                              (e.g., 'input_ids', 'token_type_ids', 'attention_mask').
            labels (list): A list of integer labels for the input text data.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns a dictionary containing tokenized data and the corresponding label for a given index.

        Args:
            idx (int): The index of the data item to retrieve.

        Returns:
            item (dict): A dictionary containing the tokenized data and the corresponding label.
        """
        # Retrieve tokenized data for the given index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add the label for the given index to the item dictionary
        item['labels'] = torch.tensor(self.labels.iloc[idx]).clone().detach()
        return item

    def __len__(self):
        """
        Returns the number of data items in the dataset.

        Returns:
            (int): The number of data items in the dataset.
        """
        return len(self.labels)

In [10]:
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

In [12]:
num_labels = len(label2id)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
      "train-checkpoints", 
      num_train_epochs=10, 
      eval_strategy="epoch", 
      weight_decay=5e-4, 
      per_device_train_batch_size=64,
      per_device_eval_batch_size=64,
      save_strategy="epoch",
      fp16=True,
      load_best_model_at_end=True
)

In [14]:
trainer = Trainer(
    # the pre-trained model that will be fine-tuned
    model=model,
     # training arguments that we defined above
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics= compute_metrics
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [15]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0016,0.000315,0.999957,0.999957,0.999957,0.999957
2,0.0106,0.000508,0.999957,0.999957,0.999957,0.999957
3,0.0,0.000496,0.999957,0.999957,0.999957,0.999957
4,0.0004,0.000535,0.999957,0.999957,0.999957,0.999957
5,0.0004,0.000697,0.999936,0.999936,0.999936,0.999936
6,0.0004,0.000666,0.999936,0.999936,0.999936,0.999936
7,0.0004,0.000672,0.999936,0.999936,0.999936,0.999936
8,0.0,0.00032,0.999957,0.999957,0.999957,0.999957
9,0.0002,0.000423,0.999936,0.999936,0.999936,0.999936
10,0.0007,0.000432,0.999936,0.999936,0.999936,0.999936


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=29370, training_loss=0.0018914954078212193, metrics={'train_runtime': 6148.7897, 'train_samples_per_second': 305.678, 'train_steps_per_second': 4.777, 'total_flos': 7.533928963803058e+16, 'train_loss': 0.0018914954078212193, 'epoch': 10.0})

In [21]:
import numpy as np
from sklearn.metrics import classification_report

# Make prediction on evaluation dataset
y_pred = trainer.predict(val_dataset).predictions
y_pred = np.argmax(y_pred, axis=-1)

# Get the true labels
y_true = val_dataset.labels
y_true = np.array(y_true)

# Print the classification report
print(classification_report(y_true, y_pred, digits=3))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

           0      1.000     1.000     1.000     15600
           1      1.000     1.000     1.000     15614
           2      1.000     1.000     1.000     15775

    accuracy                          1.000     46989
   macro avg      1.000     1.000     1.000     46989
weighted avg      1.000     1.000     1.000     46989

