In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import  AutoModelForSequenceClassification, AutoTokenizer
import os
# Switch Weights & Biases off through its environment variable; this is set up-front,
# before any Trainer is constructed further down the notebook.
os.environ["WANDB_MODE"] = "disabled"

In [28]:
# Read the labelled corpus (a `text` column and a `labels` column) into a DataFrame
dataset_csv = "../dataset/dataset.csv"
df = pd.read_csv(dataset_csv)

In [29]:
# Print column names, non-null counts, dtypes and memory usage of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234944 entries, 0 to 234943
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    234944 non-null  object
 1   labels  234944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.6+ MB


In [30]:
# Pre-trained checkpoint shared by the classifier and the tokenizer below
checkpoint = "distilbert-base-uncased"

# Sequence-classification model built on the checkpoint, with a 3-class output head
classifier = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

# Tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)

In [31]:
# Stratified 80/20 split on the label column, with a fixed seed so the split is repeatable.
# NOTE(review): the split is row-based; if dataset.csv contains repeated `text` values the
# same sentence can land in both splits and inflate the eval metrics — TODO confirm there
# are no duplicates.
df_train, df_eval = train_test_split(
    df,
    train_size=0.8,
    stratify=df["labels"],
    random_state=42,
)

In [32]:
from datasets import Dataset, DatasetDict
# Wrap both pandas splits as Hugging Face datasets under the "train" / "eval" keys.
# from_pandas carries the (shuffled) pandas index along as an `__index_level_0__` column.
raw_datasets = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame)
        for split_name, split_frame in (("train", df_train), ("eval", df_eval))
    }
)

In [33]:
# Sanity-check the converted datasets: overall structure, the train schema, and one sample row
for heading, value in (
    ("Dataset Dict:\n", raw_datasets),
    ("\n\nTrain's features:\n", raw_datasets["train"].features),
    ("\n\nFirst row of Train:\n", raw_datasets["train"][0]),
):
    print(heading, value)

Dataset Dict:
 DatasetDict({
    train: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 187955
    })
    eval: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 46989
    })
})


Train's features:
 {'text': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), '__index_level_0__': Value(dtype='int64', id=None)}


First row of Train:
 {'text': "Question: Does the packaging emphasize child safety? Answer: Yes, it mentions 'safe for children with rounded edges,' but lacks official safety certifications.", 'labels': 1, '__index_level_0__': 63701}


In [34]:
def _tokenize_batch(batch):
    """Tokenize the `text` field of a batch of rows, truncating over-long inputs."""
    return tokenizer(batch["text"], truncation=True)


# batched=True hands the tokenizer many texts per call instead of one row at a time
tokenized_datasets = raw_datasets.map(_tokenize_batch, batched=True)
print(tokenized_datasets)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 187955/187955 [00:04<00:00, 40847.43 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46989/46989 [00:01<00:00, 43081.45 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 187955
    })
    eval: Dataset({
        features: ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 46989
    })
})





In [35]:
# Look at one tokenized training example (original fields plus input_ids / attention_mask)
first_tokenized_row = tokenized_datasets["train"][0]
print(first_tokenized_row)

{'text': "Question: Does the packaging emphasize child safety? Answer: Yes, it mentions 'safe for children with rounded edges,' but lacks official safety certifications.", 'labels': 1, '__index_level_0__': 63701, 'input_ids': [101, 3160, 1024, 2515, 1996, 14793, 17902, 2775, 3808, 1029, 3437, 1024, 2748, 1010, 2009, 9704, 1005, 3647, 2005, 2336, 2007, 8352, 7926, 1010, 1005, 2021, 14087, 2880, 3808, 10618, 2015, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [36]:
# Drop the columns that are not model inputs (raw text and the pandas index carried over
# by from_pandas). Because this cell rebinds `tokenized_datasets`, a second run would find
# the columns already gone and raise; only columns still present in every split are
# removed, which makes the cell safe to re-run.
columns_to_drop = [
    column
    for column in ("text", "__index_level_0__")
    if all(column in split.column_names for split in tokenized_datasets.values())
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 187955
    })
    eval: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 46989
    })
})


In [37]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
import numpy as np
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch  # local import: only used to probe for a CUDA device below

# Collator that pads every batch using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training args: checkpoints go to "train-checkpoints"; evaluation and saving both run once
# per epoch, and the best checkpoint is reloaded when training finishes.
training_args = TrainingArguments(
    "train-checkpoints",
    num_train_epochs=10,
    evaluation_strategy="epoch",
    weight_decay=5e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    save_strategy="epoch",
    # Mixed precision is enabled only when a CUDA device is present, so the cell also
    # runs on CPU-only machines instead of failing on a hard-coded fp16=True.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
)

# Metric for validation error
def compute_metrics(pred):
    """
    Compute accuracy and macro-averaged precision, recall and F1 for one evaluation pass.

    Args:
        pred (obj): An object exposing ``label_ids`` and ``predictions`` attributes.
            - label_ids (array-like): True class labels, one per observation.
            - predictions (array-like or tuple): Per-class scores with one row per
              observation and one column per class. The scores do not have to be
              probabilities (the Trainer typically passes raw logits); only the
              position of the row maximum is used. If a tuple is given, its first
              element is taken as the score array.

    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score (per-class F1, averaged with equal
              weight per class).
            - Precision (float): The macro precision (per-class TP / (TP + FP),
              averaged with equal weight per class).
            - Recall (float): The macro recall (per-class TP / (TP + FN),
              averaged with equal weight per class).
    """
    # True labels for the evaluated examples
    labels = pred.label_ids

    # Some models return several outputs; in that case the scores are the first entry.
    scores = pred.predictions
    if isinstance(scores, tuple):
        scores = scores[0]

    # Predicted class = column index of the largest score in each row
    preds = scores.argmax(-1)

    # Macro-averaged precision / recall / F1. zero_division=0 yields the same 0.0 that
    # sklearn would report for a class that is never predicted, without the warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )

    # Overall fraction of correct predictions
    acc = accuracy_score(labels, preds)

    # Keys are what the Trainer shows (prefixed) in its evaluation logs
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

# Wire the model, training arguments, tokenized splits, padding collator and metric
# function into a single Trainer instance
trainer = Trainer(
    model=classifier,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [12]:
# Run the fine-tuning loop and show the resulting training summary as the cell output
train_output = trainer.train()
train_output

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0002,0.000276,0.999957,0.999957,0.999957,0.999957
2,0.0007,0.000251,0.999979,0.999979,0.999979,0.999979
3,0.0012,0.000268,0.999979,0.999979,0.999979,0.999979
4,0.0,6e-06,1.0,1.0,1.0,1.0
5,0.0,0.000262,0.999979,0.999979,0.999979,0.999979
6,0.0005,8e-05,0.999979,0.999979,0.999979,0.999979
7,0.0,0.000181,0.999979,0.999979,0.999979,0.999979
8,0.0,0.000113,0.999979,0.999979,0.999979,0.999979
9,0.0,0.000116,0.999979,0.999979,0.999979,0.999979
10,0.0,0.000118,0.999979,0.999979,0.999979,0.999979


TrainOutput(global_step=29370, training_loss=0.0006539980641949714, metrics={'train_runtime': 2715.7721, 'train_samples_per_second': 692.087, 'train_steps_per_second': 10.815, 'total_flos': 2.7599966301479564e+16, 'train_loss': 0.0006539980641949714, 'epoch': 10.0})

In [13]:
from sklearn.metrics import classification_report

# Run inference once over the evaluation split
eval_output = trainer.predict(tokenized_datasets["eval"])

# Predicted class = index of the largest score in each row
y_pred = np.argmax(eval_output.predictions, axis=-1)

# Take the true labels from the same prediction output so that labels and predictions
# come from one pass over the data; fall back to the dataset column if none were returned.
if eval_output.label_ids is not None:
    y_true = np.asarray(eval_output.label_ids)
else:
    y_true = np.array(tokenized_datasets["eval"]["labels"])

# Per-class precision / recall / F1 plus the averaged rows, to three decimals
print(classification_report(y_true, y_pred, digits=3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000     15667
           1      1.000     1.000     1.000     15665
           2      1.000     1.000     1.000     15657

    accuracy                          1.000     46989
   macro avg      1.000     1.000     1.000     46989
weighted avg      1.000     1.000     1.000     46989

