<a href="https://colab.research.google.com/github/AlvinChiew/DeepLearning/blob/main/HuggingFace_TextClassification_Customized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install transformers

[K     |████████████████████████████████| 2.3MB 25.2MB/s 
[K     |████████████████████████████████| 901kB 52.5MB/s 
[K     |████████████████████████████████| 3.3MB 52.7MB/s 
[?25h

# Import Modules

In [2]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.datasets import fetch_20newsgroups

import torch

from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback

In [3]:
# torch.cuda.get_device_name(0)

# Load Data

In [4]:
# The categories that can be requested are listed at:
# https://scikit-learn.org/stable/datasets/real_world.html#usage
categories = ['sci.electronics', 'sci.med', 'sci.space', 'sci.crypt']

# subset='all' fetches everything in one piece (no predefined split), which
# mimics a custom dataset.
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# Inspect which fields the fetched dataset object exposes
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [6]:
# Tally the documents per class once and reuse the tally below.
label_counts = Counter(data["target"])
NUM_LABELS = len(label_counts)      # number of distinct classes

print(len(data["target"]))          # total number of documents
print(label_counts)                 # documents per class
print(NUM_LABELS)

3952
Counter({0: 991, 2: 990, 3: 987, 1: 984})
4


# Config

In [7]:
# Checkpoint name used to load both the tokenizer and the classification model
PRETRAINED_MODEL_NAME = 'bert-base-uncased'

# Pre-Process Data

In [8]:
# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [9]:
X = data["data"]        # documents
y = data["target"]      # class id of each document

# Two successive 80/20 splits: first hold out the test set, then carve the
# validation set out of the remainder (64% train / 16% val / 20% test overall).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

In [10]:
# Settings shared by all three splits: pad, and truncate anything longer than
# 512 tokens (the maximum length of BERT-based models).
TOKENIZER_KWARGS = dict(padding=True, truncation=True, max_length=512)

X_train_tokenized = tokenizer(X_train, **TOKENIZER_KWARGS)
X_val_tokenized = tokenizer(X_val, **TOKENIZER_KWARGS)
X_test_tokenized = tokenizer(X_test, **TOKENIZER_KWARGS)

In [11]:
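# `encode_plus` is not necessary: calling `tokenizer` directly accepts a whole list of texts. BERT's tokenizer uses WordPiece sub-word tokenization (not stemming), splitting unknown terms into smaller pieces (down to single characters).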

# X_val_tokenized = {}
# for s in X_val:
#     t = tokenizer.encode_plus(s, padding=True, truncation=True)
#     if list(t.keys())[0] not in X_val_tokenized:
#         for k in t.keys():
#             X_val_tokenized[k] = [t[k]]
#     else:
#         for k in t.keys():
#             X_val_tokenized[k] += [t[k]]

In [12]:
# create torch-compatible `Dataset` object
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset wrapping tokenized features and optional labels.

    Args:
        encodings: Mapping from feature name to a sequence holding one entry
            per sample. It must contain the key ``"input_ids"``, which is used
            to determine the dataset length.
        labels: Optional sequence with one label per sample. Leave as ``None``
            for unlabeled data; items then carry no ``"labels"`` entry.

    Raises:
        ValueError: If ``labels`` is given and its length differs from the
            number of encoded samples.
    """

    def __init__(self, encodings, labels=None):
        # Fail early on misaligned inputs instead of hitting an IndexError
        # (or silently ignoring surplus labels) in the middle of training.
        if labels is not None:
            n_samples = len(encodings["input_ids"])
            if len(labels) != n_samples:
                raise ValueError(
                    f"Got {len(labels)} labels for {n_samples} encoded samples; "
                    "the two must have the same length."
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one per feature.

        A ``"labels"`` tensor is added when the dataset was built with labels.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [13]:
# Training and validation sets carry labels; the test set is built without them.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)
test_dataset = Dataset(encodings=X_test_tokenized)

In [14]:
# train_dataset.__len__()
# train_dataset.__getitem__(0)

# Train Model

In [15]:
# make `model` callable to be initialized in Trainer for reproducibility
def model_init():
    """Build a fresh sequence-classification model from the pretrained checkpoint.

    Returns:
        The model loaded from ``PRETRAINED_MODEL_NAME`` with a classification
        head sized for ``NUM_LABELS`` classes.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL_NAME,
        num_labels=NUM_LABELS,
    )
    return classifier

# If the error below appears while setting up `trainer`, check that `num_labels` matches the number of classes:
# RuntimeError: CUDA error: device-side assert triggered

# model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

In [16]:
# define model evaluation metric
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds one row of
            per-class scores per sample; the predicted class is the argmax
            along axis 1.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use binary averaging when
        ``NUM_LABELS`` is 2 and macro averaging otherwise.
    """
    class_scores, y_true = p
    y_hat = np.argmax(class_scores, axis=1)

    if NUM_LABELS == 2:
        averaging = 'binary'
    else:
        averaging = 'macro'

    results = {"accuracy": accuracy_score(y_true=y_true, y_pred=y_hat)}
    averaged_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in averaged_scorers:
        results[metric_name] = scorer(y_true=y_true, y_pred=y_hat, average=averaging)
    return results

In [17]:
# define Trainer
args = TrainingArguments(
    output_dir="model_output",
    evaluation_strategy="epoch",
    # Checkpoint on the same schedule as evaluation. `load_best_model_at_end`
    # can only reload a checkpoint that was evaluated when it was saved; with
    # the default step-based saving the checkpoints never line up with the
    # per-epoch evaluations (and newer transformers releases reject the mismatch).
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=42,
    load_best_model_at_end=True,    # best = lowest validation loss unless `metric_for_best_model` is set
)

# `model_init` (rather than a ready-made `model`) lets the Trainer create the
# model itself, for reproducibility.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [18]:
# Fine-tune the model; with the arguments above, evaluation on `val_dataset` runs once per epoch.
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.081663,0.979463,0.979385,0.979265,0.979292
2,0.232800,0.089908,0.984202,0.984011,0.984104,0.983992
3,0.232800,0.079379,0.984202,0.984388,0.983551,0.983877


TrainOutput(global_step=948, training_loss=0.13900442364849622, metrics={'train_runtime': 942.9466, 'train_samples_per_second': 1.005, 'total_flos': 0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 3684868096, 'init_mem_gpu_alloc_delta': 439078400, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -359796736, 'train_mem_gpu_alloc_delta': 1386276352, 'train_mem_cpu_peaked_delta': 374894592, 'train_mem_gpu_peaked_delta': 6504055808})

# Evaluation

In [19]:
# `raw_pred` has one row per test sample and one column per class. The values
# are the model's raw class scores (logits), not probabilities.
raw_pred = trainer.predict(test_dataset).predictions

# The predicted class is the column index with the highest score.
y_pred = np.argmax(raw_pred, axis=1)

In [20]:
# Accuracy of the predicted classes on the held-out test split
accuracy_score(y_test, y_pred)

0.9797724399494311