
# Notes
This notebook fine-tunes a DistilBERT pre-trained model for multi-class classification of an imbalanced dataset -- the HuffingtonPost news dataset. 

Code was adapted from HuggingFace's tutorials and demo notebooks linked here: https://huggingface.co/docs/transformers/en/model_doc/distilbert, and known to work on Windows 11 desktop with single GPU. 

2024-02-12: adjusted to work on M1 Macbook Air, but did not complete a single training run due to long runtime. 


In [None]:
# ! pip install --quiet pandas numpy datasets transformers torch accelerate -U #for use in Google Colab

In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datetime import datetime

In [None]:
torch.cuda.is_available()
# Check whether GPU device is available. On my M1 Macbook Air, there is no GPU device

False

In [33]:
# File paths
PATH_RAW = r"/data/data_processed_12cats.csv" # <<< Change this to point to local path
USECOLS = ['text', 'category']

PATH_TOKENIZER = "distilbert-base-uncased"
PATH_MODEL = "distilbert-base-uncased"

# Training parameters
NUM_LABELS = 12
RANDOM_STATE = 42
set_seed(0)
TEST_SIZE = 0.2

NUM_TRAIN_EPOCHS = 20
BATCH_SIZE = 8 
LEARNING_RATE = 5e-5

In [13]:
def label_id_mapper(df, label_col, label_id_col):
    """Function to map category labels to numerical id's

    Args:
        df (pandas dataframe): dataframe containing text and category label columns
        label_col (str): name of category label column
        label_id_col (str): name of category numerical id column (to be assigned)

    Returns:
        df, id2label: dataframe with new column of mapped id's, id2label dictionary
    """

    classes = np.unique(df[label_col]).tolist()
    print(f"Mapping labels to {len(classes)} id's...")
    # Create dictionary for list of cat labels
    id2label = dict(enumerate(classes))
    # Map labels to their id's
    df[label_id_col] = df[label_col].apply(lambda x: [k for k,v in id2label.items() if x==v][0])

    return df, id2label

In [47]:
def compute_metrics(eval_pred):
    """Computes evaluation metrics from predictions (their logits) and actual labels

    Args:
        eval_pred (tuple): set of predictions from a transformer (DistilBERT) classifier
    Returns:
        dictionary of accuracy, f1, precision and recall scores
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true=labels,
                                                               y_pred=predictions,
                                                               average='weighted',
                                                               zero_division=0)
    acc = accuracy_score

    return {
        'accuracy' : acc,
        'f1_weighted' : f1,
        'precision' : precision,
        'recall' : recall,
    }

In [41]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

def eval_test(chkpt):
    model = AutoModelForSequenceClassification.from_pretrained(rf'trainer\\checkpoint-{chkpt}', num_labels=NUM_LABELS)
    trainer = Trainer(model=model,
                      compute_metrics=compute_metrics, )
    predictions = trainer.predict(test_dataset=tokenized_dataset['test'])
    labels = tokenized_dataset['test']['label']
    return predictions, labels

def report_matrix(test, predictions, labels, id2label):
    test['label_id_pred'] = np.argmax(predictions[0], axis=-1)
    test['label_id_pred'] = np.argmax(predictions[0], axis=-1)
    test['label_id_pred'] = np.argmax(predictions[0], axis=-1)
    # Plot confusion matrix and print report
    cm = confusion_matrix(labels, np.argmax(predictions[0], axis=-1), labels=[k for k,v in id2label.items() ])
    disp = ConfusionMatrixDisplay(cm, display_labels=[v for k,v in id2label.items()])
    disp.plot(xticks_rotation=90);
    print(classification_report(y_true=test['label_true'], y_pred=test['label_pred']))


In [21]:
df = pd.read_csv(PATH_RAW, usecols=USECOLS)
df.rename(columns={'category':'label'}, inplace=True)

In [40]:
df1, id2label = label_id_mapper(df, 'label', 'label_id')
id2label

Mapping labels to 12 id's...


{0: 'BUSINESS',
 1: 'COMEDY',
 2: 'ENTERTAINMENT',
 3: 'FOOD & DRINK',
 4: 'HEALTHY LIVING',
 5: 'PARENTING',
 6: 'POLITICS',
 7: 'QUEER VOICES',
 8: 'SPORTS',
 9: 'STYLE & BEAUTY',
 10: 'TRAVEL',
 11: 'WELLNESS'}

In [49]:
df1.head(3)

Unnamed: 0,label,text,label_id
0,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,1
1,PARENTING,The Funniest Tweets From Parents This Week (Se...,5
2,SPORTS,"Maury Wills, Base-Stealing Shortstop For Dodge...",8


As this is an extremely imbalanced data set, we will correct for this during training by using class weights. This is intended to help the model concentrate more on training to identify the smaller classes, by giving less weightage for correctly classifying large classes, and more weightage to doing well for the minor classes.

In [27]:
df['label'].value_counts(normalize=True)

label
POLITICS          0.263456
WELLNESS          0.133063
ENTERTAINMENT     0.127561
TRAVEL            0.073135
STYLE & BEAUTY    0.072749
PARENTING         0.065186
HEALTHY LIVING    0.049481
FOOD & DRINK      0.047011
QUEER VOICES      0.046663
BUSINESS          0.044372
COMEDY            0.039826
SPORTS            0.037498
Name: proportion, dtype: float64

In [57]:
df1 = df1[['text', 'label_id']]
df1.rename(columns={'label_id':'label'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.rename(columns={'label_id':'label'}, inplace=True)


In [58]:
df1.head(2)

Unnamed: 0,text,label
0,23 Of The Funniest Tweets About Cats And Dogs ...,1
1,The Funniest Tweets From Parents This Week (Se...,5


In [59]:
from sklearn.utils import class_weight
# Create array of class weights
class_weights = class_weight.compute_class_weight(class_weight='balanced', 
                                                  classes=np.unique(df1['label']),
                                                  y=df1['label'])

# Check on what are the class weights assigned to categories
class_weights_dict = dict(zip(np.unique(df1['label']), class_weights))
# class_weights_dict1 = dict(zip(np.unique(df1['label_id']), class_weights))
class_weights_dict
# TODO ADD IN CUSTOM TRAINER LATER TO USE CLASS WEIGHTS DURING TRAINING

{0: 1.8780776515151516,
 1: 2.0924253708185936,
 2: 0.6532823732294755,
 3: 1.7726209253417455,
 4: 1.6841625455816973,
 5: 1.278400257839457,
 6: 0.31630781499202554,
 7: 1.7858599502092272,
 8: 2.2223485597521586,
 9: 1.1454914551693678,
 10: 1.13945216127615,
 11: 0.6262700845175072}

In [60]:
tokenizer = AutoTokenizer.from_pretrained(PATH_TOKENIZER)#"distilbert-base-uncased") # Downloads from HuggingFace

In [61]:
# Function to tokenize text in a dataset
def tokenize(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

In [69]:
# https://androidkt.com/how-to-use-class-weight-in-crossentropyloss-for-an-imbalanced-dataset/
# https://discuss.huggingface.co/t/how-can-i-use-class-weights-when-training/1067/7

from torch import nn
# device = torch.device('cuda') # if there is a GPU available
device = 'mps' # Use this instead for M1 Macbook Air

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        #compute custom loss
        loss_func = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float).to(device)) #for GPU
        loss = loss_func(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


In [70]:
# Set training parameters
training_args = TrainingArguments(output_dir="trainer", 
                                  logging_dir="./logs",
                                  num_train_epochs=NUM_TRAIN_EPOCHS,
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  warmup_steps=100,
                                  use_mps_device=True, # added this for M1 Macbook Air (but note Deprecation warning)
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  learning_rate=LEARNING_RATE,
                                  logging_steps=10,
                                  load_best_model_at_end=True,
                                  )



In [71]:
# Do Cross Validation across whole dataset
from sklearn.model_selection import StratifiedKFold
n = 3
kf = StratifiedKFold(n_splits=n)
results = []

# device = torch.device('cuda') # Comment out if no GPU available
# device = "mps"

for train_idx, test_idx in kf.split(df1['text'], df1['label']):

    # Split dataframe
    train_df = df1.iloc[train_idx]
    test_df = df1.iloc[test_idx]

    # Make Dataset objects from dataframes
    trds = Dataset.from_pandas(train_df)
    tds = Dataset.from_pandas(test_df)

    # Make Dataset dictionary
    ds = DatasetDict()
    ds['train'] = trds
    ds['test'] = tds

    # Tokenize datasets
    tokenized_dataset = ds.map(tokenize, batched=True)

    # Shuffle at random
    train_ds = tokenized_dataset['train'].shuffle(seed=RANDOM_STATE).select(range(len(ds['train'])))
    test_ds = tokenized_dataset['test'].shuffle(seed=RANDOM_STATE).select(range(len(ds['test'])))

    model = AutoModelForSequenceClassification.from_pretrained(PATH_MODEL, num_labels=NUM_LABELS)
    # model.cuda()

    trainer = CustomTrainer(model=model,
                            args=training_args,
                            train_dataset=train_ds,
                            eval_dataset=test_ds,
                            compute_metrics=compute_metrics,
                            )
    
    start = datetime.now()
    print(start)
    trainer.train()
    print(rf"Time taken: {datetime.now() - start}")

    best_chkpt = trainer.state.best_model_checkpoint

    # Load model from best checkpoint
    predictions, labels = eval_test(chkpt=best_chkpt.split('-')[-1])
    results.append(predictions.metrics)

print("Results: ", results)
df_r = pd.DataFrame(results)
df_r

Map:   0%|          | 0/89907 [00:00<?, ? examples/s]

Map:   0%|          | 0/44954 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2024-02-12 12:57:07.722090


  0%|          | 0/224780 [00:00<?, ?it/s]

{'loss': 2.4871, 'learning_rate': 5e-06, 'epoch': 0.0}
{'loss': 2.4812, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 2.5028, 'learning_rate': 1.5e-05, 'epoch': 0.0}
{'loss': 2.4871, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 2.4945, 'learning_rate': 2.5e-05, 'epoch': 0.0}
{'loss': 2.4951, 'learning_rate': 3e-05, 'epoch': 0.01}
{'loss': 2.461, 'learning_rate': 3.5e-05, 'epoch': 0.01}


KeyboardInterrupt: 

CV code above successfully runs after adjusting "MPS" device setting for running on Macbook Air, but estimated to take too long. 
Will need to adjust to reduce running time to get through 1 run. e.g. use a smaller sample of the dataset, lower epochs, higher learning rate. 