In [23]:
import pandas as pd
import evaluate
import numpy as np
import torch
import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, log_loss
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorWithPadding,  AutoModelForSequenceClassification, TrainingArguments, Trainer, TrainerCallback
from datasets import load_dataset, Dataset, DatasetDict
from copy import deepcopy

# Reading

## What is a Transformer
https://zhuanlan.zhihu.com/p/413267911

## Work approach
https://huggingface.co/docs/transformers/en/tasks/sequence_classification

## Concepts
- What is **AutoTokenizer**  
https://www.cnblogs.com/chenhuabin/p/16997607.html


- What is **attention mask**  
https://huggingface.co/docs/transformers/glossary#attention-mask


- Why we need **padding** and **truncation**  
https://huggingface.co/docs/transformers/en/pad_truncation

- Why use **Dataset.map** and **DataCollatorWithPadding**  
  https://zhuanlan.zhihu.com/p/414552021

In [24]:
# Locations of the training / validation splits, relative to the notebook's working directory.
TRAIN_CSV = "data/train.csv"
VALID_CSV = "data/valid.csv"

# Checkpoint name shared by the tokenizer and the classifier.
PRE_TRAINED_MODEL = "distilbert-base-uncased"

# Seed handed to TrainingArguments (both `seed` and `data_seed`).
SEED = 4332

# Human-readable names for the five rating classes.
# NOTE(review): these mappings are keyed 1..5, while the model is created with
# num_labels=5; Hugging Face examples key id2label by 0-based class index.
# Confirm how the CSV `label` column is encoded before relying on these names.
id2label = {1: "WORST", 2: "BAD", 3: "NEUTRAL", 4: "GOOD", 5: "EXCELLENT"}
# Inverse mapping, derived so the two dictionaries cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}


# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL)
# tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Accuracy metric consumed by compute_metrics.
accuracy = evaluate.load("accuracy")

In [29]:
def load_data(file_path):
    """Read a CSV file and return its `text` / `label` columns as a list of records.

    Args:
        file_path: Path (or file-like object) accepted by `pandas.read_csv`.

    Returns:
        A list with one dict per row, each holding the keys "text" and "label".
        Any other columns in the file are discarded.
    """
    frame = pd.read_csv(file_path)
    wanted = frame.loc[:, ['text', 'label']]
    return wanted.to_dict(orient='records')

def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the module-level tokenizer.

    Truncation is enabled; padding is not requested here.

    Args:
        examples: Mapping with a "text" entry (used with `Dataset.map(..., batched=True)`
            in this notebook).

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Evaluation metric
def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (model outputs, reference labels).
            Presumably the outputs are per-class scores of shape
            (n_samples, n_classes) - TODO confirm.

    Returns:
        The result of `accuracy.compute` for the arg-max predictions.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class along axis 1 for every sample.
    predicted = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted, references=references)

In [30]:
# Load both splits as lists of {"text": ..., "label": ...} records.
train_data = load_data(TRAIN_CSV)
valid_data = load_data(VALID_CSV)

# Fail fast on labels the classifier cannot represent. A head with N outputs only
# accepts integer class ids in [0, N); anything else either crashes deep inside the
# loss computation or, on backends without bounds checks, silently trains on garbage.
num_classes = len(id2label)
for split_name, rows in (("train", train_data), ("validation", valid_data)):
    unexpected_labels = {row["label"] for row in rows} - set(range(num_classes))
    if unexpected_labels:
        raise ValueError(
            f"{split_name} split contains labels {unexpected_labels!r}; the model expects "
            f"integer class ids in [0, {num_classes}). Re-encode the labels "
            f"(e.g. subtract 1 from 1-based ratings) before training."
        )

# Create a Dataset Dictionary object for hugging face's pipeline
data = DatasetDict(
    {
        "train": Dataset.from_list(train_data),
        "validation": Dataset.from_list(valid_data),
    }
)
# Tokenize every example in batches; padding is applied later, per batch, by the collator.
tokenized_data = data.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [31]:
# Load the pre-trained checkpoint with a 5-class sequence-classification head and
# attach the label-name mappings to its configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL,
    num_labels=5,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
class CustomCallback(TrainerCallback):
    """Trainer callback that also evaluates on the training set at the end of an epoch.

    The extra evaluation is reported under the "train" metric prefix.
    """

    def __init__(self, trainer) -> None:
        """Keep a reference to the trainer whose train_dataset will be evaluated."""
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Run a training-set evaluation whenever an evaluation is scheduled.

        Returns:
            A deep copy of `control` taken before the extra evaluation, or None
            when `control.should_evaluate` is falsy.
        """
        if not control.should_evaluate:
            return None
        # Snapshot the control flags first - presumably because evaluate() may
        # change them; TODO confirm against the Trainer implementation.
        control_snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return control_snapshot


# More args could be found here:
# https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Reuse one seed for both the run and the data order.
    seed=SEED,
    data_seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    # Tokenized splits produced earlier in the notebook.
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Also report metrics on the training split at the end of each epoch.
trainer.add_callback(CustomCallback(trainer))

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/2250 [00:00<?, ?it/s]

{'loss': 0.803, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.44}
{'loss': 0.7077, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.89}


  0%|          | 0/1125 [00:00<?, ?it/s]

{'train_loss': 0.6070576310157776, 'train_accuracy': 0.49083333333333334, 'train_runtime': 86.7251, 'train_samples_per_second': 207.552, 'train_steps_per_second': 12.972, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Checkpoint destination directory my_awesome_model/checkpoint-1125 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.6869535446166992, 'eval_accuracy': 0.463, 'eval_runtime': 4.6298, 'eval_samples_per_second': 431.986, 'eval_steps_per_second': 26.999, 'epoch': 1.0}
{'loss': 0.6402, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.33}
{'loss': 0.6164, 'learning_rate': 2.222222222222222e-06, 'epoch': 1.78}


  0%|          | 0/1125 [00:00<?, ?it/s]

{'train_loss': 0.538798987865448, 'train_accuracy': 0.5272777777777777, 'train_runtime': 43.7907, 'train_samples_per_second': 411.046, 'train_steps_per_second': 25.69, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Checkpoint destination directory my_awesome_model/checkpoint-2250 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.6811787486076355, 'eval_accuracy': 0.468, 'eval_runtime': 4.6097, 'eval_samples_per_second': 433.868, 'eval_steps_per_second': 27.117, 'epoch': 2.0}
{'train_runtime': 483.2851, 'train_samples_per_second': 74.49, 'train_steps_per_second': 4.656, 'train_loss': 0.6822109646267361, 'epoch': 2.0}


TrainOutput(global_step=2250, training_loss=0.6822109646267361, metrics={'train_runtime': 483.2851, 'train_samples_per_second': 74.49, 'train_steps_per_second': 4.656, 'train_loss': 0.6822109646267361, 'epoch': 2.0})

In [None]:
# Write the trained model to the "elton" directory.
trainer.save_model(output_dir="elton")

In [None]:
# Open a wide 10x4 figure; no plotting calls follow in this cell yet.
plt.figure(figsize=(10, 4))


In [28]:
# Show the log records stored on the trainer's state (rendered as this cell's output).
trainer.state.log_history

[{'loss': 0.4519,
  'learning_rate': 1.555555555555556e-05,
  'epoch': 0.44,
  'step': 500},
 {'loss': 0.375,
  'learning_rate': 1.1111111111111113e-05,
  'epoch': 0.89,
  'step': 1000},
 {'train_loss': 0.3231489062309265,
  'train_accuracy': 0.6293888888888889,
  'train_runtime': 43.2683,
  'train_samples_per_second': 416.009,
  'train_steps_per_second': 26.001,
  'epoch': 1.0,
  'step': 1125},
 {'eval_loss': 0.3231489062309265,
  'eval_accuracy': 0.6293888888888889,
  'eval_runtime': 43.0981,
  'eval_samples_per_second': 417.652,
  'eval_steps_per_second': 26.103,
  'epoch': 1.0,
  'step': 1125},
 {'loss': 0.2974,
  'learning_rate': 6.666666666666667e-06,
  'epoch': 1.33,
  'step': 1500},
 {'loss': 0.2524,
  'learning_rate': 2.222222222222222e-06,
  'epoch': 1.78,
  'step': 2000},
 {'train_loss': 0.25691720843315125,
  'train_accuracy': 0.6616666666666666,
  'train_runtime': 43.3174,
  'train_samples_per_second': 415.537,
  'train_steps_per_second': 25.971,
  'epoch': 2.0,
  'step': 

# Inference

In [None]:
from transformers import pipeline

In [None]:

# Build a sentiment-analysis pipeline from the model stored under "elton"
# and try it on a single sentence.
classifier = pipeline(task="sentiment-analysis", model="elton")
classifier("How are you")