In [7]:
import pandas as pd
import evaluate
import numpy as np
import torch

from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorWithPadding,  AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict

# Reading

## What is a transformer
https://zhuanlan.zhihu.com/p/413267911

## Work approach
https://huggingface.co/docs/transformers/en/tasks/sequence_classification

## Concepts
- What is **AutoTokenizer**  
https://www.cnblogs.com/chenhuabin/p/16997607.html


- What is an **attention mask**  
https://huggingface.co/docs/transformers/glossary#attention-mask


- Why we need **padding** or **truncation**  
https://huggingface.co/docs/transformers/en/pad_truncation

- Why use **Dataset.map** and **DataCollatorWithPadding**  
  https://zhuanlan.zhihu.com/p/414552021

In [2]:
# Locations of the training and validation splits.
TRAIN_CSV = "data/train.csv"
VALID_CSV = "data/valid.csv"

# Class-index <-> class-name mappings handed to the model config.
# A classification head with num_labels=5 produces class indices 0..4, so the
# mapping must be keyed 0..4. With keys 1..5, index 0 has no name (KeyError when
# the inference pipeline looks up the predicted class) and key 5 is unreachable.
# NOTE(review): this assumes the `label` column in the CSV files is already
# 0-based (0..4). If the files store 1..5 ratings, subtract 1 when loading —
# TODO confirm against the data.
id2label = {0: "WORST", 1: "BAD", 2: "NEUTRAL", 3: "GOOD", 4: "EXCELLENT"}
# Derived from id2label so the two mappings can never drift apart.
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Accuracy metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [3]:
def load_data(file_path):
    """Read a labelled CSV file and return its rows as a list of records.

    Args:
        file_path: Path of the CSV file to read. The file must contain at
            least the columns ``text`` and ``label``.

    Returns:
        A list with one dict per row; each dict holds only the keys
        ``"text"`` and ``"label"``.

    Raises:
        KeyError: If the ``text`` or ``label`` column is missing.
    """
    # Read the file the caller asked for. Previously the training CSV was
    # always read regardless of the argument, so the "validation" split was
    # a copy of the training split.
    frame = pd.read_csv(file_path)
    return frame[["text", "label"]].to_dict("records")

def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the shared tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Evaluation metric
def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` is unpacked into a (predictions, labels) pair. The predicted
    class for each row is the argmax along axis 1 of the predictions, and the
    result of ``accuracy.compute`` is returned unchanged.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [4]:
# Load each split as a list of {"text", "label"} records.
train_data = load_data(TRAIN_CSV)
valid_data = load_data(VALID_CSV)

# Create a Dataset Dictionary object for hugging face's pipeline.
# Note that the validation records are stored under the "test" key.
data = DatasetDict(
    {
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(valid_data),
    }
)

# Tokenize every split in batches.
tokenized_data = data.map(preprocess_function, batched=True)

# Collator that pads using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

In [5]:
# Load the pretrained DistilBERT checkpoint with a 5-class sequence
# classification head, attaching the label mappings to the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Hyperparameters and checkpointing policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    # Optimisation settings.
    num_train_epochs=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and saving are both configured per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, tokenized splits, collator and metric into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/11250 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Write the trained model to the local directory "elton"; the inference cell
# below loads it from the same path.
trainer.save_model(output_dir="elton")

# Inference

In [None]:
from transformers import pipeline

In [None]:

# Build a pipeline from the model saved under "elton" and classify one
# sample sentence; the last expression is displayed as the cell output.
classifier = pipeline(task="sentiment-analysis", model="elton")
classifier("How are you")