In [10]:
import pandas as pd
from datasets import Dataset, DatasetDict

# HatEval 2019 (English) CSV file for each split, in the order they are stacked.
split_files = {
    "train": "./data/hateval2019_en_train.csv",
    "dev": "./data/hateval2019_en_dev.csv",
    "test": "./data/hateval2019_en_test.csv",
}
split_names = list(split_files)

# Stack the per-split frames under a two-level row index named ("split", "index").
dataframe = pd.concat(
    [pd.read_csv(split_files[name]) for name in split_names],
    keys=split_names,
    names=["split", "index"],
)

# Selecting the outer level returns one split's rows; the remaining named
# index level is carried into the Dataset as the "index" feature.
datasets = DatasetDict(
    {name: Dataset.from_pandas(dataframe.loc[name]) for name in split_names}
)
datasets


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'HS', 'TR', 'AG', 'index'],
        num_rows: 9000
    })
    dev: Dataset({
        features: ['id', 'text', 'HS', 'TR', 'AG', 'index'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'text', 'HS', 'TR', 'AG', 'index'],
        num_rows: 3000
    })
})

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Single source of truth for the checkpoint used by both tokenizer and model.
checkpoint_name = "vinai/bertweet-covid19-base-cased"

# NOTE(review): `normalization=True` is forwarded to the tokenizer class;
# per the BERTweet documentation it enables tweet normalisation - confirm.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name, normalization=True)

# No label count is passed, so the classification head uses the library default.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/vinai/bertweet-covid19-base-cased/resolve/main/config.json from cache at /home/chris-zeng/.cache/huggingface/transformers/b0d7660a1cf1cc386b57bf9307b4bc6f23b17e384049be92e2068b42dd6faafc.a2b6026e688d1b19cebc0981d8f3a5b1668eabfda55b2c42049d5eac0bc8cb2d
Model config RobertaConfig {
  "_name_or_path": "vinai/bertweet-covid19-base-cased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_ty

In [12]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_preds):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores of shape ``(n_samples, n_classes)``, or a tuple
            whose first element does. ``labels`` may be either per-class rows
            of the same shape (e.g. one-hot) or a 1-D array of class indices.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``f1`` is macro-averaged.
    """
    pred_logits, labels = eval_preds
    # When a model returns several outputs they arrive as a tuple; the
    # classification scores are the first entry.
    if isinstance(pred_logits, tuple):
        pred_logits = pred_logits[0]
    preds = np.asarray(pred_logits).argmax(axis=1)

    labels = np.asarray(labels)
    # Per-class rows are collapsed to class indices; 1-D input is assumed to
    # already contain class indices and is used as-is.
    if labels.ndim > 1:
        labels = labels.argmax(axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro"),
    }


In [13]:
import numpy as np


def indice2logits(indice, num_classes):
    """One-hot encode a batch of class indices.

    Args:
        indice: Sequence or array of integer class indices, one per example.
            Boolean values are treated as the integers 0 and 1.
        num_classes: Number of columns in each one-hot row.

    Returns:
        dict: ``{"label_logits": ndarray}`` where the array is float with
        shape ``(len(indice), num_classes)``, holding 1.0 in each example's
        class column and 0.0 everywhere else.

    Raises:
        IndexError: If an index lies outside ``[0, num_classes)``.
    """
    indice = np.asarray(indice)
    # An empty batch is inferred as float64 and a boolean batch would be
    # interpreted as a mask; neither is a valid integer index, so convert both.
    if indice.size == 0 or indice.dtype == np.bool_:
        indice = indice.astype(np.int64)

    # Negative values would silently wrap around to the last classes under
    # NumPy indexing (e.g. a -1 "unlabelled" placeholder), so reject them
    # together with values that are too large.
    if indice.dtype.kind in "iu":
        out_of_range = (indice < 0) | (indice >= num_classes)
        if out_of_range.any():
            raise IndexError(
                f"class indices must be in [0, {num_classes}); "
                f"got {indice[out_of_range][:5].tolist()}"
            )

    logits = np.zeros([len(indice), num_classes], dtype=float)
    logits[np.arange(len(indice)), indice] = 1.0
    return {"label_logits": logits}


# The loaded model config reports max_position_embeddings=130, i.e. room for
# 128 real tokens once the RoBERTa padding offset is accounted for. Inputs are
# therefore truncated and padded to 128 so that an unusually long tweet cannot
# overflow the position-embedding table (and no compute is spent on padding
# beyond what the model can use).
MAX_SEQ_LENGTH = 128

# Tokenise the raw tweet text into fixed-length model inputs.
datasets = datasets.map(
    lambda rec: tokenizer(
        rec["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        pad_to_multiple_of=8,
        return_token_type_ids=True,
        return_attention_mask=True,
    ),
    batched=True,
    keep_in_memory=True,
    batch_size=128,
)

# One-hot encode the binary "HS" column straight into "labels". Writing the
# target column directly (instead of creating "label_logits" and renaming it)
# keeps this cell re-runnable: a second run overwrites "labels" rather than
# failing on a rename into an existing column.
datasets = datasets.map(
    lambda rec: {"labels": indice2logits(rec["HS"], 2)["label_logits"]},
    batched=True,
    keep_in_memory=True,
)
datasets


  0%|          | 0/71 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'HS', 'TR', 'AG', 'index', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9000
    })
    dev: Dataset({
        features: ['id', 'text', 'HS', 'TR', 'AG', 'index', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'text', 'HS', 'TR', 'AG', 'index', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [5]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="outputs/bertweet",
    overwrite_output_dir=True,
    # Evaluate, log and checkpoint once per epoch so the three stay aligned,
    # which load_best_model_at_end requires.
    evaluation_strategy="epoch",
    num_train_epochs=50,
    logging_strategy="epoch",
    remove_unused_columns=True,
    # 32 examples per device x 4 accumulation steps = 128 examples per update.
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=128,
    optim="adamw_apex_fused",
    bf16=True,
    tf32=True,
    learning_rate=1e-6,
    save_strategy="epoch",
    # A checkpoint is written every epoch; without a cap, 50 epochs leave 50
    # full model copies on disk. The best checkpoint is still retained when
    # load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Train on the "train" split and score each epoch on "dev" via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["dev"],
    compute_metrics=compute_metrics,
)


Using amp half precision backend


In [6]:
# Run fine-tuning; the value returned by train() is kept for later inspection.
trainer_output = trainer.train()
# Persist the model held by the trainer.
# NOTE(review): with no argument this presumably writes to the configured
# output_dir ("outputs/bertweet") - confirm against the Trainer documentation.
trainer.save_model()


***** Running training *****
  Num examples = 9000
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 3500


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.6878,0.678662,0.573,0.364272
1,0.6779,0.666452,0.573,0.364272
2,0.6621,0.641345,0.667,0.584775
3,0.6312,0.602661,0.719,0.700743
4,0.5891,0.565582,0.737,0.734705
5,0.5512,0.540997,0.748,0.746294
6,0.5223,0.525015,0.754,0.752988
7,0.4998,0.511515,0.757,0.755474
8,0.4808,0.502811,0.769,0.768398
9,0.4634,0.492053,0.78,0.779308


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to outputs/bertweet/checkpoint-70
Configuration saved in outputs/bertweet/checkpoint-70/config.json
Model weights saved in outputs/bertweet/checkpoint-70/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to outputs/bertweet/checkpoint-140
Configuration saved in outputs/bertweet/checkpoint-140/config.json
Model weights saved in outputs/bertweet/checkpoint-140/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to outputs/bertweet/checkpoint-210
Configuration saved in outputs/bertweet/checkpoint-210/config.json
Model weights saved in outputs/bertweet/checkpoint-210/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to outputs/bertweet/checkpoint-280
Configuration saved in outputs/bertweet/checkpoint-280/config.json


In [19]:
# Score the held-out test split: take the highest-scoring class per example
# and compare it with the integer "HS" column.
test_output = trainer.predict(datasets["test"])
preds = test_output.predictions.argmax(axis=1)
labels = datasets["test"]["HS"]

dict(
    accuracy=accuracy_score(labels, preds),
    f1=f1_score(labels, preds, average="macro"),
)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text, TR, AG, index, HS. If id, text, TR, AG, index, HS are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 8


{'accuracy': 0.544, 'f1': 0.5105746407064516}