In [None]:
import os

# Hub credential handed to TrainingArguments(hub_token=...) further down.
# Prefer a token supplied through the HF_TOKEN environment variable so the real
# secret never has to be pasted into (and saved with) the notebook. When the
# variable is unset or empty, fall back to the original placeholder string.
hub_token = os.environ.get("HF_TOKEN") or "<HUB_TOKEN>"

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# CSV file backing each split; insertion order fixes the split order below.
csv_by_split = {
    "train": "./data/hateval2019_en_train.csv",
    "dev": "./data/hateval2019_en_dev.csv",
    "test": "./data/hateval2019_en_test.csv",
}
split_names = list(csv_by_split)

# Stack the three CSVs into one frame whose outer index level ("split")
# records which file each row came from; the inner level ("index") keeps the
# per-file row number.
dataframe = pd.concat(
    [pd.read_csv(csv_by_split[name]) for name in split_names],
    keys=split_names,
    names=["split", "index"],
)

# Slice the stacked frame back out per split and wrap each slice as a Dataset.
datasets = DatasetDict(
    {name: Dataset.from_pandas(dataframe.loc[name]) for name in split_names}
)
datasets


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Base checkpoint that is fine-tuned below; the tokenizer and the
# classification model are both loaded from the same identifier.
base_checkpoint = "vinai/bertweet-covid19-base-cased"

# normalization=True is forwarded to the tokenizer as a keyword argument.
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint, normalization=True)
model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_preds):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        eval_preds: A pair ``(pred_logits, labels_logits)``. Both members must
            support ``.argmax(axis=1)``; each row is reduced to the index of
            its largest entry before scoring.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, targets = eval_preds

    # Collapse per-class rows to class indices on both sides so the scorers
    # compare like with like.
    predicted_classes = scores.argmax(axis=1)
    true_classes = targets.argmax(axis=1)

    accuracy = accuracy_score(true_classes, predicted_classes)
    macro_f1 = f1_score(true_classes, predicted_classes, average="macro")
    return {"accuracy": accuracy, "f1": macro_f1}


In [None]:
import numpy as np


def indice2logits(indice, num_classes):
    """One-hot encode a batch of class indices.

    Args:
        indice: Sequence of integer class indices, one per example. May be
            empty, in which case an array with zero rows is returned.
        num_classes: Width of each one-hot row.

    Returns:
        dict with the single key ``"label_logits"`` holding a float array of
        shape ``(len(indice), num_classes)`` that is 1.0 at each example's
        class index and 0.0 elsewhere.

    Raises:
        IndexError: If an index is out of range for ``num_classes`` or the
            (non-empty) input is not of integer type.
    """
    indice = np.asarray(indice)
    if indice.size == 0:
        # An empty list becomes a float64 array, which NumPy refuses to use as
        # an index; give it an integer dtype so an empty batch yields an empty
        # result instead of raising.
        indice = indice.astype(np.intp)

    num_rows = len(indice)
    logits = np.zeros((num_rows, num_classes), dtype=float)
    # Pair row i with column indice[i] and set exactly that cell.
    logits[np.arange(num_rows), indice] = 1.0
    return {"label_logits": logits}


# Tokenize the "text" column of every split in batches of 128.
# padding="max_length" only pads shorter inputs up to max_length; without
# truncation=True any longer input would keep its full length and the encoded
# rows would no longer share one width, so truncation is enabled explicitly.
# NOTE(review): confirm that max_length=192 does not exceed the maximum
# sequence length supported by the loaded checkpoint.
datasets = datasets.map(
    lambda rec: tokenizer(
        rec["text"],
        padding="max_length",
        truncation=True,
        max_length=192,
        pad_to_multiple_of=8,
        return_token_type_ids=True,
        return_attention_mask=True,
    ),
    batched=True,
    keep_in_memory=True,
    batch_size=128,
)

# Turn the integer "HS" column into two-column one-hot float rows, stored
# under "label_logits".
datasets = datasets.map(
    lambda rec: indice2logits(rec["HS"], 2),
    batched=True,
    keep_in_memory=True,
)

# Expose the one-hot rows under the column name "labels".
datasets = datasets.rename_column("label_logits", "labels")
# Empty list: no columns are dropped here.
datasets = datasets.remove_columns([])
datasets


In [None]:
from transformers import TrainingArguments, Trainer

import os

from transformers.trainer_utils import get_last_checkpoint

# Fine-tuning configuration. Evaluation, logging and checkpointing all happen
# once per epoch, and the checkpoint with the best "f1" (as reported by
# compute_metrics) is reloaded when training ends. Each optimizer step
# accumulates 4 batches of 32 examples per device.
training_args = TrainingArguments(
    output_dir="outputs/bertweet",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    num_train_epochs=30,
    logging_strategy="epoch",
    remove_unused_columns=True,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=128,
    optim="adamw_apex_fused",
    bf16=True,
    tf32=True,
    learning_rate=1e-6,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=True,
    hub_strategy="all_checkpoints",
    hub_model_id="ChrisZeng/bertweet-base-cased-covid19-hateval",
    hub_token=hub_token,
)

# Train on the "train" split and evaluate on "dev".
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["dev"],
    compute_metrics=compute_metrics,
)

# resume_from_checkpoint=True raises ValueError when output_dir holds no
# checkpoint, which breaks a first run on a clean machine. Look the checkpoint
# up explicitly instead: resume from the most recent one if it exists,
# otherwise (None) start training from the loaded weights.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)

trainer_output = trainer.train(
    resume_from_checkpoint=last_checkpoint,
)
trainer.save_model()


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Load the fine-tuned tokenizer and classifier back from the Hub repository
# that the training run pushed to.
finetuned_checkpoint = "ChrisZeng/bertweet-base-cased-covid19-hateval"
tokenizer = AutoTokenizer.from_pretrained(finetuned_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finetuned_checkpoint)

from transformers import TrainingArguments, Trainer

# Inference-only Trainer: separate output directory, progress bars disabled.
training_args = TrainingArguments(
    output_dir="outputs/inference",
    overwrite_output_dir=True,
    remove_unused_columns=True,
    eval_accumulation_steps=128,
    disable_tqdm=True,
)
trainer = Trainer(model=model, args=training_args)

# Predict on the held-out test split and take the highest-scoring class per row.
test_output = trainer.predict(datasets["test"])
preds = test_output.predictions.argmax(axis=1)

# Gold labels come straight from the original integer "HS" column.
labels = datasets["test"]["HS"]

# Last expression of the cell: the test-set scores.
dict(
    accuracy=accuracy_score(labels, preds),
    f1=f1_score(labels, preds, average="macro"),
)
