In [1]:
import json

with open("secrets.json", "r") as secrets_file:
    secrets = json.load(secrets_file)

from transformers import AutoTokenizer, AutoModelForSequenceClassification

electra_tokenizer = AutoTokenizer.from_pretrained(
    "google/electra-large-discriminator", normalization=True
)


In [2]:
from utils import load_tweeteval, indice2logits
import pandas as pd

hypotheses = [
    "This sentence contains profanity or a targeted offense.",
    "This sentence contains insult, threat, profanity or swear words.",
    "This sentence contains insult, threat, profanity, swear words or targeted offense.",
]

offenseval_dataset_dicts = {
    hypothesis: (
        load_tweeteval()["offensive"]
        .map(
            lambda rec: {"labels": (pd.Series(rec["labels"]) * (-2) + 2).values},
            batched=True,
            batch_size=1024,
        )
        .map(
            lambda rec: (indice2logits(rec["labels"], 3)), batched=True, batch_size=1024
        )
        .rename_columns({"labels": "label_categoricals", "label_logits": "labels"})
        .rename_columns({"text": "premise"})
        .map(
            lambda rec: {"hypothesis": len(rec["premise"]) * [hypothesis]},
            batched=True,
            batch_size=1024,
        )
        .map(
            lambda rec: electra_tokenizer(
                rec["premise"],
                rec["hypothesis"],
                padding="longest",
                pad_to_multiple_of=8,
                return_token_type_ids=True,
                return_attention_mask=True,
            ),
        )
    )
    for hypothesis in hypotheses
}

results = {}



  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/11916 [00:00<?, ?ex/s]

  0%|          | 0/1324 [00:00<?, ?ex/s]

  0%|          | 0/860 [00:00<?, ?ex/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/11916 [00:00<?, ?ex/s]

  0%|          | 0/1324 [00:00<?, ?ex/s]

  0%|          | 0/860 [00:00<?, ?ex/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/11916 [00:00<?, ?ex/s]

  0%|          | 0/1324 [00:00<?, ?ex/s]

  0%|          | 0/860 [00:00<?, ?ex/s]

In [3]:
from transformers import TrainingArguments, Trainer
from utils import trainer_compute_metrics
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"


training_args = TrainingArguments(
    num_train_epochs=15,
    learning_rate=1e-6,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    output_dir="outputs/electra-nli-efl-offenseval",
    overwrite_output_dir=True,
    dataloader_num_workers=4,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=True,
    eval_accumulation_steps=128,
    optim="adamw_torch",
    bf16=True,
    tf32=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

datasets = offenseval_dataset_dicts[
    "This sentence contains profanity or a targeted offense."
]

trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(
        "ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli",
        num_labels=3,
    ),
    tokenizer=electra_tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    compute_metrics=trainer_compute_metrics,
)

trainer_output = trainer.train()
trainer.save_model()


Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running training *****
  Num examples = 11916
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 16
  Total optimization steps = 1395


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.4464,0.403205,0.766616,0.730446
1,0.3655,0.358493,0.777946,0.745929
2,0.3302,0.337694,0.777946,0.761262
3,0.3103,0.32029,0.787009,0.760295
4,0.2967,0.308521,0.79003,0.765013
5,0.288,0.30566,0.793051,0.76575
6,0.2825,0.307096,0.793807,0.766469
7,0.2772,0.304263,0.790785,0.765729
8,0.2745,0.298986,0.780967,0.759922
9,0.2701,0.299368,0.786254,0.76537


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Evaluation *****
  Num examples = 1324
  Batch size = 8
Saving model checkpoint to outputs/electra-nli-efl-offenseval/checkpoint-93
Configuration saved in outputs/electra-nli-efl-offenseval/checkpoint-93/config.json
Model weights saved in outputs/electra-nli-efl-offenseval/checkpoint-93/pytorch_model.bin
tokenizer config file saved in outputs/electra-nli-efl-offenseval/checkpoint-93/tokenizer_config.json
Special tokens file saved in outputs/electra-nli-efl-offenseval/checkpoint-93/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Evaluation *****
  Num examples = 1324
  Batch size = 8
Saving model checkpoint 

In [4]:
from utils import get_metrics, f1_macro, get_labels
from sklearn.metrics import accuracy_score
import torch

results["This sentence contains profanity or a targeted offense."] = get_metrics(
    lambda inputs: trainer.predict(inputs).predictions.argmax(axis=1),
    datasets,
    get_labels(datasets, "label_categoricals"),
    ["train", "val", "test"],
    {"accuracy": accuracy_score, "f1": f1_macro},
)

del trainer
del trainer_output
torch.cuda.empty_cache()

results["This sentence contains profanity or a targeted offense."]


The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 11916
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 1324
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 860
  Batch size = 8


[('train', 'accuracy', 0.8404665995300437),
 ('train', 'f1', 0.8179708378918003),
 ('val', 'accuracy', 0.7938066465256798),
 ('val', 'f1', 0.7691121037734583),
 ('test', 'accuracy', 0.8558139534883721),
 ('test', 'f1', 0.8123209269910103)]

In [5]:
from transformers import TrainingArguments, Trainer
from utils import trainer_compute_metrics

datasets = offenseval_dataset_dicts[
    "This sentence contains insult, threat, profanity or swear words."
]

trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(
        "ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli",
        num_labels=3,
    ),
    tokenizer=electra_tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    compute_metrics=trainer_compute_metrics,
)

trainer_output = trainer.train()
trainer.save_model()


loading configuration file https://huggingface.co/ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli/resolve/main/config.json from cache at /home/chris-zeng/.cache/huggingface/transformers/767fab951e9d8c432dc3775f2943a5208b7e3f6975863a23aaeba306a1c5980e.3104f0cd2cbab9afd68c2c65670667c1b6c00aa3da4c65b894d38f853ed1eb71
Model config ElectraConfig {
  "_name_or_path": "ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "el

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.4644,0.403364,0.768127,0.733207
1,0.373,0.358055,0.779456,0.742428
2,0.3329,0.348588,0.759819,0.746637
3,0.3133,0.31837,0.782477,0.755471
4,0.299,0.308477,0.783233,0.755064
5,0.2918,0.304142,0.789275,0.760774
6,0.2838,0.304205,0.791541,0.764315
7,0.2793,0.299098,0.78852,0.765355
8,0.2738,0.299009,0.785498,0.76419
9,0.2727,0.299354,0.783988,0.764149


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Evaluation *****
  Num examples = 1324
  Batch size = 8
Saving model checkpoint to outputs/electra-nli-efl-offenseval/checkpoint-93
Configuration saved in outputs/electra-nli-efl-offenseval/checkpoint-93/config.json
Model weights saved in outputs/electra-nli-efl-offenseval/checkpoint-93/pytorch_model.bin
tokenizer config file saved in outputs/electra-nli-efl-offenseval/checkpoint-93/tokenizer_config.json
Special tokens file saved in outputs/electra-nli-efl-offenseval/checkpoint-93/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Evaluation *****
  Num examples = 1324
  Batch size = 8
Saving model checkpoint 

In [6]:
from utils import get_metrics, f1_macro, get_labels
from sklearn.metrics import accuracy_score
import torch

results["This sentence contains insult, threat, profanity or swear words."] = get_metrics(
    lambda inputs: trainer.predict(inputs).predictions.argmax(axis=1),
    datasets,
    get_labels(datasets, "label_categoricals"),
    ["train", "val", "test"],
    {"accuracy": accuracy_score, "f1": f1_macro},
)

del trainer
del trainer_output
torch.cuda.empty_cache()

results["This sentence contains insult, threat, profanity or swear words."]


The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 11916
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 1324
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 860
  Batch size = 8


[('train', 'accuracy', 0.8402148371936892),
 ('train', 'f1', 0.8188765080715483),
 ('val', 'accuracy', 0.790785498489426),
 ('val', 'f1', 0.7684752107723531),
 ('test', 'accuracy', 0.8511627906976744),
 ('test', 'f1', 0.8068283917340521)]

In [7]:
from transformers import TrainingArguments, Trainer
from utils import trainer_compute_metrics

datasets = offenseval_dataset_dicts[
    "This sentence contains insult, threat, profanity, swear words or targeted offense."
]

trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(
        "ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli",
        num_labels=3,
    ),
    tokenizer=electra_tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    compute_metrics=trainer_compute_metrics,
)

trainer_output = trainer.train(resume_from_checkpoint="outputs/electra-nli-efl-offenseval/checkpoint-930")
trainer.save_model()


loading configuration file https://huggingface.co/ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli/resolve/main/config.json from cache at /home/chris-zeng/.cache/huggingface/transformers/767fab951e9d8c432dc3775f2943a5208b7e3f6975863a23aaeba306a1c5980e.3104f0cd2cbab9afd68c2c65670667c1b6c00aa3da4c65b894d38f853ed1eb71
Model config ElectraConfig {
  "_name_or_path": "ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "el

0it [00:00, ?it/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
10,0.2722,0.299882,0.792296,0.769666
11,0.2672,0.297811,0.795317,0.771307
12,0.2658,0.297389,0.791541,0.769431
13,0.2618,0.298355,0.792296,0.770855
14,0.2644,0.298506,0.786254,0.765602


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Evaluation *****
  Num examples = 1324
  Batch size = 8
Saving model checkpoint to outputs/electra-nli-efl-offenseval/checkpoint-1023
Configuration saved in outputs/electra-nli-efl-offenseval/checkpoint-1023/config.json
Model weights saved in outputs/electra-nli-efl-offenseval/checkpoint-1023/pytorch_model.bin
tokenizer config file saved in outputs/electra-nli-efl-offenseval/checkpoint-1023/tokenizer_config.json
Special tokens file saved in outputs/electra-nli-efl-offenseval/checkpoint-1023/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Evaluation *****
  Num examples = 1324
  Batch size = 8
Saving model c

In [8]:
from utils import get_metrics, f1_macro, get_labels
from sklearn.metrics import accuracy_score
import torch

results["This sentence contains insult, threat, profanity, swear words or targeted offense."] = get_metrics(
    lambda inputs: trainer.predict(inputs).predictions.argmax(axis=1),
    datasets,
    get_labels(datasets, "label_categoricals"),
    ["train", "val", "test"],
    {"accuracy": accuracy_score, "f1": f1_macro},
)

del trainer
del trainer_output
torch.cuda.empty_cache()

results["This sentence contains insult, threat, profanity, swear words or targeted offense."]


The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 11916
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 1324
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: premise, hypothesis, label_categoricals.
***** Running Prediction *****
  Num examples = 860
  Batch size = 8


[('train', 'accuracy', 0.8379489761664988),
 ('train', 'f1', 0.8170080115175432),
 ('val', 'accuracy', 0.7915407854984894),
 ('val', 'f1', 0.7694305422001468),
 ('test', 'accuracy', 0.8534883720930233),
 ('test', 'f1', 0.811470447000856)]

In [9]:
results

{'This sentence contains profanity or a targeted offense.': [('train',
   'accuracy',
   0.8404665995300437),
  ('train', 'f1', 0.8179708378918003),
  ('val', 'accuracy', 0.7938066465256798),
  ('val', 'f1', 0.7691121037734583),
  ('test', 'accuracy', 0.8558139534883721),
  ('test', 'f1', 0.8123209269910103)],
 'This sentence contains insult, threat, profanity or swear words.': [('train',
   'accuracy',
   0.8402148371936892),
  ('train', 'f1', 0.8188765080715483),
  ('val', 'accuracy', 0.790785498489426),
  ('val', 'f1', 0.7684752107723531),
  ('test', 'accuracy', 0.8511627906976744),
  ('test', 'f1', 0.8068283917340521)],
 'This sentence contains insult, threat, profanity, swear words or targeted offense.': [('train',
   'accuracy',
   0.8379489761664988),
  ('train', 'f1', 0.8170080115175432),
  ('val', 'accuracy', 0.7915407854984894),
  ('val', 'f1', 0.7694305422001468),
  ('test', 'accuracy', 0.8534883720930233),
  ('test', 'f1', 0.811470447000856)]}