In [1]:
import json

with open("secrets.json", "r") as secrets_file:
    secrets = json.load(secrets_file)

from utils import load_tweeteval

hateval = load_tweeteval()["hate"]

results = {}


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaModel, T5EncoderModel


bertweet_tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-covid19-base-cased", normalization=True
)
bertweet_model = RobertaModel.from_pretrained("vinai/bertweet-covid19-base-cased")
bertweet_ft_model = RobertaModel.from_pretrained(
    "ChrisZeng/bertweet-base-cased-covid19-hateval"
)
bertweet_ft_classifier = RobertaForSequenceClassification.from_pretrained(
    "ChrisZeng/bertweet-base-cased-covid19-hateval"
)
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")
t5_model = T5EncoderModel.from_pretrained("google/t5-efficient-large")

transformer_models = {
    "bertweet": {"tokenizer": bertweet_tokenizer, "model": bertweet_model},
    "bertweet-ft": {"tokenizer": bertweet_tokenizer, "model": bertweet_ft_model},
    "t5": {"tokenizer": t5_tokenizer, "model": t5_model},
}


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-covid19-base-cased were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ChrisZeng/bertweet-base-cased-covid19-hateval were not used when initializing RobertaModel: ['

In [3]:
from utils import indice2logits

hateval = hateval.map(
    lambda rec: indice2logits(rec["labels"], 2), batched=True, batch_size=1024
).rename_columns({"labels": "label_categoricals", "label_logits": "labels"})

for model_name, model_group in transformer_models.items():
    transformer_models[model_name]["datasets"] = hateval.map(
        lambda rec: model_group["tokenizer"](
            rec["text"],
            padding="longest",
            pad_to_multiple_of=8,
            return_token_type_ids=True,
            return_attention_mask=True,
        ),
        batched=True,
        batch_size=None,
    )

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs/inference",
    overwrite_output_dir=True,
    eval_accumulation_steps=128,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)


  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [4]:
import torch
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
from utils import get_metrics, get_labels


trainer = Trainer(
    model=bertweet_ft_classifier, tokenizer=bertweet_tokenizer, args=training_args
)

with torch.no_grad():
    results["baseline"] = get_metrics(
        lambda inputs: trainer.predict(inputs).predictions.argmax(axis=1),
        transformer_models["bertweet-ft"]["datasets"],
        get_labels(transformer_models["bertweet-ft"]["datasets"], "label_categoricals"),
        ["train", "val", "test"],
        {"accuracy": accuracy_score, "f1": f1_score},
    )
    
del trainer
results["baseline"]


The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, label_categoricals.
***** Running Prediction *****
  Num examples = 9000
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, label_categoricals.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, label_categoricals.
***** Running Prediction *****
  Num examples = 2970
  Batch size = 8


[('train', 'accuracy', 0.846),
 ('train', 'f1', 0.8256603773584905),
 ('val', 'accuracy', 0.768),
 ('val', 'f1', 0.7588357588357587),
 ('test', 'accuracy', 0.5461279461279461),
 ('test', 'f1', 0.6401494927923118)]

In [6]:
for model_name, model_group in transformer_models.items():
    trainer = Trainer(
        model=model_group["model"],
        tokenizer=model_group["tokenizer"],
        args=training_args,
    )
    with torch.no_grad():
        transformer_models[model_name]["embedded"] = {
            split: trainer.predict(model_group["datasets"][split])
            for split in ["train", "val", "test"]
        }
    del trainer


The following columns in the test set  don't have a corresponding argument in `RobertaModel.forward` and have been ignored: labels, text, label_categoricals.
***** Running Prediction *****
  Num examples = 9000
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `RobertaModel.forward` and have been ignored: labels, text, label_categoricals.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `RobertaModel.forward` and have been ignored: labels, text, label_categoricals.
***** Running Prediction *****
  Num examples = 2970
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `RobertaModel.forward` and have been ignored: labels, text, label_categoricals.
***** Running Prediction *****
  Num examples = 9000
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `RobertaModel.forward` and have been ignored: labels, text, label_categoricals.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `RobertaModel.forward` and have been ignored: labels, text, label_categoricals.
***** Running Prediction *****
  Num examples = 2970
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `T5EncoderModel.forward` and have been ignored: token_type_ids, text, label_categoricals, labels.
***** Running Prediction *****
  Num examples = 9000
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `T5EncoderModel.forward` and have been ignored: token_type_ids, text, label_categoricals, labels.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `T5EncoderModel.forward` and have been ignored: token_type_ids, text, label_categoricals, labels.
***** Running Prediction *****
  Num examples = 2970
  Batch size = 8


(9000, 768)

In [17]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

params = {
    "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
    "penalty": ["elasticnet"],
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "l1_ratio": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "max_iter": [10000],
    "early_stopping": [True],
}

tune = GridSearchCV(
    SGDClassifier(), params, cv=5, scoring="f1_macro", n_jobs=8, refit=True
)
tune.fit(embedded["train"], labels["train"])
pd.DataFrame(tune.cv_results_).sort_values("mean_test_score", ascending=False)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_early_stopping,param_l1_ratio,param_loss,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
152,0.393124,0.034228,0.008598,0.002386,0.01,True,0.8,modified_huber,10000,elasticnet,"{'alpha': 0.01, 'early_stopping': True, 'l1_ra...",0.880912,0.854276,0.838372,0.893396,0.799903,0.853372,0.033001,1
275,0.257539,0.007765,0.006944,0.001529,10,True,0.0,hinge,10000,elasticnet,"{'alpha': 10, 'early_stopping': True, 'l1_rati...",0.881870,0.858930,0.842171,0.892565,0.787172,0.852542,0.037104,2
145,0.310922,0.015028,0.004451,0.000740,0.01,True,0.7,hinge,10000,elasticnet,"{'alpha': 0.01, 'early_stopping': True, 'l1_ra...",0.879225,0.858328,0.840843,0.893759,0.789474,0.852326,0.036217,3
221,0.310801,0.004445,0.006590,0.001342,1,True,0.0,log,10000,elasticnet,"{'alpha': 1, 'early_stopping': True, 'l1_ratio...",0.884483,0.860566,0.840323,0.889842,0.786032,0.852249,0.037545,4
220,0.299353,0.005921,0.008579,0.002567,1,True,0.0,hinge,10000,elasticnet,"{'alpha': 1, 'early_stopping': True, 'l1_ratio...",0.882104,0.858011,0.842213,0.895589,0.781911,0.851965,0.039625,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.377036,0.010320,0.005610,0.001273,1000,True,0.5,squared_hinge,10000,elasticnet,"{'alpha': 1000, 'early_stopping': True, 'l1_ra...",0.296050,0.296050,0.366866,0.295775,0.295775,0.310103,0.028382,430
353,0.385124,0.017340,0.006502,0.001600,100,True,0.4,squared_hinge,10000,elasticnet,"{'alpha': 100, 'early_stopping': True, 'l1_rat...",0.296050,0.366866,0.296050,0.295775,0.295775,0.310103,0.028382,430
341,0.362145,0.007225,0.005079,0.000868,100,True,0.2,log,10000,elasticnet,"{'alpha': 100, 'early_stopping': True, 'l1_rat...",0.296050,0.366866,0.296050,0.295775,0.295775,0.310103,0.028382,430
427,0.359460,0.012711,0.005697,0.000932,1000,True,0.8,modified_huber,10000,elasticnet,"{'alpha': 1000, 'early_stopping': True, 'l1_ra...",0.296050,0.296050,0.296050,0.295775,0.295775,0.295940,0.000135,439


In [19]:
f1_score(labels["test"], tune.predict(embedded["test"]), average="macro")


0.532080892666011

<bound method BaseEstimator.get_params of GridSearchCV(cv=5, estimator=SGDClassifier(), n_jobs=8,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'early_stopping': [True],
                         'l1_ratio': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                                      0.8, 0.9, 1.0],
                         'loss': ['hinge', 'log', 'modified_huber',
                                  'squared_hinge', 'perceptron'],
                         'max_iter': [10000], 'penalty': ['elasticnet']},
             scoring='f1_macro')>