In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"

# Load the complete set of text pairs from the source CSV.
dataset = Dataset.from_pandas(pd.read_csv(data_path + "/" + filename))

# First split: hold out 2/10 of all rows; that held-out part becomes "eval".
dataset_dict = dataset.train_test_split(test_size=2 / 10, seed=42)
# Second split: the remaining rows are split again with test_size=3/10; the
# nested result's own "train"/"test" entries are unpacked into the final dict.
dataset_dict = DatasetDict(
    {
        "eval": dataset_dict["test"],
        **dataset_dict["train"].train_test_split(test_size=3 / 10, seed=42),
    }
)

# Write one CSV per split next to the source file
# (e.g. "toxic_span_text_pairs_train.csv").  The loop variable is deliberately
# not called `dataset`, so the full dataset loaded above stays bound to that
# name once the cell has finished.
for split, split_dataset in dataset_dict.items():
    split_dataset.to_pandas().to_csv(
        data_path + "/" + filename.replace(".csv", "_" + split + ".csv"), index=False
    )


In [2]:
import json

# Parse the local "secrets.json" file; the resulting object is kept in
# `secrets` for later cells.
with open("secrets.json", "r") as handle:
    secrets = json.loads(handle.read())

import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"
splits = ["train", "eval", "test"]

# Read each per-split CSV ("<name>_<split>.csv") back into a Dataset and
# collect them under their split name.
dataset_dict = DatasetDict()
for split in splits:
    split_csv = data_path + "/" + filename.replace(".csv", "_" + split + ".csv")
    dataset_dict[split] = Dataset.from_pandas(pd.read_csv(split_csv))

dataset_dict


DatasetDict({
    train: Dataset({
        features: ['original', 'censored'],
        num_rows: 8679
    })
    eval: Dataset({
        features: ['original', 'censored'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['original', 'censored'],
        num_rows: 3720
    })
})

In [3]:
def encode(tokenizer, input_text, target_text):
    """Tokenize an input/target pair into a single feature mapping.

    Args:
        tokenizer: Callable tokenizer that also provides the
            `as_target_tokenizer()` context manager.
        input_text: Source text; tokenized with a plain `tokenizer(...)` call.
        target_text: Target text; tokenized inside `as_target_tokenizer()`.

    Returns:
        The encoding produced for `input_text`, with an extra "labels" entry
        holding the "input_ids" of the tokenized `target_text`.
    """
    features = tokenizer(input_text)
    # The target side is tokenized while the tokenizer is in target mode.
    with tokenizer.as_target_tokenizer():
        label_ids = tokenizer(target_text)["input_ids"]
    features["labels"] = label_ids
    return features


from transformers import Seq2SeqTrainingArguments


def get_traning_args(model_name):
    """Build the `Seq2SeqTrainingArguments` used to fine-tune `model_name`.

    Everything up to and including the first "/" in `model_name` is dropped;
    the remainder plus "-detox" names both the output directory (under
    "outputs/") and the hub model id.  The hub token is read from the
    module-level `secrets` mapping under "hub_token_write".

    Args:
        model_name: Model identifier, e.g. "google/t5-v1_1-base".

    Returns:
        A configured `Seq2SeqTrainingArguments` instance.
    """
    # Same result as slicing after the first "/": the whole string is kept
    # when there is no "/" at all.
    short_name = model_name.split("/", 1)[-1]
    run_name = short_name + "-detox"

    settings = {
        "output_dir": "outputs/" + run_name,
        "overwrite_output_dir": True,
        "num_train_epochs": 10,
        "learning_rate": 1e-4,
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 16,
        "eval_accumulation_steps": 128,
        "dataloader_num_workers": 3,
        "predict_with_generate": True,
        # Log, evaluate and checkpoint once per epoch.
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "remove_unused_columns": True,
        "optim": "adamw_apex_fused",
        "bf16": True,
        "bf16_full_eval": True,
        "tf32": True,
        "gradient_checkpointing": True,
        "load_best_model_at_end": True,
        "metric_for_best_model": "eval_loss",
        "push_to_hub": True,
        "hub_strategy": "all_checkpoints",
        "hub_model_id": run_name,
        "hub_token": secrets["hub_token_write"],
    }
    return Seq2SeqTrainingArguments(**settings)


In [4]:
# `os` must be imported before the first `os.environ` access below; importing
# it further down makes the cell fail with NameError on a fresh kernel.
import os

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
)

model_name = "google/t5-v1_1-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Register the "<CSD>" token and resize the embedding matrix to the new
# vocabulary size.
tokenizer.add_tokens("<CSD>")
model.resize_token_embeddings(len(tokenizer))
# NOTE(review): presumably disabled because the training arguments enable
# gradient checkpointing - confirm.
model.config.update({"use_cache": False})

# Tokenizer parallelism is switched on only while the splits are encoded ...
os.environ["TOKENIZERS_PARALLELISM"] = "true"

encoding = dataset_dict.map(
    lambda rec: encode(tokenizer, rec["original"], rec["censored"]),
    keep_in_memory=True,
)

# ... and switched off again before the trainer is created.
# NOTE(review): presumably to avoid clashes with the dataloader worker
# processes - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

trainer = Seq2SeqTrainer(
    args=get_traning_args(model_name),
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer, model=model, padding="longest", pad_to_multiple_of=8
    ),
    train_dataset=encoding["train"],
    eval_dataset=encoding["eval"],
)


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


  0%|          | 0/8679 [00:00<?, ?ex/s]

  0%|          | 0/3100 [00:00<?, ?ex/s]

  0%|          | 0/3720 [00:00<?, ?ex/s]

Cloning https://huggingface.co/ChrisZeng/t5-v1_1-base-detox into local empty directory.
Using amp half precision backend


In [5]:
# Run training on the `trainer` built in the previous cell; the call's return
# value is shown as the cell output.
trainer.train()


The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: censored, original.
***** Running training *****
  Num examples = 8679
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 1350


Epoch,Training Loss,Validation Loss
0,8.9719,2.03194
1,1.9438,0.642404
2,0.8941,0.438422
3,0.6694,0.345574
4,0.5336,0.281917
5,0.4519,0.243061
6,0.4006,0.231866
7,0.3771,0.225216
8,0.3646,0.227114
9,0.3578,0.226963


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: censored, original.
***** Running Evaluation *****
  Num examples = 3100
  Batch size = 8
Saving model checkpoint to outputs/t5-v1_1-base-detox/checkpoint-135
Configuration saved in outputs/t5-v1_1-base-detox/checkpoint-135/config.json
Model weights saved in outputs/t5-v1_1-base-detox/checkpoint-135/pytorch_model.bin
tokenizer config file saved in outputs/t5-v1_1-base-detox/checkpoint-135/tokenizer_config.json
Special tokens file saved in outputs/t5-v1_1-base-detox/checkpoint-135/special_tokens_map.json
Copy vocab file to outputs/t5-v1_1-base-detox/checkpoint-135/spiece.model
tokenizer config file saved in outputs/t5-v1_1-base-detox/tokenizer_config.json
Special tokens file saved in outputs/t5-v1_1-base-detox/special_tokens_map.json
Copy vocab file to outputs/t5-v1_1-base-detox/spiece.model
The following columns in the evaluation set  don't hav

TrainOutput(global_step=1350, training_loss=1.496486166494864, metrics={'train_runtime': 4130.9442, 'train_samples_per_second': 21.01, 'train_steps_per_second': 0.327, 'total_flos': 1.3138971617968128e+16, 'train_loss': 1.496486166494864, 'epoch': 10.0})

In [6]:
# Save the model through the trainer, then remove the `trainer` name from the
# namespace.
# NOTE(review): presumably done to let the trainer's resources be reclaimed
# before inference - confirm.
trainer.save_model()
del trainer

Saving model checkpoint to outputs/t5-v1_1-base-detox
Configuration saved in outputs/t5-v1_1-base-detox/config.json
Model weights saved in outputs/t5-v1_1-base-detox/pytorch_model.bin
tokenizer config file saved in outputs/t5-v1_1-base-detox/tokenizer_config.json
Special tokens file saved in outputs/t5-v1_1-base-detox/special_tokens_map.json
Copy vocab file to outputs/t5-v1_1-base-detox/spiece.model
Saving model checkpoint to outputs/t5-v1_1-base-detox
Configuration saved in outputs/t5-v1_1-base-detox/config.json
Model weights saved in outputs/t5-v1_1-base-detox/pytorch_model.bin
tokenizer config file saved in outputs/t5-v1_1-base-detox/tokenizer_config.json
Special tokens file saved in outputs/t5-v1_1-base-detox/special_tokens_map.json
Copy vocab file to outputs/t5-v1_1-base-detox/spiece.model
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}}
Several commits (2) will 

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
import torch
from datasets import Dataset


def detox(tokenizer, model_buffered, batched_inputs, **generate_kwargs):
    """Generate text for a batch of inputs with a seq2seq model.

    Args:
        tokenizer: Tokenizer used both to encode `batched_inputs` and to
            decode the generated sequences.
        model_buffered: Model exposing `generate(...)`.  The encoded inputs
            are moved to `model_buffered.device`; if the model has no
            `device` attribute, "cuda" is used.
        batched_inputs: Batch of input strings.
        **generate_kwargs: Optional extra keyword arguments forwarded
            unchanged to `model_buffered.generate` (for example generation
            length settings).  Omitting them keeps the previous call exactly.

    Returns:
        A dict `{"generated": [...]}` with one decoded string per input; an
        empty decoded string is replaced by "<CSD>".
    """
    # Follow the model's device instead of assuming CUDA, so inputs and model
    # always end up on the same device.
    target_device = getattr(model_buffered, "device", "cuda")
    encoded_batch = tokenizer(
        batched_inputs, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
    ).to(target_device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output_sequence = model_buffered.generate(
            input_ids=encoded_batch["input_ids"],
            attention_mask=encoded_batch["attention_mask"],
            **generate_kwargs,
        )
    # Drop the reference to the encoded inputs before decoding.
    del encoded_batch

    decoded_texts = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)
    # An empty decode is reported as the "<CSD>" placeholder.
    generated = ["<CSD>" if len(text) == 0 else text for text in decoded_texts]
    return {"generated": generated}


import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"
splits = ["train", "eval", "test"]

# Read each per-split CSV ("<name>_<split>.csv") into a Dataset, keyed by
# split name.
dataset_dict = DatasetDict()
for split in splits:
    split_csv = data_path + "/" + filename.replace(".csv", "_" + split + ".csv")
    dataset_dict[split] = Dataset.from_pandas(pd.read_csv(split_csv))

model_name = "ChrisZeng/t5-v1_1-base-detox"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_buffered = model.to("cuda")
# Pad on the left and reuse the EOS token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token


def _detox_batch(rec):
    """Run `detox` on the "original" column of one batch of records."""
    return detox(tokenizer, model_buffered, rec["original"])


# Adds a "generated" column to every split, processing 64 records per batch.
dataset_dict = dataset_dict.map(
    _detox_batch,
    keep_in_memory=True,
    batched=True,
    batch_size=64,
)

# Only the `model_buffered` name is removed here; `model` stays bound.
del model_buffered




  0%|          | 0/136 [00:00<?, ?ba/s]

  0%|          | 0/49 [00:00<?, ?ba/s]

  0%|          | 0/59 [00:00<?, ?ba/s]

In [4]:
from datasets import load_metric
import numpy as np
import pandas as pd
from IPython.display import display, Pretty

# Load the three metrics by name; `compute_metrics` below reads these
# module-level objects.
rouge = load_metric("rouge")
exact_match = load_metric("exact_match")
bertscore = load_metric("bertscore")


def compute_metrics(predictions, targets):
    """Score `predictions` against `targets` with ROUGE, exact match and BERTScore.

    Uses the module-level `rouge`, `exact_match` and `bertscore` metric objects.

    Args:
        predictions: Generated texts.
        targets: Reference texts, aligned with `predictions`.

    Returns:
        A dict containing, in this order:
          * one entry per key returned by ROUGE, holding that key's
            `mid.fmeasure`;
          * "exact_match_rate": the "exact_match" score multiplied by 0.01
            (presumably converting a percentage to a fraction - confirm);
          * "mean_bertscore_f1": the mean of the BERTScore "f1" values
            (computed with lang="en").
    """
    rouge_result = rouge.compute(predictions=predictions, references=targets)
    scores = {}
    for rouge_name, aggregate in rouge_result.items():
        scores[rouge_name] = aggregate.mid.fmeasure

    exact_result = exact_match.compute(predictions=predictions, references=targets)
    scores["exact_match_rate"] = 0.01 * exact_result["exact_match"]

    bert_result = bertscore.compute(
        predictions=predictions, references=targets, lang="en"
    )
    scores["mean_bertscore_f1"] = np.mean(bert_result["f1"])

    return scores


# One single-row frame per split (indexed by the split name), stacked into one
# table comparing generated text against the "censored" references.
per_split_frames = []
for split in splits:
    split_scores = compute_metrics(
        dataset_dict[split]["generated"], dataset_dict[split]["censored"]
    )
    per_split_frames.append(pd.DataFrame(split_scores, index=[split]))

metrics = pd.concat(per_split_frames)

metrics


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,exact_match_rate,mean_bertscore_f1
train,0.573827,0.498888,0.569586,0.569978,0.0916,0.90963
eval,0.58491,0.506094,0.580824,0.580941,0.080323,0.910862
test,0.574105,0.496,0.570146,0.570905,0.079839,0.909516
