In [21]:
import json

with open("secrets.json", "r") as secrets_file:
    secrets = json.load(secrets_file)

import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"

dataset = Dataset.from_pandas(pd.read_csv(data_path + "/" + filename))

dataset_dict = dataset.train_test_split(test_size=2 / 10, seed=42)
dataset_dict = DatasetDict(
    {
        "val": dataset_dict["test"],
        **dataset_dict["train"].train_test_split(test_size=3 / 10, seed=42),
    }
)
dataset_dict


DatasetDict({
    val: Dataset({
        features: ['original', 'censored'],
        num_rows: 3100
    })
    train: Dataset({
        features: ['original', 'censored'],
        num_rows: 8679
    })
    test: Dataset({
        features: ['original', 'censored'],
        num_rows: 3720
    })
})

In [22]:
def encode(tokenizer, input_text, target_text):
    encoding = tokenizer(input_text, padding="longest", pad_to_multiple_of=8)
    with tokenizer.as_target_tokenizer():
        encoding["labels"] = tokenizer(
            target_text, padding="longest", pad_to_multiple_of=8
        )["input_ids"]
    return encoding


def preprocess(tokenizer, record):
    return {"censored": record["censored"].replace("<c>", "<censored>")}


from transformers import Seq2SeqTrainingArguments


def get_traning_args(model_name):
    model_name = model_name[model_name.find("/") + 1 :]
    return Seq2SeqTrainingArguments(
        output_dir="outputs/" + model_name + "-detox",
        overwrite_output_dir=True,
        num_train_epochs=20,
        learning_rate=1e-4,
        per_device_train_batch_size=3,
        gradient_accumulation_steps=64,
        eval_accumulation_steps=128,
        dataloader_num_workers=4,
        predict_with_generate=True,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        remove_unused_columns=True,
        optim="adafactor",
        tf32=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        push_to_hub=True,
        hub_strategy="all_checkpoints",
        hub_model_id=model_name + "-detox",
        hub_token=secrets["hub_token_write"],
    )


In [3]:
model_name = "google/t5-v1_1-base"

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer.add_special_tokens(["<CSD>"])
model.resize_token_embeddings(len(tokenizer))

encoding = dataset_dict.map(lambda rec: preprocess(tokenizer, rec)).map(
    lambda rec: encode(tokenizer, rec["original"], rec["censored"]),
    keep_in_memory=True,
)

encoding = dataset_dict.map(lambda rec: preprocess(tokenizer, rec)).map(
    lambda rec: encode(tokenizer, rec["original"], rec["censored"]),
    keep_in_memory=True,
)

import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

trainer = Seq2SeqTrainer(
    args=get_traning_args(model_name),
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    train_dataset=encoding["train"],
    eval_dataset=encoding["val"],
)

training_output = trainer.train()


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


  0%|          | 0/3100 [00:00<?, ?ex/s]

  0%|          | 0/8679 [00:00<?, ?ex/s]

  0%|          | 0/3720 [00:00<?, ?ex/s]

  0%|          | 0/3100 [00:00<?, ?ex/s]

  0%|          | 0/8679 [00:00<?, ?ex/s]

  0%|          | 0/3720 [00:00<?, ?ex/s]

  0%|          | 0/3100 [00:00<?, ?ex/s]

  0%|          | 0/8679 [00:00<?, ?ex/s]

  0%|          | 0/3720 [00:00<?, ?ex/s]

  0%|          | 0/3100 [00:00<?, ?ex/s]

  0%|          | 0/8679 [00:00<?, ?ex/s]

  0%|          | 0/3720 [00:00<?, ?ex/s]

/home/chris-zeng/csci544-project/outputs/t5-v1_1-base-detox is already a clone of https://huggingface.co/ChrisZeng/t5-v1_1-base-detox. Make sure you pull the latest changes with `repo.git_pull()`.
The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: censored, original.
***** Running training *****
  Num examples = 8679
  Num Epochs = 20
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 64
  Total optimization steps = 900


Epoch,Training Loss,Validation Loss
0,15.6221,6.474992
1,7.8371,4.482548
2,5.1287,3.372109
3,3.5644,1.19799
4,1.6916,0.922294
5,1.322,0.782528
6,1.0912,0.571911
7,0.8992,0.487685
8,0.8194,0.412694
9,0.7964,0.385538


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: censored, original.
***** Running Evaluation *****
  Num examples = 3100
  Batch size = 8
Saving model checkpoint to outputs/t5-v1_1-base-detox/checkpoint-45
Configuration saved in outputs/t5-v1_1-base-detox/checkpoint-45/config.json
Model weights saved in outputs/t5-v1_1-base-detox/checkpoint-45/pytorch_model.bin
tokenizer config file saved in outputs/t5-v1_1-base-detox/checkpoint-45/tokenizer_config.json
Special tokens file saved in outputs/t5-v1_1-base-detox/checkpoint-45/special_tokens_map.json
Copy vocab file to outputs/t5-v1_1-base-detox/checkpoint-45/spiece.model
tokenizer config file saved in outputs/t5-v1_1-base-detox/tokenizer_config.json
Special tokens file saved in outputs/t5-v1_1-base-detox/special_tokens_map.json
Copy vocab file to outputs/t5-v1_1-base-detox/spiece.model
The following columns in the evaluation set  don't have a co

In [6]:
trainer.save_model()


Saving model checkpoint to outputs/t5-v1_1-base-detox
Configuration saved in outputs/t5-v1_1-base-detox/config.json
Model weights saved in outputs/t5-v1_1-base-detox/pytorch_model.bin
tokenizer config file saved in outputs/t5-v1_1-base-detox/tokenizer_config.json
Special tokens file saved in outputs/t5-v1_1-base-detox/special_tokens_map.json
Copy vocab file to outputs/t5-v1_1-base-detox/spiece.model
Saving model checkpoint to outputs/t5-v1_1-base-detox
Configuration saved in outputs/t5-v1_1-base-detox/config.json
Model weights saved in outputs/t5-v1_1-base-detox/pytorch_model.bin
tokenizer config file saved in outputs/t5-v1_1-base-detox/tokenizer_config.json
Special tokens file saved in outputs/t5-v1_1-base-detox/special_tokens_map.json
Copy vocab file to outputs/t5-v1_1-base-detox/spiece.model
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}}


In [56]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
import torch
from datasets import Dataset


def detox(tokenizer, model, batched_inputs, batched_targets):
    input_sequence = tokenizer(
        batched_inputs, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
    ).to("cuda")
    with torch.no_grad():
        output_sequence = model.generate(
            input_ids=input_sequence["input_ids"],
            attention_mask=input_sequence["attention_mask"],
        )

    generated = [
        "<censored>" if len(generated) == 0 else generated
        for generated in tokenizer.batch_decode(
            output_sequence, skip_special_tokens=True
        )
    ]

    batched_targets = [
        sentence.replace("<c>", "<censored>") for sentence in batched_targets
    ]
    target_sequence = tokenizer(
        batched_targets, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
    )
    generated_target = tokenizer.batch_decode(
        target_sequence["input_ids"], skip_special_tokens=True
    )
    return {
        "generated": generated,
        "generated_target": generated_target,
    }


data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"

dataset = Dataset.from_pandas(pd.read_csv(data_path + "/" + filename))

model_name = "ChrisZeng/t5-v1_1-base-detox"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

dataset = dataset.map(
    lambda rec: detox(tokenizer, model, rec["original"], rec["censored"]),
    keep_in_memory=True,
    batched=True,
    batch_size=64,
)




  0%|          | 0/243 [00:00<?, ?ba/s]

In [63]:
from datasets import load_metric
import numpy as np
import pandas as pd
from IPython.display import display, Pretty

rouge = load_metric("rouge")
exact_match = load_metric("exact_match")
bertscore = load_metric("bertscore")

{
    **{
        key: value.mid.fmeasure
        for key, value in rouge.compute(
            predictions=dataset["generated"], references=dataset["generated_target"],
        ).items()
    },
    "exact_match_rate": exact_match.compute(
        predictions=dataset["generated"], references=dataset["generated_target"],
    )["exact_match"]
    / 100,
    "mean_bertscore": np.mean(
        bertscore.compute(
            predictions=dataset["generated"],
            references=dataset["generated_target"],
            lang="en",
        )["f1"]
    ),
}


In [62]:
exact_match.compute(
        predictions=dataset["generated"], references=dataset["generated_target"],
    ),

({'exact_match': 7.200464546099748},)