In [None]:
import json

# Read credentials from a local JSON file; later code looks up
# secrets["hub_token_write"] when configuring the Hub upload.
with open("secrets.json") as fh:
    secrets = json.load(fh)

import pandas as pd
from datasets import Dataset, DatasetDict

# Location of the CSV that backs the dataset.
data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"
dataset = Dataset.from_pandas(pd.read_csv(f"{data_path}/{filename}"))

# Two-stage split with a fixed seed:
#   1. hold out 20% of all rows; that slice becomes "val";
#   2. split the remaining 80% again 70/30 into "train" and "test".
dataset_dict = dataset.train_test_split(seed=42, test_size=0.2)
dataset_dict = DatasetDict(
    dict(
        val=dataset_dict["test"],
        **dataset_dict["train"].train_test_split(seed=42, test_size=0.3),
    )
)
# Last expression of the cell: show the resulting splits.
dataset_dict


In [None]:
# Label value that the loss function skips (the collator used later pads
# labels with the same value).
_IGNORE_INDEX = -100


def _mask_label_padding(label_ids, pad_token_id):
    """Return ``label_ids`` with every ``pad_token_id`` replaced by -100.

    Accepts either a flat list of ids (single example) or a list of such lists
    (batched call). If ``pad_token_id`` is ``None`` the ids are returned as is.
    """
    if pad_token_id is None:
        return label_ids
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        return [_mask_label_padding(row, pad_token_id) for row in label_ids]
    return [_IGNORE_INDEX if tok == pad_token_id else tok for tok in label_ids]


def encode(tokenizer, input_text, target_text):
    """Tokenize a source/target pair into seq2seq training features.

    Args:
        tokenizer: Callable tokenizer that also provides
            ``as_target_tokenizer()``; its ``pad_token_id`` (if present) is
            used to find padding in the labels.
        input_text: Source text (a string, or a list of strings when batched).
        target_text: Target text, same shape as ``input_text``.

    Returns:
        The encoding produced for ``input_text`` with an added ``"labels"``
        entry holding the target token ids.
    """
    encoding = tokenizer(input_text, padding="longest", pad_to_multiple_of=8)
    with tokenizer.as_target_tokenizer():
        label_ids = tokenizer(
            target_text, padding="longest", pad_to_multiple_of=8
        )["input_ids"]
    # `pad_to_multiple_of=8` appends pad ids to the labels. Left untouched they
    # are treated as real targets by the loss, so mask them with -100.
    encoding["labels"] = _mask_label_padding(
        label_ids, getattr(tokenizer, "pad_token_id", None)
    )
    return encoding


def preprocess(tokenizer, record):
    """Rewrite the short ``<c>`` marker in a record's target as ``<censored>``.

    Args:
        tokenizer: Not used by this function.
        record: Mapping with a string stored under ``"censored"``.

    Returns:
        A dict with the single key ``"censored"`` holding the rewritten text.
    """
    censored_text = record["censored"]
    rewritten = censored_text.replace("<c>", "<censored>")
    return dict(censored=rewritten)


from transformers import Seq2SeqTrainingArguments


def get_traning_args(model_name):
    """Build the ``Seq2SeqTrainingArguments`` for fine-tuning ``model_name``.

    Args:
        model_name: Checkpoint identifier, optionally namespaced
            (e.g. ``"google/t5-v1_1-base"``) or a path.

    Returns:
        A ``Seq2SeqTrainingArguments`` whose output directory and Hub
        repository id are both derived from the last component of
        ``model_name`` with a ``-detox`` suffix.

    Raises:
        KeyError: If ``secrets`` has no ``"hub_token_write"`` entry.
    """
    # Keep only the final path component. The previous slice
    # (`model_name[model_name.find('/'):]`) kept the leading "/" (giving
    # "outputs//..." and an invalid Hub id) and, for names without a slash,
    # reduced the name to its last character because find() returned -1.
    model_name = model_name.rstrip("/").rsplit("/", 1)[-1]
    return Seq2SeqTrainingArguments(
        output_dir="outputs/" + model_name + "-detox",
        overwrite_output_dir=True,
        num_train_epochs=20,
        learning_rate=1e-5,
        # 3 samples per device x 64 accumulation steps per optimizer update.
        per_device_train_batch_size=3,
        gradient_accumulation_steps=64,
        eval_accumulation_steps=128,
        dataloader_num_workers=4,
        predict_with_generate=True,
        # Log, evaluate and checkpoint on the same per-epoch schedule.
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        remove_unused_columns=True,
        optim="adamw_apex_fused",
        fp16=True,
        fp16_opt_level="O2",
        tf32=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        push_to_hub=True,
        hub_strategy="all_checkpoints",
        hub_model_id=model_name + "-detox",
        hub_token=secrets["hub_token_write"],
    )


In [None]:
model_name = "google/t5-v1_1-base"

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
)
import os

# Load the checkpoint once. This cell previously repeated the assignment, the
# imports, the loading and the tokenization pipeline, so the model was loaded
# twice and the whole dataset tokenized twice.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Register the censoring marker as a token and grow the embedding matrix to
# match the enlarged vocabulary.
tokenizer.add_tokens(["<censored>"])
model.resize_token_embeddings(len(tokenizer))

# First pass rewrites "<c>" to "<censored>" in the targets; second pass
# tokenizes each (original, censored) pair into model features.
encoding = dataset_dict.map(lambda rec: preprocess(tokenizer, rec)).map(
    lambda rec: encode(tokenizer, rec["original"], rec["censored"]),
    keep_in_memory=True,
)

# Turn off tokenizers' internal parallelism before the trainer is created
# (the training arguments request dataloader worker processes).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

trainer = Seq2SeqTrainer(
    args=get_traning_args(model_name),
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    train_dataset=encoding["train"],
    eval_dataset=encoding["val"],
)

training_output = trainer.train()


In [None]:
# Save the trained model through the trainer. No path is passed, so the
# destination is whatever the trainer defaults to (presumably the output_dir
# configured in get_traning_args; confirm).
trainer.save_model()


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
import torch
from datasets import Dataset

def detox(tokenizer, model, batched_inputs, batched_targets, max_length=None):
    """Generate outputs for a batch of texts and normalise the references.

    Args:
        tokenizer: Tokenizer used to encode inputs/targets and decode ids.
        model: Seq2seq model exposing ``generate`` and ``device``.
        batched_inputs: List of source strings.
        batched_targets: List of reference strings. Every ``<c>`` marker is
            replaced by ``tokenizer.unk_token`` before the round trip below.
        max_length: Optional length cap forwarded to ``model.generate``. When
            ``None`` it defaults to the padded input length plus a margin of 8,
            on the assumption that the output is about as long as the input.

    Returns:
        A dict with two lists of strings: ``"generated"`` (decoded model
        output) and ``"generated_target"`` (references after a
        tokenize/decode round trip with special tokens skipped).
    """
    # Follow the model's device rather than assuming CUDA.
    input_sequence = tokenizer(
        batched_inputs, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
    ).to(model.device)
    input_ids = input_sequence["input_ids"]

    # Without an explicit budget, generate() uses the library/config default
    # length, which can cut full-sentence outputs short before scoring.
    if max_length is None:
        max_length = input_ids.shape[1] + 8

    with torch.no_grad():
        output_sequence = model.generate(
            input_ids=input_ids,
            attention_mask=input_sequence["attention_mask"],
            max_length=max_length,
        )

    generated = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)

    # NOTE(review): the training-side `preprocess` maps "<c>" to "<censored>",
    # whereas the references here use the unknown token; confirm this is the
    # intended normalisation for the checkpoint being evaluated.
    batched_targets = [
        sentence.replace("<c>", tokenizer.unk_token) for sentence in batched_targets
    ]
    # Round-trip the references through the tokenizer so they are compared in
    # the same decoded form as the generated text.
    target_sequence = tokenizer(
        batched_targets, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
    )
    generated_target = tokenizer.batch_decode(
        target_sequence["input_ids"], skip_special_tokens=True
    )
    return {
        "generated": generated,
        "generated_target": generated_target,
    }

# Reload the full CSV for scoring.
# NOTE(review): this is the same file the training cell splits, so rows used
# for training are scored here too; confirm that is intended.
data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"
dataset = Dataset.from_pandas(pd.read_csv(f"{data_path}/{filename}"))

# Local checkpoint directory to evaluate.
model_name = "outputs/t5-basedetox"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pad on the left and reuse the end-of-sequence token as the pad token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Run generation in batches of 128 rows; the columns returned by `detox`
# ("generated", "generated_target") are added to the dataset.
dataset = dataset.map(
    lambda batch: detox(tokenizer, model, batch["original"], batch["censored"]),
    batched=True,
    batch_size=128,
    keep_in_memory=True,
)


In [None]:
from datasets import load_metric
from IPython.display import display, Pretty

# Metric implementations, loaded by name.
rouge = load_metric("rouge")
exact_match = load_metric("exact_match")
bertscore = load_metric("bertscore")  # loaded, but not used in the expression below

# Cell output: ROUGE and exact-match results merged into one dict, both
# computed on the generated text against the round-tripped references.
{
    metric_name: metric_value
    for scorer in (rouge, exact_match)
    for metric_name, metric_value in scorer.compute(
        predictions=dataset["generated"],
        references=dataset["generated_target"],
    ).items()
}