In [1]:
import json
from IPython.display import display, Pretty

with open("secrets.json", "r") as secrets_file:
    secrets = json.load(secrets_file)

import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"

dataset = Dataset.from_pandas(pd.read_csv(data_path + "/" + filename))

dataset_dict = dataset.train_test_split(test_size=2 / 10, seed=42)
dataset_dict = DatasetDict(
    {
        "val": dataset_dict["test"],
        **dataset_dict["train"].train_test_split(test_size=3 / 10, seed=42),
    }
)
dataset_dict


DatasetDict({
    val: Dataset({
        features: ['original', 'censored'],
        num_rows: 3100
    })
    train: Dataset({
        features: ['original', 'censored'],
        num_rows: 8679
    })
    test: Dataset({
        features: ['original', 'censored'],
        num_rows: 3720
    })
})

In [2]:
def encode(tokenizer, input_text, target_text):
    encoding = tokenizer(input_text, padding="longest", pad_to_multiple_of=8)
    with tokenizer.as_target_tokenizer():
        encoding["labels"] = tokenizer(
            target_text, padding="longest", pad_to_multiple_of=8
        )["input_ids"]
    return encoding


from transformers import Seq2SeqTrainingArguments


def get_traning_args(model_name):
    return Seq2SeqTrainingArguments(
        output_dir="outputs/" + model_name + "detox",
        overwrite_output_dir=True,
        num_train_epochs=20,
        learning_rate=1e-4,
        per_device_train_batch_size=3,
        gradient_accumulation_steps=64,
        eval_accumulation_steps=128,
        dataloader_num_workers=4,
        predict_with_generate=True,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        remove_unused_columns=True,
        optim="adamw_apex_fused",
        fp16=True,
        fp16_opt_level="O2",
        tf32=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        push_to_hub=True,
        hub_strategy="all_checkpoints",
        hub_model_id=model_name + "-detox",
        hub_token=secrets["hub_token_write"],
    )


In [None]:
model_name = "t5-base"

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
)
import os

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoding = dataset_dict.map(
    lambda rec: encode(tokenizer, rec["original"], rec["censored"]),
    keep_in_memory=True,
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

trainer = Seq2SeqTrainer(
    args=get_traning_args(model_name),
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    train_dataset=encoding["train"],
    eval_dataset=encoding["val"],
)

training_output = trainer.train()
trainer.save_model()

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
import torch
from datasets import Dataset

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"

dataset = Dataset.from_pandas(pd.read_csv(data_path + "/" + filename))
batched_inputs = dataset["original"][:128]

model_name = "ChrisZeng/t5-base-detox"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
input_sequence = tokenizer(
    batched_inputs, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
).to("cuda")
with torch.no_grad():
    output_sequence = model.generate(
        input_ids=input_sequence["input_ids"],
        attention_mask=input_sequence["attention_mask"],
    )
    
generated = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)




In [1]:
from transformers import (
    Text2TextGenerationPipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
import torch
import pandas as pd
from datasets import Dataset

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"

dataset = Dataset.from_pandas(pd.read_csv(data_path + "/" + filename))

model_name = "ChrisZeng/t5-base-detox"

pipe = Text2TextGenerationPipeline(
    model=AutoModelForSeq2SeqLM.from_pretrained(model_name),
    tokenizer=AutoTokenizer.from_pretrained(model_name),
    device=0,
)

with torch.no_grad():
    generated = pipe(dataset["original"], batch_size=128)


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
generated

['Yet call out all Muslims for the acts of a few will get you pilloried.',
 'This c> is c>. Who would read a book by ',
 "You're an c>.",
 "Nincompoop, that's a nice one! I'm partial to ",
 "testing purposes: you are an c> and i can't stand ignorant do",
 'Well c>, they drafted a guide. We should all be good now',
 'I honestly cannot decide if these guys are complete c> or the most patriotic',
 'Dear world: Just a reminder that none of these folks actually live in Oregon. Please remember',
 'These parents are a bunch of c> crybabies. As a Lincoln',
 'It would take a lot of practice and muscles control to vote with your c>',
 'DARPA, the subdivision of the Defense Department in charge of devising Really Scary Shi',
 'I wonder how much money taxpayers money PPS is going to waste on c>',
 'Anyone who allows Obama to nominate him/her is throwing away a chance to ever be',
 'Two Wild & c> Guys!',
 "I was talking about the comments here---I wouldn't be so c> to",
 'Yeah, too bad... Oregon Li