In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"
splits = ["train", "eval", "test"]

# Each split is read from its own CSV, named after the base file with a
# "_<split>" suffix inserted in front of the ".csv" extension.
dataset_dict = DatasetDict(
    {
        split: Dataset.from_pandas(pd.read_csv(csv_path))
        for split, csv_path in (
            (
                split,
                data_path + "/" + filename.replace(".csv", "_" + split + ".csv"),
            )
            for split in splits
        )
    }
)

In [None]:
import torch


def detox(tokenizer, model_buffered, batched_inputs, **generate_kwargs):
    """Generate one rewritten text per input string in a batch.

    Args:
        tokenizer: Callable that encodes ``batched_inputs`` and also provides
            ``batch_decode``.
        model_buffered: Model exposing ``generate``. If it has a ``device``
            attribute the encoded inputs are moved there; otherwise they are
            moved to ``"cuda"``.
        batched_inputs: The texts to rewrite.
        **generate_kwargs: Optional extra keyword arguments forwarded
            unchanged to ``model_buffered.generate``.

    Returns:
        dict: ``{"generated": texts}`` where ``texts`` holds one decoded
        string per generated sequence; a sequence that decodes to the empty
        string is reported as ``"<CSD>"``.
    """
    # Follow the model instead of assuming a GPU: encoding onto "cuda" while
    # the model sits elsewhere would fail inside generate().
    target_device = getattr(model_buffered, "device", "cuda")
    encoded_inputs = tokenizer(
        batched_inputs, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
    ).to(target_device)

    # NOTE(review): unless the caller passes length arguments through
    # generate_kwargs, the model's own generation settings decide how long the
    # outputs may get -- confirm that is intended.
    with torch.no_grad():
        output_sequence = model_buffered.generate(
            input_ids=encoded_inputs["input_ids"],
            attention_mask=encoded_inputs["attention_mask"],
            **generate_kwargs,
        )
    # Drop the encoded batch before decoding; it is no longer needed.
    del encoded_inputs

    decoded_texts = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)
    return {
        "generated": ["<CSD>" if len(text) == 0 else text for text in decoded_texts]
    }


In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Run every fine-tuned checkpoint over all splits and store its output in a
# column named "generated_<checkpoint>".
for model_name in ["t5-base", "t5-v1_1-base", "bart-base"]:
    model_name = f"ChrisZeng/{model_name}-detox"

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model_buffered = model.to("cuda")
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    dataset_dict = dataset_dict.map(
        lambda rec: detox(tokenizer, model_buffered, rec["original"]),
        keep_in_memory=True,
        batched=True,
        batch_size=64,
    ).rename_columns({"generated": f"generated_{model_name}"})

    # `model` and `model_buffered` name the same module (Module.to returns the
    # module itself), so deleting only one of them frees nothing. Drop both,
    # then hand the cached GPU blocks back before the next checkpoint loads.
    del model, model_buffered
    torch.cuda.empty_cache()

In [None]:
import re


class SentenceDetoxer(object):
    """Replace whole-word occurrences of known toxic words with ``<CSD>``.

    Matching is case-sensitive and anchored with ``\\b`` on both sides of each
    escaped word.
    """

    def __init__(self, toxic_words):
        """Build the replacement table and the matching regex.

        Args:
            toxic_words: Iterable of words to mask. Empty strings are ignored.
        """
        # An empty word would become a bare r"\b\b", which matches at every
        # word boundary and would scatter "<CSD>" through the sentence.
        self.replacements = {
            toxic_word: "<CSD>" for toxic_word in toxic_words if toxic_word != ""
        }
        self.pattern = "|".join(r"\b%s\b" % re.escape(s) for s in self.replacements)
        # Compile once instead of on every call. With no words the joined
        # pattern is "", which matches everywhere and has no entry in
        # `replacements`; None marks that case as "nothing to replace".
        self._regex = re.compile(self.pattern) if self.replacements else None

    def __call__(self, sentence):
        """Return ``sentence`` with every matched toxic word replaced.

        Args:
            sentence: Text to clean.

        Returns:
            The text with each match swapped for its replacement; unchanged
            when no toxic words were configured.
        """
        if self._regex is None:
            return sentence
        return self._regex.sub(
            lambda match: self.replacements[match.group(0)], sentence
        )


In [None]:
import pandas as pd

hatebase = pd.read_csv("outputs/hatebase.csv")
hate_words = hatebase[hatebase["language"] == "English"]["word"]
# Keep only the text in front of the first "(" and trim surrounding whitespace.
hate_words = hate_words.str.split("(").str[0].str.strip()
toxic_words = pd.concat(
    [hate_words, pd.read_csv("outputs/bad-words.csv")["word"]]
)
# A missing value cannot be passed to re.escape, and a blank word would match
# at every word boundary, so both are removed before SentenceDetoxer sees the
# list.
toxic_words = toxic_words.dropna()
toxic_words = toxic_words[toxic_words.str.strip() != ""].drop_duplicates()

detoxer = SentenceDetoxer(toxic_words)

# Add the word-list baseline as the "naive-detox" column.
dataset_dict = dataset_dict.map(
    lambda record: {"naive-detox": detoxer(record["original"])}
)


In [None]:
# Display the DatasetDict so its splits and columns can be inspected.
dataset_dict

In [None]:
# Write each split to its own CSV under outputs/, without the index column.
for split, dataset in dataset_dict.items():
    dataset.to_pandas().to_csv(
        path_or_buf=f"outputs/generated_outputs_{split}.csv",
        index=False,
    )


In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Rebuild the DatasetDict from the per-split CSV files under outputs/.
dataset_dict = DatasetDict(
    {
        split: Dataset.from_pandas(split_frame)
        for split, split_frame in (
            (split, pd.read_csv(f"outputs/generated_outputs_{split}.csv"))
            for split in ["train", "eval", "test"]
        )
    }
)



In [4]:
from transformers import pipeline
import pandas as pd
import numpy as np


class ToxicScore(object):
    """Average the first-label score a classification pipeline gives a batch."""

    def __init__(self, score_pipeline):
        """Store the pipeline used for scoring.

        Args:
            score_pipeline: Callable that takes the inputs and returns, for
                each input, a sequence of ``{"score": ...}`` entries.
        """
        self.score_pipeline = score_pipeline

    def __call__(self, inputs):
        """Return the mean score of the first entry of every result.

        Args:
            inputs: Texts handed to the stored pipeline.

        Returns:
            ``np.mean`` over ``result[0]["score"]`` for every result.
        """
        # Score with the pipeline this instance was given rather than a
        # module-level global, so the object works wherever it is built.
        results = self.score_pipeline(inputs)
        # NOTE(review): index 0 is presumably the "toxic" label of the model
        # in use -- confirm against the pipeline's label order.
        return np.mean([result[0]["score"] for result in results])


# Text-classification pipeline whose scores feed ToxicScore: sigmoid applied
# to the outputs, scores returned for all labels, run on device 0 in batches
# of 8.
detoxify_pipeline = pipeline(
    task="text-classification",
    model="unitary/toxic-bert",
    tokenizer="bert-base-uncased",
    device=0,
    batch_size=8,
    function_to_apply="sigmoid",
    return_all_scores=True,
)

toxicscore = ToxicScore(score_pipeline=detoxify_pipeline)

from datasets import load_metric

bertscore = load_metric("bertscore")



Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
from datasets import load_metric
import numpy as np
import pandas as pd
from IPython.display import display, Pretty

# Load the three metric objects in one pass.
rouge, exact_match, bertscore = [
    load_metric(metric_name)
    for metric_name in ["rouge", "exact_match", "bertscore"]
]


def compute_metrics(original, generated):
    """Score generated texts for similarity to the originals and for toxicity.

    Args:
        original: Reference texts.
        generated: Candidate texts, compared against ``original``.

    Returns:
        dict: ``"similarity"`` is the mean of the BERTScore ``"f1"`` values of
        ``generated`` against ``original``; ``"toxicity"`` is the mean of
        ``toxicscore(generated)``.
    """
    bert_result = bertscore.compute(
        predictions=generated, references=original, lang="en"
    )
    similarity = np.mean(bert_result["f1"])
    toxicity = np.mean(toxicscore(generated))
    return {"similarity": similarity, "toxicity": toxicity}


# One row per (split, column): every column of a split is scored against that
# split's "original" column.
metrics = pd.concat(
    {
        split: pd.concat(
            [
                pd.DataFrame(
                    compute_metrics(split_dataset["original"], split_dataset[col]),
                    index=[col],
                )
                for col in split_dataset.column_names
            ]
        )
        for split, split_dataset in dataset_dict.items()
    },
    names=["split", "column"],
)

metrics




Unnamed: 0_level_0,Unnamed: 1_level_0,similarity,toxicity
split,column,Unnamed: 2_level_1,Unnamed: 3_level_1
train,original,1.000005,0.634264
train,censored,0.938402,0.119891
train,generated_ChrisZeng/t5-base-detox,0.904883,0.117076
train,generated_ChrisZeng/t5-v1_1-base-detox,0.909105,0.143238
train,generated_ChrisZeng/bart-base-detox,0.850838,0.077703
train,naive-detox,0.978006,0.396886
eval,original,1.000006,0.638489
eval,censored,0.939461,0.120043
eval,generated_ChrisZeng/t5-base-detox,0.906586,0.120961
eval,generated_ChrisZeng/t5-v1_1-base-detox,0.910119,0.143255


In [6]:
from datasets import load_metric
import numpy as np
import pandas as pd
from IPython.display import display, Pretty

# Load the three metric objects in one pass.
rouge, exact_match, bertscore = [
    load_metric(metric_name)
    for metric_name in ["rouge", "exact_match", "bertscore"]
]


def compute_metrics(predictions, targets):
    """Compare predictions with target texts using ROUGE, exact match and BERTScore.

    Args:
        predictions: Candidate texts.
        targets: Reference texts the candidates are compared with.

    Returns:
        dict: one entry per ROUGE result key holding its ``mid.fmeasure``,
        plus ``"exact_match_rate"`` and ``"mean_bertscore_f1"``.
    """
    rouge_result = rouge.compute(predictions=predictions, references=targets)
    scores = {
        rouge_key: aggregate.mid.fmeasure
        for rouge_key, aggregate in rouge_result.items()
    }

    exact_result = exact_match.compute(predictions=predictions, references=targets)
    # The reported value is scaled by 0.01 -- presumably turning a percentage
    # into a fraction; confirm against the metric's documentation.
    scores["exact_match_rate"] = 0.01 * exact_result["exact_match"]

    bert_result = bertscore.compute(
        predictions=predictions, references=targets, lang="en"
    )
    scores["mean_bertscore_f1"] = np.mean(bert_result["f1"])
    return scores


# One row per (split, system): each system's output column is scored against
# that split's "censored" column.
metrics = pd.concat(
    {
        split: pd.concat(
            [
                pd.DataFrame(
                    compute_metrics(split_dataset[col], split_dataset["censored"]),
                    index=[col],
                )
                for col in [
                    "generated_ChrisZeng/t5-base-detox",
                    "generated_ChrisZeng/t5-v1_1-base-detox",
                    "generated_ChrisZeng/bart-base-detox",
                    "naive-detox",
                ]
            ]
        )
        for split, split_dataset in dataset_dict.items()
    },
    names=["split", "column"],
)

metrics




Unnamed: 0_level_0,Unnamed: 1_level_0,rouge1,rouge2,rougeL,rougeLsum,exact_match_rate,mean_bertscore_f1
split,column,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
train,generated_ChrisZeng/t5-base-detox,0.600023,0.527782,0.596384,0.596429,0.02892,0.915085
train,generated_ChrisZeng/t5-v1_1-base-detox,0.573877,0.499064,0.569643,0.569903,0.0916,0.912732
train,generated_ChrisZeng/bart-base-detox,0.326864,0.160558,0.323464,0.324277,0.051734,0.88219
train,naive-detox,0.813105,0.742662,0.808225,0.808793,0.103583,0.948868
eval,generated_ChrisZeng/t5-base-detox,0.604262,0.523416,0.599722,0.599935,0.020323,0.914312
eval,generated_ChrisZeng/t5-v1_1-base-detox,0.584831,0.50642,0.580486,0.58081,0.080323,0.913919
eval,generated_ChrisZeng/bart-base-detox,0.332699,0.166041,0.328492,0.329628,0.043871,0.882423
eval,naive-detox,0.81798,0.747777,0.81285,0.813122,0.109355,0.950081
test,generated_ChrisZeng/t5-base-detox,0.59351,0.514044,0.589424,0.58932,0.016129,0.913068
test,generated_ChrisZeng/t5-v1_1-base-detox,0.574478,0.496127,0.570753,0.570676,0.079839,0.912746
