In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"
splits = ["train", "eval", "test"]

# Each split is stored in its own CSV whose name is the base filename with
# "_<split>" inserted before the ".csv" extension
# (e.g. toxic_span_text_pairs_train.csv).
split_datasets = {}
for split in splits:
    split_csv = data_path + "/" + filename.replace(".csv", "_" + split + ".csv")
    split_datasets[split] = Dataset.from_pandas(pd.read_csv(split_csv))

dataset_dict = DatasetDict(split_datasets)

In [3]:
import torch


def detox(tokenizer, model_buffered, batched_inputs):
    """Generate one output text per input text with a seq2seq model.

    Args:
        tokenizer: Callable tokenizer; it is called on the batch to build
            padded "pt" tensors and its ``batch_decode`` turns the generated
            ids back into strings.
        model_buffered: Model exposing ``generate(input_ids=...,
            attention_mask=...)``. Its ``device`` attribute, when present,
            decides where the encoded batch is moved.
        batched_inputs: The batch of input texts handed to the tokenizer.

    Returns:
        dict: ``{"generated": [...]}`` with one decoded string per input.
        A generation that decodes to the empty string is replaced by the
        placeholder ``"<CSD>"``.
    """
    # Follow the model's own device instead of assuming CUDA, so inputs and
    # weights can never end up on different devices. Objects without a
    # ``device`` attribute keep the previous behaviour ("cuda").
    device = getattr(model_buffered, "device", "cuda")

    encoded = tokenizer(
        batched_inputs, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
    ).to(device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        output_sequence = model_buffered.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )

    decoded = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)
    return {"generated": [text if text else "<CSD>" for text in decoded]}


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
from datasets import Dataset

# Run every detox checkpoint over all splits, adding one
# "generated_<checkpoint>" column per model to dataset_dict.
for model_name in ["t5-base", "t5-v1_1-base", "bart-base"]:
    model_name = f"ChrisZeng/{model_name}-detox"

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model_buffered = model.to("cuda")
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    dataset_dict = dataset_dict.map(
        lambda rec: detox(tokenizer, model_buffered, rec["original"]),
        keep_in_memory=True,
        batched=True,
        batch_size=64,
    ).rename_columns({"generated": f"generated_{model_name}"})

    # Module.to() returns the module itself, so `model` and `model_buffered`
    # are the same object: deleting only one name keeps the weights alive on
    # the GPU while the next checkpoint is loaded. Drop both references and
    # hand the cached blocks back to the CUDA allocator.
    del model, model_buffered
    torch.cuda.empty_cache()

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

  0%|          | 0/136 [00:00<?, ?ba/s]

  0%|          | 0/49 [00:00<?, ?ba/s]

  0%|          | 0/59 [00:00<?, ?ba/s]

  0%|          | 0/136 [00:00<?, ?ba/s]

  0%|          | 0/49 [00:00<?, ?ba/s]

  0%|          | 0/59 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/353 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

  0%|          | 0/136 [00:00<?, ?ba/s]

  0%|          | 0/49 [00:00<?, ?ba/s]

  0%|          | 0/59 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['original', 'censored', 'generated_ChrisZeng/t5-base-detox', 'generated_ChrisZeng/t5-v1_1-base-detox', 'generated_ChrisZeng/bart-base-detox'],
        num_rows: 8679
    })
    eval: Dataset({
        features: ['original', 'censored', 'generated_ChrisZeng/t5-base-detox', 'generated_ChrisZeng/t5-v1_1-base-detox', 'generated_ChrisZeng/bart-base-detox'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['original', 'censored', 'generated_ChrisZeng/t5-base-detox', 'generated_ChrisZeng/t5-v1_1-base-detox', 'generated_ChrisZeng/bart-base-detox'],
        num_rows: 3720
    })
})

In [5]:
import re


class SentenceDetoxer(object):
    """Replace whole-term occurrences of known toxic terms with ``"<CSD>"``.

    Matching is case-sensitive and only fires when the term is not directly
    adjacent to another word character on either side.
    """

    def __init__(self, toxic_words):
        """Build the term -> replacement table and the matching regex.

        Args:
            toxic_words: Iterable of terms to censor. Entries that are not
                non-empty strings (e.g. NaN coming from a pandas column) are
                skipped: an empty term would match at every word boundary
                and a non-string cannot be escaped.
        """
        self.replacements = {
            toxic_word: "<CSD>"
            for toxic_word in toxic_words
            if isinstance(toxic_word, str) and toxic_word
        }
        # Lookarounds rather than \b: for terms that start and end with a
        # word character they are equivalent, but \b next to a symbol
        # (e.g. the end of "a$$") would demand an adjacent word character
        # and therefore miss the stand-alone term.
        self.pattern = "|".join(
            r"(?<!\w)%s(?!\w)" % re.escape(s) for s in self.replacements
        )
        # Compile once; with no usable terms there is nothing to match (an
        # empty pattern would match everywhere and fail the table lookup).
        self._regex = re.compile(self.pattern) if self.replacements else None

    def __call__(self, sentence):
        """Return ``sentence`` with every matched term replaced.

        Args:
            sentence: Text to censor.

        Returns:
            The censored text; unchanged when no terms were configured.
        """
        if self._regex is None:
            return sentence
        return self._regex.sub(
            lambda match: self.replacements[match.group(0)], sentence
        )


In [6]:
import pandas as pd

# English Hatebase entries: keep the text before the first "(" of each word
# and trim surrounding whitespace.
hatebase = pd.read_csv("outputs/hatebase.csv")
is_english = hatebase["language"] == "English"
hate_words = hatebase.loc[is_english, "word"]
hate_words = hate_words.str.split("(").str[0].str.strip()

# Combine with the words from outputs/bad-words.csv and drop repeats.
bad_words = pd.read_csv("outputs/bad-words.csv")["word"]
toxic_words = pd.concat([hate_words, bad_words]).drop_duplicates()

detoxer = SentenceDetoxer(toxic_words)

# Add a "naive-detox" column: the original text with listed terms replaced.
dataset_dict = dataset_dict.map(
    lambda example: {"naive-detox": detoxer(example["original"])}
)


  0%|          | 0/8679 [00:00<?, ?ex/s]

  0%|          | 0/3100 [00:00<?, ?ex/s]

  0%|          | 0/3720 [00:00<?, ?ex/s]

In [8]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['original', 'censored', 'generated_ChrisZeng/t5-base-detox', 'generated_ChrisZeng/t5-v1_1-base-detox', 'generated_ChrisZeng/bart-base-detox', 'naive-detox'],
        num_rows: 8679
    })
    eval: Dataset({
        features: ['original', 'censored', 'generated_ChrisZeng/t5-base-detox', 'generated_ChrisZeng/t5-v1_1-base-detox', 'generated_ChrisZeng/bart-base-detox', 'naive-detox'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['original', 'censored', 'generated_ChrisZeng/t5-base-detox', 'generated_ChrisZeng/t5-v1_1-base-detox', 'generated_ChrisZeng/bart-base-detox', 'naive-detox'],
        num_rows: 3720
    })
})

In [None]:
# Write each split to its own CSV (without the pandas index column).
for split, dataset in dataset_dict.items():
    split_csv = f"outputs/generated_outputs_{split}.csv"
    split_frame = dataset.to_pandas()
    split_frame.to_csv(split_csv, index=False)


In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Read outputs/generated_outputs_<split>.csv for every split into a DatasetDict.
reloaded_splits = {}
for split in ["train", "eval", "test"]:
    split_frame = pd.read_csv(f"outputs/generated_outputs_{split}.csv")
    reloaded_splits[split] = Dataset.from_pandas(split_frame)

dataset_dict = DatasetDict(reloaded_splits)



In [2]:
from transformers import pipeline
import pandas as pd
import numpy as np


class ToxicScore(object):
    """Mean first-label score of a classification pipeline over a batch."""

    def __init__(self, score_pipeline):
        """Store the scoring callable.

        Args:
            score_pipeline: Callable that takes the inputs and returns, per
                input, a sequence of ``{"score": ...}`` dicts.
        """
        self.score_pipeline = score_pipeline

    def __call__(self, inputs):
        """Return the mean of the first entry's score across all inputs.

        Args:
            inputs: Texts forwarded unchanged to the stored pipeline.

        Returns:
            ``np.mean`` over ``result[0]["score"]`` for every result.
            NOTE(review): this relies on the pipeline keeping the label of
            interest at position 0 of each result — confirm for the model used.
        """
        # Use the pipeline handed to the constructor; reading a module-level
        # global here would ignore the argument and break when that global
        # is missing or renamed.
        results = self.score_pipeline(inputs)
        return np.mean([result[0]["score"] for result in results])


# Options for the toxicity classifier: sigmoid applied to the outputs, all
# label scores returned per input, run on device 0 in batches of 8.
detoxify_pipeline_options = {
    "model": "unitary/toxic-bert",
    "tokenizer": "bert-base-uncased",
    "function_to_apply": "sigmoid",
    "return_all_scores": True,
    "device": 0,
    "batch_size": 8,
}
detoxify_pipeline = pipeline("text-classification", **detoxify_pipeline_options)

# Scorer wrapping the classifier above.
toxicscore = ToxicScore(detoxify_pipeline)

from datasets import load_metric

bertscore = load_metric("bertscore")



Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
from datasets import load_metric
import numpy as np
import pandas as pd
from IPython.display import display, Pretty

# Load the three evaluation metrics by name.
rouge, exact_match, bertscore = [
    load_metric(metric_name)
    for metric_name in ("rouge", "exact_match", "bertscore")
]


def compute_metrics(original, generated):
    """Score a list of generated texts against their originals.

    Args:
        original: Reference texts passed to BERTScore as ``references``.
        generated: Candidate texts passed to BERTScore as ``predictions``
            and to ``toxicscore``.

    Returns:
        dict with two entries:
            "similarity": mean of the BERTScore "f1" values (lang="en").
            "toxicity": mean of ``toxicscore(generated)``.
    """
    bert_f1 = bertscore.compute(
        predictions=generated, references=original, lang="en"
    )["f1"]
    similarity = np.mean(bert_f1)
    toxicity = np.mean(toxicscore(generated))
    return {"similarity": similarity, "toxicity": toxicity}


# One row of metrics per (split, column) pair.
#
# Passing a dict of scalars together with index=[split, col] to pd.DataFrame
# broadcasts the scalars over BOTH labels, i.e. it yields two identical rows
# (one labelled with the split, one with the column name). Collect the
# results first and label each single row with a (split, column) MultiIndex
# entry instead.
metric_rows = {}
for split, dataset in dataset_dict.items():
    # The reference texts are the same for every column of a split.
    original_texts = dataset["original"]
    for col in dataset.column_names:
        metric_rows[(split, col)] = compute_metrics(original_texts, dataset[col])

metrics = pd.DataFrame(
    list(metric_rows.values()),
    index=pd.MultiIndex.from_tuples(list(metric_rows), names=["split", "column"]),
)

metrics


