In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"

# Load every text pair from the source CSV into a single Dataset.
dataset = Dataset.from_pandas(pd.read_csv(data_path + "/" + filename))

# Two-stage split with a fixed seed so the partition is reproducible:
#   1. hold out 2/10 of all rows as the "eval" split;
#   2. split the remaining rows again, sending 3/8 of them to "test" and
#      keeping the rest as "train".
# NOTE(review): the row counts recorded in the next cell's output
# (8679 / 3100 / 3720) do not correspond to a 3/8 second-stage split --
# confirm which ratio produced the CSVs on disk before re-running this cell.
holdout_split = dataset.train_test_split(test_size=2 / 10, seed=42)
train_test_split = holdout_split["train"].train_test_split(test_size=3 / 8, seed=42)
dataset_dict = DatasetDict(
    {
        "eval": holdout_split["test"],
        # Contributes the "train" and "test" keys of the second-stage split.
        **train_test_split,
    }
)

# Write one CSV per split next to the source file, named "<stem>_<split>.csv".
# The loop variable is deliberately not called `dataset`, so the full dataset
# loaded above is still bound to that name after the loop finishes.
for split, split_dataset in dataset_dict.items():
    split_path = data_path + "/" + filename.replace(".csv", "_" + split + ".csv")
    split_dataset.to_pandas().to_csv(split_path, index=False)


In [2]:
import json

import pandas as pd
from datasets import Dataset, DatasetDict

# Secrets are kept out of the notebook and read from a local JSON file.
with open("secrets.json", "r") as secrets_file:
    secrets = json.load(secrets_file)

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"
splits = ["train", "eval", "test"]

# Each split lives in its own CSV named "<stem>_<split>.csv" under data_path.
split_paths = {
    split: data_path + "/" + filename.replace(".csv", "_" + split + ".csv")
    for split in splits
}

# Read every split back into a Dataset, keyed by split name.
dataset_dict = DatasetDict(
    {
        split: Dataset.from_pandas(pd.read_csv(split_path))
        for split, split_path in split_paths.items()
    }
)

dataset_dict


DatasetDict({
    train: Dataset({
        features: ['original', 'censored'],
        num_rows: 8679
    })
    eval: Dataset({
        features: ['original', 'censored'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['original', 'censored'],
        num_rows: 3720
    })
})

In [3]:
def encode(tokenizer, input_text, target_text):
    """Tokenize a source/target pair into one encoding with a ``labels`` entry.

    Args:
        tokenizer: Callable tokenizer that also provides the
            ``as_target_tokenizer()`` context manager.
        input_text: Source text, tokenized outside the target context.
        target_text: Target text, tokenized inside the target context.

    Returns:
        The encoding produced for ``input_text``, with ``"labels"`` set to
        the ``"input_ids"`` of the tokenized ``target_text``.
    """
    model_inputs = tokenizer(input_text)
    # The target side is tokenized while the tokenizer is in target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_text)
        model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


from transformers import Seq2SeqTrainingArguments


def get_traning_args(model_name):
    """Build the ``Seq2SeqTrainingArguments`` used to fine-tune ``model_name``.

    Everything up to and including the first ``/`` in ``model_name`` is
    dropped (``"facebook/bart-base"`` becomes ``"bart-base"``). That short
    name plus a ``-detox`` suffix names both the local output directory
    under ``outputs/`` and the Hub model id.

    The Hub token is read from the module-level ``secrets`` mapping under
    the ``"hub_token_write"`` key.

    Args:
        model_name: Model identifier, optionally prefixed with ``<owner>/``.

    Returns:
        A configured ``Seq2SeqTrainingArguments`` instance.
    """
    detox_name = model_name.split("/", 1)[-1] + "-detox"

    # Where results go locally and on the Hub.
    output_options = dict(
        output_dir="outputs/" + detox_name,
        overwrite_output_dir=True,
        push_to_hub=True,
        hub_strategy="all_checkpoints",
        hub_model_id=detox_name,
        hub_token=secrets["hub_token_write"],
    )

    # Optimisation schedule: 4 per device x 16 accumulation steps per update.
    optimisation_options = dict(
        num_train_epochs=20,
        learning_rate=1e-5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=16,
        optim="adamw_apex_fused",
        gradient_checkpointing=True,
    )

    # Numeric precision settings.
    precision_options = dict(
        bf16=True,
        bf16_full_eval=True,
        tf32=True,
    )

    # Logging, evaluation and checkpointing all happen once per epoch; the
    # checkpoint with the best "eval_loss" is reloaded when training ends.
    evaluation_options = dict(
        eval_accumulation_steps=128,
        predict_with_generate=True,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    # Data loading behaviour.
    data_options = dict(
        dataloader_num_workers=3,
        remove_unused_columns=True,
    )

    return Seq2SeqTrainingArguments(
        **output_options,
        **optimisation_options,
        **precision_options,
        **evaluation_options,
        **data_options,
    )


In [4]:
# `os` is imported before its first use below; previously the import came
# after the first `os.environ` assignment, which raises NameError on a fresh
# kernel.
import os

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
)

model_name = "facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Register the "<CSD>" token with the tokenizer and resize the model's token
# embeddings to the new tokenizer length.
tokenizer.add_tokens("<CSD>")
model.resize_token_embeddings(len(tokenizer))
# NOTE(review): presumably disabled because the training arguments turn on
# gradient_checkpointing -- confirm.
model.config.update({"use_cache": False})

# Tokenizer parallelism is switched on for the one-off encoding pass ...
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Tokenize every split: "original" is the model input, "censored" the target.
encoding = dataset_dict.map(
    lambda rec: encode(tokenizer, rec["original"], rec["censored"]),
    keep_in_memory=True,
)

# ... and switched off again before the Trainer is created.
# NOTE(review): presumably to avoid clashing with the dataloader worker
# processes requested in the training arguments -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

trainer = Seq2SeqTrainer(
    args=get_traning_args(model_name),
    model=model,
    tokenizer=tokenizer,
    # Pad each batch to its longest sequence, rounded up to a multiple of 8.
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer, model=model, padding="longest", pad_to_multiple_of=8
    ),
    train_dataset=encoding["train"],
    eval_dataset=encoding["eval"],
)


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


  0%|          | 0/8679 [00:00<?, ?ex/s]

  0%|          | 0/3100 [00:00<?, ?ex/s]

  0%|          | 0/3720 [00:00<?, ?ex/s]

/home/chris-zeng/csci544-project/outputs/bart-base-detox is already a clone of https://huggingface.co/ChrisZeng/bart-base-detox. Make sure you pull the latest changes with `repo.git_pull()`.
Using amp half precision backend


In [6]:
import os

from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start training from scratch. Passing `resume_from_checkpoint=True`
# unconditionally fails on a clean run because there is nothing to resume.
output_dir = trainer.args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)


Loading model from outputs/bart-base-detox/checkpoint-1350).
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: censored, original.
***** Running training *****
  Num examples = 8679
  Num Epochs = 20
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 2700
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 10
  Continuing training from global step 1350
  Will skip the first 10 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]

Epoch,Training Loss,Validation Loss
10,0.1798,0.185777
11,0.1745,0.181999
12,0.1689,0.182661
13,0.1707,0.18428
14,0.1658,0.183421
15,0.1647,0.182027
16,0.1645,0.183651
17,0.1633,0.181359
18,0.1612,0.181473
19,0.1603,0.181861


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: censored, original.
***** Running Evaluation *****
  Num examples = 3100
  Batch size = 8
Saving model checkpoint to outputs/bart-base-detox/checkpoint-1485
Configuration saved in outputs/bart-base-detox/checkpoint-1485/config.json
Model weights saved in outputs/bart-base-detox/checkpoint-1485/pytorch_model.bin
tokenizer config file saved in outputs/bart-base-detox/checkpoint-1485/tokenizer_config.json
Special tokens file saved in outputs/bart-base-detox/checkpoint-1485/special_tokens_map.json
tokenizer config file saved in outputs/bart-base-detox/tokenizer_config.json
Special tokens file saved in outputs/bart-base-detox/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: censored, original.
***** Running Evaluation *****
  Nu

TrainOutput(global_step=2700, training_loss=0.08368111080593534, metrics={'train_runtime': 2197.9803, 'train_samples_per_second': 78.973, 'train_steps_per_second': 1.228, 'total_flos': 1.075972393746432e+16, 'train_loss': 0.08368111080593534, 'epoch': 20.0})

In [7]:
# Save the final model through the trainer. The recorded output of this cell
# shows the files being written to outputs/bart-base-detox and then pushed to
# the Hub repository.
trainer.save_model()
# Drop the notebook's reference to the trainer.
# NOTE(review): `model` and `tokenizer` are still bound in the notebook, so
# this alone presumably does not release the model's memory -- confirm whether
# freeing memory is the intent here.
del trainer

Saving model checkpoint to outputs/bart-base-detox
Configuration saved in outputs/bart-base-detox/config.json
Model weights saved in outputs/bart-base-detox/pytorch_model.bin
tokenizer config file saved in outputs/bart-base-detox/tokenizer_config.json
Special tokens file saved in outputs/bart-base-detox/special_tokens_map.json
Saving model checkpoint to outputs/bart-base-detox
Configuration saved in outputs/bart-base-detox/config.json
Model weights saved in outputs/bart-base-detox/pytorch_model.bin
tokenizer config file saved in outputs/bart-base-detox/tokenizer_config.json
Special tokens file saved in outputs/bart-base-detox/special_tokens_map.json


Upload file checkpoint-2700/optimizer.pt:   0%|          | 32.0k/1.04G [00:00<?, ?B/s]

Upload file checkpoint-2700/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-2700/pytorch_model.bin:   0%|          | 32.0k/532M [00:00<?, ?B/s]

Upload file checkpoint-2700/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

Upload file checkpoint-2700/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

To https://huggingface.co/ChrisZeng/bart-base-detox
   22aac39..c9e4d58  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}}
To https://huggingface.co/ChrisZeng/bart-base-detox
   c9e4d58..c2af807  main -> main



In [8]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
import torch
from datasets import Dataset


def detox(tokenizer, model_buffered, batched_inputs):
    """Generate one output text for every string in ``batched_inputs``.

    Args:
        tokenizer: Tokenizer used both to encode the inputs (as PyTorch
            tensors, padded to the longest input rounded up to a multiple
            of 8) and to decode the generated sequences.
        model_buffered: Model exposing ``generate``. The encoded inputs are
            moved to ``model_buffered.device`` when that attribute exists,
            and to ``"cuda"`` otherwise.
        batched_inputs: Batch of input strings.

    Returns:
        A dict with the single key ``"generated"`` holding the decoded
        texts (special tokens skipped). A text that decodes to the empty
        string is replaced by the literal ``"<CSD>"``.
    """
    # Follow the model's own device instead of assuming CUDA, so the same
    # function also works for a model kept on CPU or on another device.
    device = getattr(model_buffered, "device", "cuda")
    model_inputs = tokenizer(
        batched_inputs, padding="longest", pad_to_multiple_of=8, return_tensors="pt"
    ).to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        output_sequence = model_buffered.generate(
            input_ids=model_inputs["input_ids"],
            attention_mask=model_inputs["attention_mask"],
        )
    # Drop the reference to the encoded inputs before decoding.
    del model_inputs

    decoded_texts = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)
    return {"generated": [text if text else "<CSD>" for text in decoded_texts]}


import pandas as pd
from datasets import Dataset, DatasetDict

data_path = "./data/toxic_spans"
filename = "toxic_span_text_pairs.csv"
splits = ["train", "eval", "test"]


def _split_csv_path(split):
    """Return the path of the CSV holding ``split`` ("<stem>_<split>.csv")."""
    return data_path + "/" + filename.replace(".csv", "_" + split + ".csv")


# Reload every split from disk, keyed by split name.
dataset_dict = DatasetDict(
    {
        split: Dataset.from_pandas(pd.read_csv(_split_csv_path(split)))
        for split in splits
    }
)

model_name = "ChrisZeng/t5-v1_1-base-detox"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the model to the GPU; inputs are padded on the left, using the
# end-of-sequence token as the padding token.
model_buffered = model.to("cuda")
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token


def _detox_batch(rec):
    """Run ``detox`` on the "original" column of one batch of records."""
    return detox(tokenizer, model_buffered, rec["original"])


# Add a "generated" column to every split, 64 records at a time.
dataset_dict = dataset_dict.map(
    _detox_batch,
    batched=True,
    batch_size=64,
    keep_in_memory=True,
)

# NOTE(review): `model` is still bound after this, so deleting this name
# presumably does not free the GPU copy on its own -- confirm.
del model_buffered


loading configuration file https://huggingface.co/ChrisZeng/t5-v1_1-base-detox/resolve/main/config.json from cache at /home/chris-zeng/.cache/huggingface/transformers/3d051d400035f2ca53580da38002eb0d2b7e188715710019c196e53ce2863c9c.4216d65d19bd74d36444d35fc3c0231b7f18539c9abaa91f81207dcadbf71eb9
Model config T5Config {
  "_name_or_path": "ChrisZeng/t5-v1_1-base-detox",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "use_cache": false,
  "vocab_size": 32101
}

loa

  0%|          | 0/136 [00:00<?, ?ba/s]

  0%|          | 0/49 [00:00<?, ?ba/s]

  0%|          | 0/59 [00:00<?, ?ba/s]

In [9]:
from datasets import load_metric
import numpy as np
import pandas as pd
from IPython.display import display, Pretty

# Load the three evaluation metrics, in this order, under their own names.
rouge, exact_match, bertscore = (
    load_metric(metric_name) for metric_name in ("rouge", "exact_match", "bertscore")
)


def compute_metrics(predictions, targets):
    """Score ``predictions`` against ``targets`` with the loaded metrics.

    Uses the module-level ``rouge``, ``exact_match`` and ``bertscore``
    metric objects.

    Args:
        predictions: Generated texts.
        targets: Reference texts, passed to each metric as ``references``.

    Returns:
        A dict containing, in order: one entry per key returned by
        ``rouge.compute`` (the ``mid.fmeasure`` of each aggregate),
        ``"exact_match_rate"`` (the ``"exact_match"`` value scaled by 0.01,
        presumably converting a percentage into a fraction), and
        ``"mean_bertscore_f1"`` (the mean of the BERTScore ``"f1"`` values,
        computed with ``lang="en"``).
    """
    rouge_result = rouge.compute(predictions=predictions, references=targets)
    scores = {name: aggregate.mid.fmeasure for name, aggregate in rouge_result.items()}

    exact_match_result = exact_match.compute(predictions=predictions, references=targets)
    scores["exact_match_rate"] = 0.01 * exact_match_result["exact_match"]

    bertscore_result = bertscore.compute(
        predictions=predictions, references=targets, lang="en"
    )
    scores["mean_bertscore_f1"] = np.mean(bertscore_result["f1"])

    return scores


# Build one single-row frame of scores per split (indexed by the split name),
# comparing the "generated" column against the "censored" column, then stack
# the rows into one table.
per_split_frames = []
for split in splits:
    split_scores = compute_metrics(
        dataset_dict[split]["generated"], dataset_dict[split]["censored"]
    )
    per_split_frames.append(pd.DataFrame(split_scores, index=[split]))

metrics = pd.concat(per_split_frames)

metrics


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /home/chris-zeng/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,exact_match_rate,mean_bertscore_f1
train,0.574129,0.498884,0.569738,0.569856,0.0916,0.909635
eval,0.584921,0.506518,0.580766,0.581342,0.080323,0.910855
test,0.575102,0.496492,0.570006,0.570352,0.079839,0.909516
