In [88]:
!pip install datasets==2.11
!pip install datasets transformers[sentencepiece]
!pip install sacrebleu rouge_score evaluate



In [89]:
from transformers import logging

logging.set_verbosity_error()


In [90]:
from datasets import load_dataset, load_metric
import pandas as pd
import evaluate 
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import DatasetDict
import numpy as np
import evaluate

sacrebleu_metric = load_metric("sacrebleu")
rouge_metric = evaluate.load('rouge')
ter_metric = evaluate.load("ter")

In [91]:
def preprocess_dataset(data):
    conditions = [data.ref_tox < data.trn_tox ]
    values = ['true']
    data['swap'] = np.select(conditions, values)
    
    is_swap = data['swap'] == 'true'
    data.loc[is_swap, ['reference', 'translation', 'ref_tox', 'trn_tox']] = (
        data.loc[is_swap, ['translation', 'reference', 'trn_tox', 'ref_tox']].values
        )
    
    index_drop = data[(data['ref_tox'] <= 0.8) | (data['trn_tox'] >= 0.2) ].index
    data.drop(index_drop, inplace=True)
    data.drop(columns=["swap"], axis=1, inplace=True)
    return data

In [92]:
d = pd.read_table("/kaggle/input/paranmt/filtered.tsv")
d = preprocess_dataset(d)
d.to_csv("converted.csv")

In [93]:
d.head()

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t...",0.785171,0.010309,0.981983,0.014195
1,1,you're becoming disgusting.,Now you're getting nasty.,0.749687,0.071429,0.999039,0.065473
3,3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it.",0.664333,0.309524,0.994215,0.053362
4,4,I have orders to kill her.,I've got orders to put her down.,0.726639,0.181818,0.999348,0.009402
5,5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.703185,0.206522,0.950956,0.035846


In [94]:
raw_datasets = load_dataset("csv", data_files="/kaggle/working/converted.csv")
raw_datasets = raw_datasets['train'].train_test_split(test_size=1-200000/raw_datasets.num_rows["train"], seed=42)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-8f3615130d62798d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-8f3615130d62798d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [95]:
raw_datasets_train = raw_datasets['train'].train_test_split(test_size=0.3, seed=42)
raw_datasets_test = raw_datasets_train['test'].train_test_split(test_size=0.5, seed=42)


ds_splits = DatasetDict({
    'train': raw_datasets_train['train'],
    'valid': raw_datasets_test['train'],
    'test': raw_datasets_test['test']
})

In [96]:
model_checkpoint = "bert-base-uncased"

In [97]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [98]:
def tokenize_function(examples):
    return tokenizer(examples["translation"])

In [99]:
tokenized_datasets = ds_splits.map(tokenize_function, batched=True, num_proc=4,
                                   remove_columns=["Unnamed: 0.1", "Unnamed: 0",
                                                  "reference", "translation", "similarity",
                                                   "lenght_diff", "trn_tox", "ref_tox"
                                                  ])

Map (num_proc=4):   0%|          | 0/140000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/30000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/30000 [00:00<?, ? examples/s]

In [100]:
tokenized_datasets["train"][1]

{'input_ids': [101, 2002, 2001, 4855, 4933, 2043, 1045, 2288, 2045, 1012, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [101]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")
block_size = 128

In [102]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [103]:
ds_splits = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/140000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/30000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/30000 [00:00<?, ? examples/s]

In [104]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(
    output_dir="my_awesome_eli5_mlm_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="none"
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_splits["train"],
    eval_dataset=ds_splits["test"],
    data_collator=data_collator,
)

trainer.train()

{'loss': 5.9812, 'learning_rate': 1.9527186761229316e-05, 'epoch': 0.24}
{'loss': 5.1615, 'learning_rate': 1.905437352245863e-05, 'epoch': 0.47}
{'loss': 4.901, 'learning_rate': 1.8581560283687945e-05, 'epoch': 0.71}
{'loss': 4.7497, 'learning_rate': 1.810874704491726e-05, 'epoch': 0.95}
{'eval_loss': 4.622377395629883, 'eval_runtime': 16.3309, 'eval_samples_per_second': 222.645, 'eval_steps_per_second': 27.861, 'epoch': 1.0}
{'loss': 4.6491, 'learning_rate': 1.7635933806146574e-05, 'epoch': 1.18}
{'loss': 4.5076, 'learning_rate': 1.716312056737589e-05, 'epoch': 1.42}
{'loss': 4.4798, 'learning_rate': 1.6690307328605203e-05, 'epoch': 1.65}
{'loss': 4.4188, 'learning_rate': 1.6217494089834514e-05, 'epoch': 1.89}
{'eval_loss': 4.31002950668335, 'eval_runtime': 16.1663, 'eval_samples_per_second': 224.912, 'eval_steps_per_second': 28.145, 'epoch': 2.0}
{'loss': 4.3831, 'learning_rate': 1.5744680851063832e-05, 'epoch': 2.13}
{'loss': 4.3073, 'learning_rate': 1.5271867612293146e-05, 'epoch':

In [None]:
import numpy as np
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import AutoTokenizer, BertForMaskedLM, BertTokenizer


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("/kaggle/working/my_awesome_eli5_mlm_model/checkpoint-500")

def predict_toxicity(texts, device='cpu', clf_name = 's-nlp/roberta_toxicity_classifier_v1'):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    clf = RobertaForSequenceClassification.from_pretrained(clf_name).to(device)
    clf_tokenizer = RobertaTokenizer.from_pretrained(clf_name)
    with torch.inference_mode():
        inputs = clf_tokenizer(texts, return_tensors='pt', padding=True).to(clf.device)
        out = torch.softmax(clf(**inputs).logits, -1)[:, 1].cpu().numpy()
    return out


def mask_toxic(sentence, threshold=0.3):
    words = sentence.split()
    probabilities = predict_toxicity(words)
    text_prep = []
    toxic_indexes = []
    for _word, _prob in zip(words, probabilities):
        if _prob > threshold:
            text_prep.append("[MASK]")
        else:
            text_prep.append(_word)
    text_prep = " ".join(text_prep)
    tokenized = tokenizer(text_prep, return_tensors="pt")
    return tokenized

def get_mask_index(inputs):
    mask_token_indexes = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    return mask_token_indexes

In [None]:
def infer(sentence):
    inputs = mask_toxic(sentence)
    with torch.no_grad():
        logits = model(**inputs).logits
    mask_indexes = get_mask_index(inputs)
    for mask_token_index in mask_indexes:
        predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        inputs.input_ids[0][mask_token_index] = predicted_token_id
    decoded_sentence = tokenizer.decode(inputs.input_ids[0][1:-1])
    return decoded_sentence

In [None]:
df = pd.read_table('/kaggle/input/paranmt/filtered.tsv')
df.head()

In [None]:
from tqdm import trange
from transformers import logging
import torch

logging.set_verbosity_error()

toxic_sentences = df.reference[:10].tolist()
toxic_sentences_list = [[t] for t in toxic_sentences]
detoxified_text = []
result = {
    "bleu": 0,
    "rouge1": 0,
    "rouge2": 0,
    "TER": 0
}
n = len(toxic_sentences)
for i in trange(len(toxic_sentences)):
    detoxified_sentence = infer(toxic_sentences[i])
    #words = toxic_sentences[i].split()
    #toxic_scores = predict_toxicity(words)
    #detoxified_sentence = " ".join([sentence for sentence, score in zip(words, toxic_scores) if score < threshold])
    detoxified_text.append(detoxified_sentence)

result["bleu"] = sacrebleu_metric.compute(predictions=detoxified_text, references=toxic_sentences_list)["score"]
rouge_score = rouge_metric.compute(predictions=detoxified_text, references=toxic_sentences_list)
result["rouge1"] = rouge_score["rouge1"]
result["rouge2"] = rouge_score["rouge2"]
result["TER"] = ter_metric.compute(predictions=detoxified_text, references=toxic_sentences_list)["score"]

In [None]:
print(result)

In [None]:
infer("He is strange")

In [None]:
import pandas as pd
pd.DataFrame(result, index=[0]).head()