### CondBERT  

In this notebook we will be performing the detoxification task using the CondBERT model:

In [14]:
!git clone https://github.com/s-nlp/detox

fatal: destination path 'detox' already exists and is not an empty directory.


In [15]:
%pip install -r /content/detox/requirements.txt -q

In [16]:
import pandas as pd

dataset = pd.read_csv("/content/separated_tox.csv")
dataset = dataset.set_index(dataset.columns[0])
dataset.index.name = "Index"
dataset.head()

Unnamed: 0_level_0,toxic,non-toxic,old_toxicity,new_toxicity
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"if alkar floods her with her mental waste, it ...","if alkar is flooding her with psychic waste, t...",0.981983,0.014195
1,you're becoming disgusting.,now you're getting nasty.,0.999039,0.065473
2,"well, we can spare your life.","well, we could spare your life, for one.",0.985068,0.213313
3,"monkey, you have to wake up.","ah! monkey, you've got to snap out of it.",0.994215,0.053362
4,i have orders to kill her.,i've got orders to put her down.,0.999348,0.009402


In [17]:
from sklearn.model_selection import train_test_split

train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=36)

First, let us see how CondBERT works without pre-training on the current dataset:

In [20]:
import os
import sys

def add_sys_path(p):
    p = os.path.abspath(p)
    print(p)
    if p not in sys.path:
        sys.path.append(p)

add_sys_path('/content/detox/emnlp2021/style_transfer/condBERT')

from importlib import reload
import condbert
reload(condbert)
from condbert import CondBertRewriter
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import pickle
from tqdm.auto import tqdm, trange

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

/content/detox/emnlp2021/style_transfer/condBERT


In [21]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
model.to(device);

vocab_root = '/content/detox/emnlp2021/style_transfer/condBERT/vocab/'

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
with open(vocab_root + "negative-words.txt", "r") as f:
    s = f.readlines()
negative_words = list(map(lambda x: x[:-1], s))
with open(vocab_root + "toxic_words.txt", "r") as f:
    ss = f.readlines()
negative_words += list(map(lambda x: x[:-1], ss))

with open(vocab_root + "positive-words.txt", "r") as f:
    s = f.readlines()
positive_words = list(map(lambda x: x[:-1], s))

In [23]:
import pickle
with open(vocab_root + 'word2coef.pkl', 'rb') as f:
    word2coef = pickle.load(f)

In [24]:
token_toxicities = []
with open(vocab_root + 'token_toxicities.txt', 'r') as f:
    for line in f.readlines():
        token_toxicities.append(float(line))
token_toxicities = np.array(token_toxicities)
token_toxicities = np.maximum(0, np.log(1/(1/token_toxicities-1)))   # log odds ratio

# discourage meaningless tokens
for tok in ['.', ',', '-']:
    token_toxicities[tokenizer.encode(tok)][1] = 3

for tok in ['you']:
    token_toxicities[tokenizer.encode(tok)][1] = 0

In [25]:
reload(condbert)
from condbert import CondBertRewriter

editor_1 = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
)

In [26]:
editor_1.translate("you're becoming disgusting.")

you're becoming disgusting.


"you ' re becoming sanitary ."

In [27]:
editor_1.translate("well, we can spare your life.")

well, we can spare your life.	


'well , we can spare their life .'

In [28]:
editor_1.translate("monkey, you have to wake up.")

monkey, you have to wake up.	


'. , you have to wake up .'

In [29]:
original_sentences = list(test_dataset['toxic'])

In [None]:
translated_sentences = []

for i, line in enumerate(tqdm(original_sentences)):
    inp = line.strip()
    out = editor_1.translate(inp, prnt=False).strip()
    translated_sentences.append(out)

  0%|          | 0/115556 [00:00<?, ?it/s]

In [None]:
with open('results1.txt', 'w') as file:
    for item in translated_sentences:
        file.write("%s\n" % item)

### Pre-training the model  

The model can be pre-trained on our data using the training template that can be found here: https://github.com/s-nlp/detox/blob/main/emnlp2021/style_transfer/condBERT/condbert_compile_vocab.ipynb.

In [None]:
from choosers import EmbeddingSimilarityChooser
from multiword.masked_token_predictor_bert import MaskedTokenPredictorBert

In [None]:
import os
import argparse
import numpy as np
from tqdm import tqdm
from nltk import ngrams
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



class NgramSalienceCalculator():
    def __init__(self, tox_corpus, norm_corpus, use_ngrams=False):
        ngrams = (1, 3) if use_ngrams else (1, 1)
        self.vectorizer = CountVectorizer(ngram_range=ngrams)

        tox_count_matrix = self.vectorizer.fit_transform(tox_corpus)
        self.tox_vocab = self.vectorizer.vocabulary_
        self.tox_counts = np.sum(tox_count_matrix, axis=0)

        norm_count_matrix = self.vectorizer.fit_transform(norm_corpus)
        self.norm_vocab = self.vectorizer.vocabulary_
        self.norm_counts = np.sum(norm_count_matrix, axis=0)

    def salience(self, feature, attribute='tox', lmbda=0.5):
        assert attribute in ['tox', 'norm']
        if feature not in self.tox_vocab:
            tox_count = 0.0
        else:
            tox_count = self.tox_counts[0, self.tox_vocab[feature]]

        if feature not in self.norm_vocab:
            norm_count = 0.0
        else:
            norm_count = self.norm_counts[0, self.norm_vocab[feature]]

        if attribute == 'tox':
            return (tox_count + lmbda) / (norm_count + lmbda)
        else:
            return (norm_count + lmbda) / (tox_count + lmbda)

In [None]:
from collections import Counter
c = Counter()

# read words from our portion of the dataset
for fn in [dataset['toxic'], dataset['non-toxic']]:
    for line in fn:
        for tok in line.strip().split():
            c[tok] += 1

neg_out_name = "/content/detox/emnlp2021/style_transfer/condBERT/vocab/negative-words.txt"
pos_out_name = "/content/detox/emnlp2021/style_transfer/condBERT/vocab/positive-words.txt"

# read words that already are in the dictionary
with open(neg_out_name, 'r') as neg_out, open(pos_out_name, 'r') as pos_out:
    existant_pos_words = pos_out.readlines()
    for line in existant_pos_words:
        for tok in line.strip().split():
            c[tok] += 1
    existant_neg_words = neg_out.readlines()
    for line in existant_neg_words:
        for tok in line.strip().split():
            c[tok] += 1

print(len(c))

In [None]:
vocab = {w for w, _ in c.most_common() if _ > 0}  # if we took words with > 1 occurences, vocabulary would be x2 smaller, but we'll survive this size
print(len(vocab))

In [None]:
corpus_tox = [' '.join([w if w in vocab else '<unk>' for w in line.strip().split()]) for line in dataset['toxic']]
corpus_norm = [' '.join([w if w in vocab else '<unk>' for w in line.strip().split()]) for line in dataset['non-toxic']]

In [None]:
threshold = 4

In [None]:
sc = NgramSalienceCalculator(corpus_tox, corpus_norm, False)
seen_grams = set()

with open(neg_out_name, 'a') as neg_out, open(pos_out_name, 'a') as pos_out:
    for gram in set(sc.tox_vocab.keys()).union(set(sc.norm_vocab.keys())):
        if gram not in seen_grams:
            seen_grams.add(gram)
            toxic_salience = sc.salience(gram, attribute='tox')
            polite_salience = sc.salience(gram, attribute='norm')
            if toxic_salience > threshold:
                neg_out.writelines(f'{gram}\n')
            elif polite_salience > threshold:
                pos_out.writelines(f'{gram}\n')

We will now proceed to evaluate word toxicities with a logistic regression

In [None]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=10000))

In [None]:
X_train = corpus_tox + corpus_norm
y_train = [1] * len(corpus_tox) + [0] * len(corpus_norm)
pipe.fit(X_train, y_train)

In [None]:
coefs = pipe[1].coef_[0]
coefs.shape

In [None]:
word2coef = {w: coefs[idx] for w, idx in pipe[0].vocabulary_.items()}

In [None]:
import pickle
with open(vocab_root + '/word2coef_2.pkl', 'wb') as f:
    pickle.dump(word2coef, f)

In [None]:
from collections import defaultdict
toxic_counter = defaultdict(lambda: 1)
nontoxic_counter = defaultdict(lambda: 1)

for text in tqdm(corpus_tox):
    for token in tokenizer.encode(text):
        toxic_counter[token] += 1
for text in tqdm(corpus_norm):
    for token in tokenizer.encode(text):
        nontoxic_counter[token] += 1

In [None]:
token_toxicities = [toxic_counter[i] / (nontoxic_counter[i] + toxic_counter[i]) for i in range(len(tokenizer.vocab))]

In [None]:
with open(vocab_root + '/token_toxicities_2.txt', 'w') as f:
    for t in token_toxicities:
        f.write(str(t))
        f.write('\n')

Let us once again set up the model after increasing the vocabulary size

In [None]:
with open(vocab_root + "/negative-words.txt", "r") as f:
    s = f.readlines()
negative_words = list(map(lambda x: x[:-1], s))

with open(vocab_root + "/positive-words.txt", "r") as f:
    s = f.readlines()
positive_words = list(map(lambda x: x[:-1], s))

In [None]:
import pickle
with open(vocab_root + '/word2coef_2.pkl', 'rb') as f:
    word2coef = pickle.load(f)

In [None]:
token_toxicities = []
with open(vocab_root + '/token_toxicities_2.txt', 'r') as f:
    for line in f.readlines():
        token_toxicities.append(float(line))
token_toxicities = np.array(token_toxicities)
token_toxicities = np.maximum(0, np.log(1/(1/token_toxicities-1)))   # log odds ratio

# discourage meaningless tokens
for tok in ['.', ',', '-']:
    token_toxicities[tokenizer.encode(tok)][1] = 3

for tok in ['you']:
    token_toxicities[tokenizer.encode(tok)][1] = 0

In [None]:
def adjust_logits(logits, label=0):
    return logits - token_toxicities * 100 * (1 - 2 * label)

predictor = MaskedTokenPredictorBert(model, tokenizer, max_len=250, device=device, label=0, contrast_penalty=0.0, logits_postprocessor=adjust_logits)

editor = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
    predictor=predictor,
)

In [None]:
# will be used to ensure that replacements chosen by BERT will be semantically
# similar with the replaced variables

chooser = EmbeddingSimilarityChooser(sim_coef=10, tokenizer=tokenizer)

In [None]:
print(editor.translate('You are a stupid person!', prnt=False))

In [None]:
print(editor.replacement_loop('You are stupid!', verbose=False, chooser=chooser, n_tokens=(1, 2, 3), n_top=10))

In [None]:
translated_sentences_2 = []

for i, line in enumerate(tqdm(original_sentences)):
    inp = line.strip()
    out = editor.translate(inp, prnt=False)
    translated_sentences_2.append(out)

In [None]:
with open('results3.txt', 'w') as file:
    for item in translated_sentences_2:
        file.write("%s\n" % item)

In [None]:
editor.translate("you're becoming disgusting.")

In [None]:
editor.translate("well, we can spare your life.")

In [None]:
editor.translate("monkey, you have to wake up.")