This notebook reproduces creation of CondRoBERTuito vocabulary in Google Colaboratory.

This notebook creates the files: `positive-words.txt`, `negative-words.txt`, `toxic_words.txt`, `token_toxicities.txt` and `word2coef.pkl`

# 0. Prerequisites

In [47]:
!python -m spacy download es_core_news_sm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz#egg=es_core_news_sm==2.3.1 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting es_core_news_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz (16.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.2/16.2 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:0

## Setting up

In [48]:
from importlib import reload
import condbert
reload(condbert)

<module 'condbert' from '/mnt/4CC63D9CC63D876C/loncos/Text-Detoxification-in-Spanish/style_transfer/condBERT/condbert.py'>

In [49]:
VOCAB_DIRNAME = 'vocab'

In [50]:
from condbert import CondBertRewriter
from choosers import EmbeddingSimilarityChooser
from multiword.masked_token_predictor_bert import MaskedTokenPredictorBert

# 1. Loading BERT

In [51]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import pickle
import os
from tqdm.auto import tqdm, trange

In [52]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print("cuda")
else:
    device = torch.device('cpu')
    print("cpu")

if device.type == 'cuda':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'


cuda


In [53]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [54]:
from transformers import RobertaForMaskedLM, RobertaTokenizer
model_name = "pysentimiento/robertuito-base-uncased" #Robertuito
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForMaskedLM.from_pretrained(model_name)



In [55]:
#model_name = 'dccuchile/bert-base-spanish-wwm-uncased' #BETO
#tokenizer = BertTokenizer.from_pretrained(model_name)
#model = BertForMaskedLM.from_pretrained(model_name)

In [56]:
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

# 2. Preparing the vocabularires.


- negative-words.txt
- positive-words.txt
- word2coef.pkl
- token_toxicities.txt

These files should be prepared once.

In [57]:
tox_corpus_path = '../../data/processed/toxicCorpusWSpellErr.txt'
norm_corpus_path = '../../data/processed/normalCorpusWSpellErr.txt'

In [58]:
if not os.path.exists(VOCAB_DIRNAME):
    os.makedirs(VOCAB_DIRNAME)

### 2.1 Preparing the DRG-like vocabularies

In [59]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer

custom_stop_words = stopwords.words('spanish')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
import os
import argparse
import numpy as np
from tqdm import tqdm
from nltk import ngrams
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

class NgramSalienceCalculator():
    def __init__(self, tox_corpus, norm_corpus, use_ngrams=False):
        ngrams = (1, 3) if use_ngrams else (1, 1)
        self.vectorizer = CountVectorizer(ngram_range=ngrams, stop_words=custom_stop_words)

        tox_count_matrix = self.vectorizer.fit_transform(tox_corpus)
        self.tox_vocab = self.vectorizer.vocabulary_
        self.tox_counts = np.sum(tox_count_matrix, axis=0)

        norm_count_matrix = self.vectorizer.fit_transform(norm_corpus)
        self.norm_vocab = self.vectorizer.vocabulary_
        self.norm_counts = np.sum(norm_count_matrix, axis=0)

    def salience(self, feature, attribute='tox', lmbda=0.5):
        assert attribute in ['tox', 'norm']
        if feature not in self.tox_vocab:
            tox_count = 0.0
        else:
            tox_count = self.tox_counts[0, self.tox_vocab[feature]]

        if feature not in self.norm_vocab:
            norm_count = 0.0
        else:
            norm_count = self.norm_counts[0, self.norm_vocab[feature]]

        if attribute == 'tox':
            return (tox_count + lmbda) / (norm_count + lmbda)
        else:
            return (norm_count + lmbda) / (tox_count + lmbda)


In [61]:
from collections import Counter
c = Counter()

for fn in [tox_corpus_path, norm_corpus_path]:
    with open(fn, 'r') as corpus:
        for line in corpus.readlines():
            for tok in line.strip().split():
                c[tok] += 1

In [62]:
vocab = {w for w, _ in c.most_common() if _ > 0 and  len(w)>2}  # if we took words with > 1 occurences, vocabulary would be x2 smaller, but we'll survive this size

In [63]:
with open(tox_corpus_path, 'r') as tox_corpus, open(norm_corpus_path, 'r') as norm_corpus:
    corpus_tox = [' '.join([w if w in vocab else '<unk>' for w in line.strip().split()]) for line in tox_corpus.readlines()]
    corpus_norm = [' '.join([w if w in vocab else '<unk>' for w in line.strip().split()]) for line in norm_corpus.readlines()]

In [64]:
neg_out_name = VOCAB_DIRNAME + '/negative-words.txt'
pos_out_name = VOCAB_DIRNAME + '/positive-words.txt'

In [65]:
threshold = 4

In [66]:
import pandas as pd

In [67]:
inmi = pd.read_csv('../../data/lexicons/immigrant_lexicon.txt', sep='\t', names=['token'])
ins = pd.read_csv('../../data/lexicons/insults_lexicon.txt', sep='\t', names=['token'])
miso = pd.read_csv('../../data/lexicons/misogyny_lexicon.txt', sep='\t', names=['token'])
xeno = pd.read_csv('../../data/lexicons/xenophobia_lexicon.txt', sep='\t', names=['token'])
que = pd.read_csv('../../data/lexicons/badwords_quechua.txt', sep='\t', names=['token'])

In [68]:
tokens_lexicon = pd.concat([inmi,ins, miso, xeno,que], axis=0)

In [69]:
tokens_lexicon=tokens_lexicon.reset_index(drop=True)

In [70]:
import spacy
nlp = spacy.load('es_core_news_sm')

def es_adjetivo(palabra):
    doc = nlp(palabra)
    for token in doc:
        if token.pos_ == 'ADJ':
            return True

    return False

In [71]:
sc = NgramSalienceCalculator(corpus_tox, corpus_norm, False)
seen_grams = set(tuple(row) for row in tokens_lexicon.itertuples(index=False))

with open(neg_out_name, 'w') as neg_out, open(pos_out_name, 'w') as pos_out:
    for gram in set(sc.tox_vocab.keys()).union(set(sc.norm_vocab.keys())):
        if gram not in seen_grams and es_adjetivo(gram):
            seen_grams.add(gram)
            toxic_salience = sc.salience(gram, attribute='tox')
            polite_salience = sc.salience(gram, attribute='norm')
            if toxic_salience > threshold:
                neg_out.writelines(f'{gram}\n')
            elif polite_salience > threshold:
                pos_out.writelines(f'{gram}\n')

In [72]:
with open(neg_out_name, 'a') as neg_out:
  for tok in tokens_lexicon["token"]:
    neg_out.writelines(f'{tok}\n')

## 2.2 Evaluating word toxicities with a logistic regression

In [73]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(CountVectorizer(stop_words=custom_stop_words), LogisticRegression(max_iter=1000))

In [74]:
X_train = corpus_tox + corpus_norm
y_train = [1] * len(corpus_tox) + [0] * len(corpus_norm)
pipe.fit(X_train, y_train);

In [75]:
coefs = pipe[1].coef_[0]
coefs.shape

(96051,)

In [76]:
word2coef = {w: coefs[idx] for w, idx in pipe[0].vocabulary_.items()}

In [77]:
for token in tokens_lexicon['token']:
    word2coef[token] = 5

In [78]:
import pickle
with open(VOCAB_DIRNAME + '/word2coef.pkl', 'wb') as f:
    pickle.dump(word2coef, f)

## 2.3 Labelling BERT tokens by toxicity

In [79]:
from collections import defaultdict
toxic_counter = defaultdict(lambda: 1)
nontoxic_counter = defaultdict(lambda: 1)

for text in tqdm(corpus_tox):
    for token in tokenizer.encode(text):
        toxic_counter[token] += 1
for text in tqdm(corpus_norm):
    for token in tokenizer.encode(text):
        nontoxic_counter[token] += 1

100%|██████████| 109372/109372 [00:09<00:00, 11484.72it/s]
100%|██████████| 12166/12166 [00:01<00:00, 11039.20it/s]


In [80]:
token_toxicities = [toxic_counter[i] / (nontoxic_counter[i] + toxic_counter[i]) for i in range(len(tokenizer.vocab))]

In [81]:
with open(VOCAB_DIRNAME + '/token_toxicities.txt', 'w') as f:
    for t in token_toxicities:
        f.write(str(t))
        f.write('\n')

# 3. Setting up the model

### 3.1 Loading the vocabularies

In [82]:
with open(VOCAB_DIRNAME + "/negative-words.txt", "r") as f:
    s = f.readlines()
negative_words = list(map(lambda x: x[:-1], s))

with open(VOCAB_DIRNAME + "/positive-words.txt", "r") as f:
    s = f.readlines()
positive_words = list(map(lambda x: x[:-1], s))

In [83]:
import pickle
with open(VOCAB_DIRNAME + '/word2coef.pkl', 'rb') as f:
    word2coef = pickle.load(f)

In [84]:
token_toxicities = []
with open(VOCAB_DIRNAME + '/token_toxicities.txt', 'r') as f:
    for line in f.readlines():
        token_toxicities.append(float(line))
token_toxicities = np.array(token_toxicities)
token_toxicities = np.maximum(0, np.log(1/(1/token_toxicities-1)))   # log odds ratio

# discourage meaningless tokens
for tok in ['.', ',', '-']:
    token_toxicities[tokenizer.encode(tok)][1] = 3

for tok in ['you']:
    token_toxicities[tokenizer.encode(tok)][1] = 0

In [85]:
def adjust_logits(logits, label=0):
    return logits - token_toxicities * 100 * (1 - 2 * label)

predictor = MaskedTokenPredictorBert(model, tokenizer, max_len=250, device=device, label=0, contrast_penalty=0.0, logits_postprocessor=adjust_logits)

editor = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
    predictor=predictor,
)

The model below is used for reranking BERT hypotheses and helps to increase semantic similarity by choosing the hypotheses with  embeddings similar to the orignal words.

In [86]:
chooser = EmbeddingSimilarityChooser(sim_coef=10, tokenizer=tokenizer)

# 4. Finally, the inference

In [87]:
print(editor.translate('Oye por qué no molestas a tu puta madre?', prnt=False))

oye por qué no molestas, a tu segunda madre?


In [88]:
print(editor.replacement_loop('Oye por qué no molestas a tu puta madre?', verbose=True, chooser=chooser, n_tokens=(1, 2), n_top=10))

['tas'] -> ['tas,']
['▁puta'] -> ['▁pinches', '▁señora']
oye por qué no molestas, a tu pinches señora madre?


Parameters that could be tuned:
* The coeffincient in `adjust_logits` - the larger it is, the more the model avoids toxic words
* The coefficient in `EmbeddingSimilarityChooser` - the larger it is, the more the model tries to preserve content
* n_tokens - how many words can be generated from one
* n_top - how many BERT hypotheses are reranked

In [91]:
print(editor.translate('feminista de mierda', prnt=False))

feminista de mierda.


In [90]:
print(editor.replacement_loop('feminista de mierda', verbose=True, chooser=chooser, n_tokens=(1, 2), n_top=10))

['▁mierda'] -> ['▁mierda,']
feminista de mierda,
