This notebook reproduces creation of CondRoBERTuito vocabulary in Google Colaboratory.

This notebook creates the files: `positive-words.txt`, `negative-words.txt`, `toxic_words.txt`, `token_toxicities.txt` and `word2coef.pkl`

# 0. Prerequisites

In [1]:
!python -m spacy download es_core_news_sm

[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz#egg=es_core_news_sm==2.3.1 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting es_core_news_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz (16.2 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m16.2/16.2 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[0m[38;5;2m‚úî Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')


## Setting up

In [2]:
from importlib import reload
import condbert
reload(condbert)

<module 'condbert' from '/mnt/4CC63D9CC63D876C/loncos/Text-Detoxification-in-Spanish/style_transfer/condBERT/condbert.py'>

In [3]:
VOCAB_DIRNAME = 'vocab'

In [4]:
from condbert import CondBertRewriter
from choosers import EmbeddingSimilarityChooser
from multiword.masked_token_predictor_bert import MaskedTokenPredictorBert

  from .autonotebook import tqdm as notebook_tqdm


# 1. Loading BERT

In [5]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import pickle
import os
from tqdm.auto import tqdm, trange

In [6]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print("cuda")
else:
    device = torch.device('cpu')
    print("cpu")

if device.type == 'cuda':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'


cuda


In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [8]:
from transformers import RobertaForMaskedLM, RobertaTokenizer
model_name = "pysentimiento/robertuito-base-uncased" #Robertuito
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForMaskedLM.from_pretrained(model_name)



In [9]:
#model_name = 'dccuchile/bert-base-spanish-wwm-uncased' #BETO
#tokenizer = BertTokenizer.from_pretrained(model_name)
#model = BertForMaskedLM.from_pretrained(model_name)

In [10]:
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

# 2. Preparing the vocabularires.


- negative-words.txt
- positive-words.txt
- word2coef.pkl
- token_toxicities.txt

These files should be prepared once.

In [11]:
tox_corpus_path = '../../data/processed/toxicCorpusWSpellErr.txt'
norm_corpus_path = '../../data/processed/normalCorpusWSpellErr.txt'

In [12]:
if not os.path.exists(VOCAB_DIRNAME):
    os.makedirs(VOCAB_DIRNAME)

### 2.1 Preparing the DRG-like vocabularies

In [13]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer

custom_stop_words = stopwords.words('spanish')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
import os
import argparse
import numpy as np
from tqdm import tqdm
from nltk import ngrams
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

class NgramSalienceCalculator():
    def __init__(self, tox_corpus, norm_corpus, use_ngrams=False):
        ngrams = (1, 3) if use_ngrams else (1, 1)
        self.vectorizer = CountVectorizer(ngram_range=ngrams, stop_words=custom_stop_words)

        tox_count_matrix = self.vectorizer.fit_transform(tox_corpus)
        self.tox_vocab = self.vectorizer.vocabulary_
        self.tox_counts = np.sum(tox_count_matrix, axis=0)

        norm_count_matrix = self.vectorizer.fit_transform(norm_corpus)
        self.norm_vocab = self.vectorizer.vocabulary_
        self.norm_counts = np.sum(norm_count_matrix, axis=0)

    def salience(self, feature, attribute='tox', lmbda=0.5):
        assert attribute in ['tox', 'norm']
        if feature not in self.tox_vocab:
            tox_count = 0.0
        else:
            tox_count = self.tox_counts[0, self.tox_vocab[feature]]

        if feature not in self.norm_vocab:
            norm_count = 0.0
        else:
            norm_count = self.norm_counts[0, self.norm_vocab[feature]]

        if attribute == 'tox':
            return (tox_count + lmbda) / (norm_count + lmbda)
        else:
            return (norm_count + lmbda) / (tox_count + lmbda)


In [15]:
from collections import Counter
c = Counter()

for fn in [tox_corpus_path, norm_corpus_path]:
    with open(fn, 'r') as corpus:
        for line in corpus.readlines():
            for tok in line.strip().split():
                c[tok] += 1

In [16]:
vocab = {w for w, _ in c.most_common() if _ > 0 and  len(w)>2}  # if we took words with > 1 occurences, vocabulary would be x2 smaller, but we'll survive this size

In [17]:
with open(tox_corpus_path, 'r') as tox_corpus, open(norm_corpus_path, 'r') as norm_corpus:
    corpus_tox = [' '.join([w if w in vocab else '<unk>' for w in line.strip().split()]) for line in tox_corpus.readlines()]
    corpus_norm = [' '.join([w if w in vocab else '<unk>' for w in line.strip().split()]) for line in norm_corpus.readlines()]

In [18]:
neg_out_name = VOCAB_DIRNAME + '/negative-words.txt'
pos_out_name = VOCAB_DIRNAME + '/positive-words.txt'

In [19]:
threshold = 4

In [20]:
import pandas as pd

In [21]:
inmi = pd.read_csv('../../data/lexicons/immigrant_lexicon.txt', sep='\t', names=['token'])
ins = pd.read_csv('../../data/lexicons/insults_lexicon.txt', sep='\t', names=['token'])
miso = pd.read_csv('../../data/lexicons/misogyny_lexicon.txt', sep='\t', names=['token'])
xeno = pd.read_csv('../../data/lexicons/xenophobia_lexicon.txt', sep='\t', names=['token'])
que = pd.read_csv('../../data/lexicons/badwords_quechua.txt', sep='\t', names=['token'])

In [22]:
tokens_lexicon = pd.concat([inmi,ins, miso, xeno,que], axis=0)

In [23]:
tokens_lexicon=tokens_lexicon.reset_index(drop=True)

In [24]:
import spacy
nlp = spacy.load('es_core_news_sm')

def es_adjetivo(palabra):
    doc = nlp(palabra)
    for token in doc:
        if token.pos_ == 'ADJ':
            return True

    return False

In [25]:
sc = NgramSalienceCalculator(corpus_tox, corpus_norm, False)
seen_grams = set(tuple(row) for row in tokens_lexicon.itertuples(index=False))

with open(neg_out_name, 'w') as neg_out, open(pos_out_name, 'w') as pos_out:
    for gram in set(sc.tox_vocab.keys()).union(set(sc.norm_vocab.keys())):
        if gram not in seen_grams and es_adjetivo(gram):
            seen_grams.add(gram)
            toxic_salience = sc.salience(gram, attribute='tox')
            polite_salience = sc.salience(gram, attribute='norm')
            if toxic_salience > threshold:
                neg_out.writelines(f'{gram}\n')
            elif polite_salience > threshold:
                pos_out.writelines(f'{gram}\n')

In [26]:
with open(neg_out_name, 'a') as neg_out:
  for tok in tokens_lexicon["token"]:
    neg_out.writelines(f'{tok}\n')

## 2.2 Evaluating word toxicities with a logistic regression

In [27]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(CountVectorizer(stop_words=custom_stop_words), LogisticRegression(max_iter=1000))

In [28]:
X_train = corpus_tox + corpus_norm
y_train = [1] * len(corpus_tox) + [0] * len(corpus_norm)
pipe.fit(X_train, y_train);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
coefs = pipe[1].coef_[0]
coefs.shape

(387168,)

In [30]:
word2coef = {w: coefs[idx] for w, idx in pipe[0].vocabulary_.items()}

In [31]:
for token in tokens_lexicon['token']:
    word2coef[token] = 5

In [32]:
import pickle
with open(VOCAB_DIRNAME + '/word2coef.pkl', 'wb') as f:
    pickle.dump(word2coef, f)

## 2.3 Labelling BERT tokens by toxicity

In [33]:
from collections import defaultdict
toxic_counter = defaultdict(lambda: 1)
nontoxic_counter = defaultdict(lambda: 1)

for text in tqdm(corpus_tox):
    for token in tokenizer.encode(text):
        toxic_counter[token] += 1
for text in tqdm(corpus_norm):
    for token in tokenizer.encode(text):
        nontoxic_counter[token] += 1

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 142907/142907 [00:11<00:00, 12231.72it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 251944/251944 [00:37<00:00, 6751.47it/s]


In [34]:
token_toxicities = [toxic_counter[i] / (nontoxic_counter[i] + toxic_counter[i]) for i in range(len(tokenizer.vocab))]

In [35]:
with open(VOCAB_DIRNAME + '/token_toxicities.txt', 'w') as f:
    for t in token_toxicities:
        f.write(str(t))
        f.write('\n')

# 3. Setting up the model

### 3.1 Loading the vocabularies

In [36]:
with open(VOCAB_DIRNAME + "/negative-words.txt", "r") as f:
    s = f.readlines()
negative_words = list(map(lambda x: x[:-1], s))

with open(VOCAB_DIRNAME + "/positive-words.txt", "r") as f:
    s = f.readlines()
positive_words = list(map(lambda x: x[:-1], s))

In [37]:
import pickle
with open(VOCAB_DIRNAME + '/word2coef.pkl', 'rb') as f:
    word2coef = pickle.load(f)

In [38]:
token_toxicities = []
with open(VOCAB_DIRNAME + '/token_toxicities.txt', 'r') as f:
    for line in f.readlines():
        token_toxicities.append(float(line))
token_toxicities = np.array(token_toxicities)
token_toxicities = np.maximum(0, np.log(1/(1/token_toxicities-1)))   # log odds ratio

# discourage meaningless tokens
for tok in ['.', ',', '-']:
    token_toxicities[tokenizer.encode(tok)][1] = 3

for tok in ['you']:
    token_toxicities[tokenizer.encode(tok)][1] = 0

In [39]:
def adjust_logits(logits, label=0):
    return logits - token_toxicities * 100 * (1 - 2 * label)

predictor = MaskedTokenPredictorBert(model, tokenizer, max_len=250, device=device, label=0, contrast_penalty=0.0, logits_postprocessor=adjust_logits)

editor = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
    predictor=predictor,
)

The model below is used for reranking BERT hypotheses and helps to increase semantic similarity by choosing the hypotheses with  embeddings similar to the orignal words.

In [40]:
chooser = EmbeddingSimilarityChooser(sim_coef=10, tokenizer=tokenizer)

# 4. Finally, the inference

In [41]:
print(editor.translate('Oye por qu√© no molestas a tu puta madre?', prnt=False))

oye por qu√© no molestas a tu santa madre?


In [42]:
print(editor.replacement_loop('Oye por qu√© no molestas a tu puta madre?', verbose=True, chooser=chooser, n_tokens=(1, 2), n_top=10))

['tas'] -> ['tas']
['‚ñÅputa'] -> ['‚ñÅp', '***']
oye por qu√© no molestas a tu p*** madre?


Parameters that could be tuned:
* The coeffincient in `adjust_logits` - the larger it is, the more the model avoids toxic words
* The coefficient in `EmbeddingSimilarityChooser` - the larger it is, the more the model tries to preserve content
* n_tokens - how many words can be generated from one
* n_top - how many BERT hypotheses are reranked

In [43]:
print(editor.translate('feminista de mierda', prnt=False))

feminista de mierda.


In [44]:
print(editor.replacement_loop('feminista de mierda', verbose=True, chooser=chooser, n_tokens=(1, 2), n_top=10))

['‚ñÅmierda'] -> ['‚ñÅvida']
feminista de vida


# Evaluation

In [71]:
dataTest = pd.read_csv('../../data/processed/train_es.tsv', sep='\t')

In [72]:
dataTest

Unnamed: 0,id,text,HS,TR,AG
0,20001,Easyjet quiere duplicar el n√∫mero de mujeres p...,1,0,0
1,20002,El gobierno debe crear un control estricto de ...,1,0,0
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0
3,20004,"‚Äî Yo soy respetuoso con los dem√°s, s√≥lamente l...",0,0,0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0
...,...,...,...,...,...
4464,24996,@miriaan_ac @Linaveso_2105 @HumildesSquad_ C√ÅL...,1,1,1
4465,24997,"@IvanDuque presidente en C√∫cuta , tenemos prob...",1,0,1
4466,24998,- Callat√© Visto Que Te Dejo En Putaüé§üé∂,0,0,0
4467,24999,-¬øporque los hombres se casan con las mujeres?...,1,0,0


In [73]:
dataTest = dataTest["text"].head(50)


In [74]:
dataTest

0     Easyjet quiere duplicar el n√∫mero de mujeres p...
1     El gobierno debe crear un control estricto de ...
2     Yo veo a mujeres destruidas por acoso laboral ...
3     ‚Äî Yo soy respetuoso con los dem√°s, s√≥lamente l...
4     Antonio Caballero y como ser de mal gusto e ig...
5     @Zurine3 Cu√°nto hijo de puta sobra en el mundo...
6     Pablo Casado deber√≠a estudiar un m√°ster en inm...
7     vos me haces eso te pego una patada en la cara...
8     #Noticiasdelalinea: Detienen en Guatapanal mot...
9             @MaivePerez Llora te lo.mereces por zorra
10    Padres indocumentados realizaron una huelga de...
11    No s√© porque hay personas que piensan que todo...
12    A Tomi Alegre JAKSKKSJSJ ‚Äî C√°llate q tdv estoy...
13    @mspn95 @SC_ESPN Callate gallina puta y cntala...
14    Albondigas! Casi al estilo de las fodongas per...
15    Todos: -#NoTodosLosHombres -PPK, presidente de...
16    El juez cita a declarar a la menor que denunci...
17    VAMOOOO LA PUTA MADRE SE LO ME

In [75]:
test = []
for twt in dataTest:
    test.append(editor.translate(twt, prnt=False))

In [76]:
test

["easyjet quiere duplicar el n√∫mero de mujeres piloto' ver√°s t√∫ para aparcar el avi√≥n.. http://t.co/46nulkm09x",
 'el gobierno debe crear un control estricto de inmigraci√≥n en las zonas fronterizas con colombia por q despu√©s del 20-8 querr√°n venir en masa',
 'Yo veo a mujeres destruidas por acoso laboral y callejero. Otras con depresi√≥n debido a violaci√≥n sexual o maltrato f√≠sico. Y conoc√≠ a varias que se suicidaron por este tipo de comportamientos machistas. Vas a seguir con tu show pobre? https://t.co/JE6MEgvIh4',
 ' yo soy respetuoso con los dem√°s, s√≥lamente les recuerdo lo que son. y si son escoria no es culpa m√≠a. ...  ¬°claro que lo s√©! ¬ø¬°por qui√©n me tomas!?',
 'antonio caballero y como ser de mal gusto e ignorante a la vez. claro que no es lo mismo acoso sexual y violaci√≥n pero hay varias conductas que componen la violencia sexual.',
 '@zurine3 cu√°nto hijo de puta, sobra en el mundo...',
 'Pablo Casado deber√≠a estudiar un m√°ster en inmigraci√≥n https://t.c

In [78]:
test2 = []
for twt in dataTest:
    test2.append(editor.replacement_loop(twt, verbose=False, chooser=chooser, n_tokens=(1, 2, 3), n_top=10))
test2



["easyjet quiere duplicar el n√∫mero de mujeres piloto' ver√°s t√∫ para aparcar el avi√≥n.. http://t.co/46nulkm09x",
 'el gobierno debe crear un control estricto de inmigraci√≥n en las zonas fronterizas con colombia por q despu√©s del 20-8querr√°n venir en masa',
 'Yo veo a mujeres destruidas por acoso laboral y callejero. Otras con depresi√≥n debido a violaci√≥n sexual o maltrato f√≠sico. Y conoc√≠ a varias que se suicidaron por este tipo de comportamientos machistas. Vas a seguir con tu show pobre? https://t.co/JE6MEgvIh4',
 ' yo soy respetuoso con los dem√°s, s√≥lamenme cuando les recuerdo lo que son. y si son escoria no es culpa m√≠a. ...  ¬°claro que lo s√©! ¬ø¬°por qui√©n me tomas!?',
 'antonio caballero y como ser de mal gusto e ignorante a la vez. claro que no es lo mismo acoso sexual y violaci√≥n pero hay varias conductas, muchas m√°s que componen la violencia sexual.',
 '@zurine3 cu√°nto hijo de puta, gente as√≠ sobra en el mundo...',
 'Pablo Casado deber√≠a estudiar un m√°st

In [79]:
print(editor.translate('@zurine3 cu√°nto hijo de puta, gente as√≠ sobra en el mundo', prnt=False))

@zurine3 cu√°nto hijo de puta, gente as√≠ sobra en el mundo


In [80]:
print(editor.replacement_loop('@zurine3 cu√°nto hijo de puta, gente as√≠ sobra en el mundo', verbose=True, chooser=chooser, n_tokens=(1, 2), n_top=10))

['ine'] -> ['ine']
@zurine3 cu√°nto hijo de puta, gente as√≠ sobra en el mundo
