This notebook reproduces creation of CondRoBERTuito vocabulary in Google Colaboratory.

This notebook creates the files: `positive-words.txt`, `negative-words.txt`, `toxic_words.txt`, `token_toxicities.txt` and `word2coef.pkl`

# 0. Prerequisites

In [1]:
!pip install gensim pysentimiento tweet-preprocessor tensorboardX


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


## Setting up

In [3]:
from importlib import reload
import condbert
reload(condbert)

<module 'condbert' from '/media/gabriel/Datos Linux/loncos/Text-Detoxification-in-Spanish/style_transfer/condBERT/condbert.py'>

In [4]:
VOCAB_DIRNAME = 'vocab'

In [5]:
from condbert import CondBertRewriter
from choosers import EmbeddingSimilarityChooser
from multiword.masked_token_predictor_bert import MaskedTokenPredictorBert

  from .autonotebook import tqdm as notebook_tqdm


# 1. Loading BERT

In [6]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import pickle
import os
from tqdm.auto import tqdm, trange

In [7]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print("cuda")
else:
    device = torch.device('cpu')
    print("cpu")

if device.type == 'cuda':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'


cuda


In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [9]:
from transformers import RobertaForMaskedLM, RobertaTokenizer
model_name = "pysentimiento/robertuito-base-uncased" #Robertuito
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForMaskedLM.from_pretrained(model_name)



In [10]:
#model_name = 'dccuchile/bert-base-spanish-wwm-uncased' #BETO
#tokenizer = BertTokenizer.from_pretrained(model_name)
#model = BertForMaskedLM.from_pretrained(model_name)

In [11]:
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

# 2. Preparing the vocabularires.


- negative-words.txt
- positive-words.txt
- word2coef.pkl
- token_toxicities.txt

These files should be prepared once.

In [38]:
tox_corpus_path = '../../data/processed/toxicCorpusWSpellErr.txt'
norm_corpus_path = '../../data/processed/normalCorpusWSpellErr.txt'

In [39]:
if not os.path.exists(VOCAB_DIRNAME):
    os.makedirs(VOCAB_DIRNAME)

### 2.1 Preparing the DRG-like vocabularies

In [40]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer

custom_stop_words = stopwords.words('spanish')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
import os
import argparse
import numpy as np
from tqdm import tqdm
from nltk import ngrams
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

class NgramSalienceCalculator():
    def __init__(self, tox_corpus, norm_corpus, use_ngrams=False):
        ngrams = (1, 3) if use_ngrams else (1, 1)
        self.vectorizer = CountVectorizer(ngram_range=ngrams, stop_words=custom_stop_words)

        tox_count_matrix = self.vectorizer.fit_transform(tox_corpus)
        self.tox_vocab = self.vectorizer.vocabulary_
        self.tox_counts = np.sum(tox_count_matrix, axis=0)

        norm_count_matrix = self.vectorizer.fit_transform(norm_corpus)
        self.norm_vocab = self.vectorizer.vocabulary_
        self.norm_counts = np.sum(norm_count_matrix, axis=0)

    def salience(self, feature, attribute='tox', lmbda=0.5):
        assert attribute in ['tox', 'norm']
        if feature not in self.tox_vocab:
            tox_count = 0.0
        else:
            tox_count = self.tox_counts[0, self.tox_vocab[feature]]

        if feature not in self.norm_vocab:
            norm_count = 0.0
        else:
            norm_count = self.norm_counts[0, self.norm_vocab[feature]]

        if attribute == 'tox':
            return (tox_count + lmbda) / (norm_count + lmbda)
        else:
            return (norm_count + lmbda) / (tox_count + lmbda)


In [42]:
from collections import Counter
c = Counter()

for fn in [tox_corpus_path, norm_corpus_path]:
    with open(fn, 'r') as corpus:
        for line in corpus.readlines():
            for tok in line.strip().split():
                c[tok] += 1

In [43]:
vocab = {w for w, _ in c.most_common() if _ > 0 and  len(w)>2}  # if we took words with > 1 occurences, vocabulary would be x2 smaller, but we'll survive this size

In [44]:
with open(tox_corpus_path, 'r') as tox_corpus, open(norm_corpus_path, 'r') as norm_corpus:
    corpus_tox = [' '.join([w if w in vocab else '<unk>' for w in line.strip().split()]) for line in tox_corpus.readlines()]
    corpus_norm = [' '.join([w if w in vocab else '<unk>' for w in line.strip().split()]) for line in norm_corpus.readlines()]

In [45]:
neg_out_name = VOCAB_DIRNAME + '/negative-words.txt'
pos_out_name = VOCAB_DIRNAME + '/positive-words.txt'

In [46]:
threshold = 4

In [47]:
import pandas as pd

In [48]:
inmi = pd.read_csv('../../data/lexicons/immigrant_lexicon.txt', sep='\t', names=['token'])
ins = pd.read_csv('../../data/lexicons/insults_lexicon.txt', sep='\t', names=['token'])
miso = pd.read_csv('../../data/lexicons/misogyny_lexicon.txt', sep='\t', names=['token'])
xeno = pd.read_csv('../../data/lexicons/xenophobia_lexicon.txt', sep='\t', names=['token'])
que = pd.read_csv('../../data/lexicons/badwords_loncos.txt', sep='\t', names=['token'])

In [49]:
tokens_lexicon = pd.concat([inmi,ins, miso, xeno,que], axis=0)

In [50]:
tokens_lexicon=tokens_lexicon.reset_index(drop=True)

In [51]:
import spacy
nlp = spacy.load('es_core_news_sm')

def es_adjetivo(palabra):
    doc = nlp(palabra)
    for token in doc:
        if token.pos_ == 'ADJ':
            return True

    return False

In [52]:
sc = NgramSalienceCalculator(corpus_tox, corpus_norm, False)
seen_grams = set(tuple(row) for row in tokens_lexicon.itertuples(index=False))

with open(neg_out_name, 'w') as neg_out, open(pos_out_name, 'w') as pos_out:
    for gram in set(sc.tox_vocab.keys()).union(set(sc.norm_vocab.keys())):
        if gram not in seen_grams and es_adjetivo(gram):
            seen_grams.add(gram)
            toxic_salience = sc.salience(gram, attribute='tox')
            polite_salience = sc.salience(gram, attribute='norm')
            if toxic_salience > threshold:
                neg_out.writelines(f'{gram}\n')
            elif polite_salience > threshold:
                pos_out.writelines(f'{gram}\n')

In [53]:
with open(neg_out_name, 'a') as neg_out:
  for tok in tokens_lexicon["token"]:
    neg_out.writelines(f'{tok}\n')

## 2.2 Evaluating word toxicities with a logistic regression

In [54]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(CountVectorizer(stop_words=custom_stop_words), LogisticRegression(max_iter=1000))

In [55]:
X_train = corpus_tox + corpus_norm
y_train = [1] * len(corpus_tox) + [0] * len(corpus_norm)
pipe.fit(X_train, y_train);

In [56]:
coefs = pipe[1].coef_[0]
coefs.shape

(387168,)

In [57]:
word2coef = {w: coefs[idx] for w, idx in pipe[0].vocabulary_.items()}

In [58]:
for token in tokens_lexicon['token']:
    word2coef[token] = 5

In [59]:
import pickle
with open(VOCAB_DIRNAME + '/word2coef.pkl', 'wb') as f:
    pickle.dump(word2coef, f)

## 2.3 Labelling BERT tokens by toxicity

In [60]:
from collections import defaultdict
toxic_counter = defaultdict(lambda: 1)
nontoxic_counter = defaultdict(lambda: 1)

for text in tqdm(corpus_tox):
    for token in tokenizer.encode(text):
        toxic_counter[token] += 1
for text in tqdm(corpus_norm):
    for token in tokenizer.encode(text):
        nontoxic_counter[token] += 1

100%|██████████| 142907/142907 [00:14<00:00, 9649.51it/s] 
100%|██████████| 251944/251944 [00:51<00:00, 4870.19it/s]


In [61]:
token_toxicities = [toxic_counter[i] / (nontoxic_counter[i] + toxic_counter[i]) for i in range(len(tokenizer.vocab))]

In [62]:
with open(VOCAB_DIRNAME + '/token_toxicities.txt', 'w') as f:
    for t in token_toxicities:
        f.write(str(t))
        f.write('\n')

# 3. Setting up the model

### 3.1 Loading the vocabularies

In [63]:
with open(VOCAB_DIRNAME + "/negative-words.txt", "r") as f:
    s = f.readlines()
negative_words = list(map(lambda x: x[:-1], s))

with open(VOCAB_DIRNAME + "/positive-words.txt", "r") as f:
    s = f.readlines()
positive_words = list(map(lambda x: x[:-1], s))

In [64]:
import pickle
with open(VOCAB_DIRNAME + '/word2coef.pkl', 'rb') as f:
    word2coef = pickle.load(f)

In [65]:
token_toxicities = []
with open(VOCAB_DIRNAME + '/token_toxicities.txt', 'r') as f:
    for line in f.readlines():
        token_toxicities.append(float(line))
token_toxicities = np.array(token_toxicities)
token_toxicities = np.maximum(0, np.log(1/(1/token_toxicities-1)))   # log odds ratio

# discourage meaningless tokens
for tok in ['.', ',', '-']:
    token_toxicities[tokenizer.encode(tok)][1] = 3

for tok in ['you']:
    token_toxicities[tokenizer.encode(tok)][1] = 0

In [66]:
def adjust_logits(logits, label=0):
    return logits - token_toxicities * 100 * (1 - 2 * label)

predictor = MaskedTokenPredictorBert(model, tokenizer, max_len=250, device=device, label=0, contrast_penalty=0.0, logits_postprocessor=adjust_logits)

editor = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
    predictor=predictor,
)

The model below is used for reranking BERT hypotheses and helps to increase semantic similarity by choosing the hypotheses with  embeddings similar to the orignal words.

In [67]:
chooser = EmbeddingSimilarityChooser(sim_coef=10, tokenizer=tokenizer)

# 4. Finally, the inference

In [68]:
print(editor.translate('Oye por qué no molestas a tu puta madre?', prnt=False))

oye por qué no molestas a tu santa madre?


In [19]:
print(editor.replacement_loop('Oye por qué no molestas a tu puta madre?', verbose=True, chooser=chooser, n_tokens=(1, 2), n_top=10))

['tas'] -> ['tas']
['▁puta'] -> ['▁p', '***']
oye por qué no molestas a tu p*** madre?


Parameters that could be tuned:
* The coeffincient in `adjust_logits` - the larger it is, the more the model avoids toxic words
* The coefficient in `EmbeddingSimilarityChooser` - the larger it is, the more the model tries to preserve content
* n_tokens - how many words can be generated from one
* n_top - how many BERT hypotheses are reranked

In [20]:

print(editor.translate("! Jacintooo...!, !!!te voy a sacar la mierda...!!!", prnt=False))
print(editor.replacement_loop("! Jacintooo...!, !!!te voy a sacar la mierda...!!!", verbose=False, chooser=chooser, n_tokens=(1, 2), n_top=10))

! jacintooo...!, !!!te voy a sacar la basura...!!!
! jacintooo...!, !!!te voy a sacar la...!!!


In [21]:
print(editor.translate("@alan15537141 Callate que después me andas abrazando puta loca jajaja", prnt=False))
print(editor.replacement_loop("@alan15537141 Callate que después me andas abrazando puta loca jajaja", verbose=False, chooser=chooser, n_tokens=(1, 2), n_top=10))   

@alan15537141 callate que después me andas abrazando maldita loca jajaja
@alan15537141 callate que después me andas abrazando con ropa y todo jajaja


In [22]:
print(editor.translate("@Ricardo_Peralta Cállate puta creo que aún sigo ebria 😂", prnt=False))
print(editor.replacement_loop("@Ricardo_Peralta Cállate puta creo que aún sigo ebria 😂", verbose=True, chooser=chooser, n_tokens=(1, 2), n_top=10))   

@ricardo_peralta cállate puta, creo que aún sigo ebria 
['per'] -> ['per']
['▁puta'] -> ['▁y', '▁vete']
@ricardo_peralta cállate y vete creo que aún sigo ebria 


In [23]:
print(editor.translate("Eres un upakuna", prnt=False))
print(editor.replacement_loop("Eres un upakuna", verbose=False, chooser=chooser, n_tokens=(1, 2), n_top=10))   

eres un upakuna
eres un ca-pakuna


In [69]:
print(editor.translate("El huañuli ese qué va a poder cargar tantos sillares.", prnt=False))
print(editor.replacement_loop("El huañuli ese qué va a poder cargar tantos sillares.", verbose=False, chooser=chooser, n_tokens=(1, 2), n_top=10))   

el huichuli ese qué va a poder cargar tantos sillares.
el huévuli ese qué va a poder cargar tantos sillares.


# Evaluation

In [22]:
import pandas as pd

In [24]:
dataTest = pd.read_csv('../../data/processed/test.txt', sep='\t', header=None, names=['text'])

In [25]:
dataTest

Unnamed: 0,text
0,Dentro de poco vamos a tener malaria por todos...
1,@MilagrosMBarros @AgusQ_99 Jajajajajajaja call...
2,"Te subis a un bondi y esta todo cerrado, ABRAN..."
3,"La ""marcha negra"" acelera el paso: más de 40.0..."
4,@mariooovt Está pinche zorra cállate
5,El Estado del Bienestar para quién? Para inmig...
6,@lamambanegra17 @BreenDLP @ChumelTorres @Juand...
7,"@rubenssambueza eres una basura de persona, lo..."
8,Muérete zorra que yo no puedo ver Monstruos S....
9,"@ludmi526 Cállate zorra, me haces acordar a nu..."


In [26]:
test = []
for twt in dataTest['text']:
    test.append(editor.translate(twt, prnt=False))
test

['dentro de poco vamos a tener malaria por todos estos negros sudacas de africa sudacas de áfricaaaaaaaaaaaaaaa',
 '@milagrosmbarros @agusq_99 jajajajajajaja callate que ni labial tenias antes , yo si',
 'te subis a un bondi y esta todo cerrado, abran una ventana manga de bichos url',
 'la "marcha blanca" acelera el paso: más de 40.000 subsaharianos esperan en marruecos para entrar ilegalmente en españa https://t.co/0mhgjnwjoz  ¡¡ okupa monclovita,...¿ya tienes pensado donde cobijar a toda esta gente...?prepara barracas en moncloa, creo que hay terreno...!!',
 '@mariooovt está muy loca cállate',
 'el estado del bienestar para quién? para migrantes (mantero ,violadores y demás chusma que estáis trayendo) pagas y ss que tenemos que pagar los demás ?  para cuándo el bienestar de los españoles, seguridad en el trabajo de policia y guardia civil, trabajo y ayuda xa parados..? https://t.co/yu4cczmpga',
 '@lamambanegra17 @breendlp @chumeltorres @juandedios_p @kimberlyloaiza_ pinches estupida,

In [27]:
test2 = []
for twt in dataTest['text']:
    test2.append(editor.replacement_loop(twt, verbose=False, chooser=chooser, n_tokens=(1, 2, 3), n_top=10))
test2



['dentro de poco vamos a tener malaria por todos estos negros sudacas europeos de africa sudacas sudacas de áfricaaaaaaaaaaaaaaa',
 '@milagrosmbarros @agusq_99 jajajajajajaja callate que ni labial tenias falsa , yo si',
 'te subis a un bondi y esta todo cerrado, abran una ventana manga de bichos si quieren',
 'la "marcha blanca" acelera el paso: más de 40.000 subsaharianos esperan en marruecos para entrar ilegalmente en españa https://t.co/0mhgjnwjoz  ¡¡ okupa monclovita,...¿ya tienes pensado donde cobijar a toda esta gente...?prepara barracas en moncloa, creo que hay terreno...!!',
 '@mariooovt está haciendo todo bien falsa cállate',
 'el estado del bienestar para quién? para ilegales (mantero ,violadores y demás chusma que estáis trayendo) pagas y ss que tenemos que pagar los demás ?  para cuándo el bienestar de los españoles, seguridad en el trabajo de policia y guardia civil, trabajo y ayuda xa parados..? https://t.co/yu4cczmpga',
 '@lamambanegra17 @breendlp @chumeltorres @juandedi

In [28]:
with open('../../data/processed/input1.txt', 'w') as input1, open('../../data/processed/input2.txt', 'w') as input2:
    for twt in test:
        input1.writelines(f'{twt}\n')
    for twt in test2:
        input2.writelines(f'{twt}\n')

In [31]:
%cd ../..

/media/gabriel/Datos Linux/loncos/Text-Detoxification-in-Spanish


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [32]:

!python metric/metric.py --inputs data/processed/input1.txt --preds data/processed/output.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Calculating style of predictions
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializin

In [33]:
!python metric/metric.py --inputs data/processed/input2.txt --preds data/processed/output.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Calculating style of predictions
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializin