# Requirements

In [1]:
import os
import sys


def add_sys_path(p):
    p = os.path.abspath(p)
    print(p)
    if p not in sys.path:
        sys.path.append(p)

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
from importlib import reload

In [4]:
import condbert
reload(condbert)
from condbert import CondBertRewriter

In [5]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import pickle
from tqdm.auto import tqdm, trange

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
device = torch.device('cuda:0')

### Load the model

In [7]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)



In [8]:
model = BertForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model.to(device);

#### Load vocabularies for spans detection

In [10]:
vocab_root = 'vocab_condbert/'

In [11]:
with open(vocab_root + "negative-words.txt", "r") as f:
    s = f.readlines()
negative_words = list(map(lambda x: x[:-1], s))
with open(vocab_root + "toxic_words.txt", "r") as f:
    ss = f.readlines()
negative_words += list(map(lambda x: x[:-1], ss))

with open(vocab_root + "positive-words.txt", "r") as f:
    s = f.readlines()
positive_words = list(map(lambda x: x[:-1], s))

In [12]:
import pickle
with open(vocab_root + 'word2coef.pkl', 'rb') as f:
    word2coef = pickle.load(f)

In [13]:
token_toxicities = []
with open(vocab_root + 'token_toxicities.txt', 'r') as f:
    for line in f.readlines():
        token_toxicities.append(float(line))
token_toxicities = np.array(token_toxicities)
token_toxicities = np.maximum(0, np.log(1/(1/token_toxicities-1)))   # log odds ratio

# discourage meaningless tokens
for tok in ['.', ',', '-']:
    token_toxicities[tokenizer.encode(tok)][1] = 3

for tok in ['you']:
    token_toxicities[tokenizer.encode(tok)][1] = 0

### Applying the model

In [14]:
reload(condbert)
from condbert import CondBertRewriter

editor = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
)

In [15]:
print(editor.translate("! Jacintooo...!, !!!te voy a sacar la mierda...!!!", prnt=False))

! jacintooo . . . ! , ! ! ! te voy a sacar la mi onda . . . ! ! !


### Multiunit

In [16]:
editor = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
    predictor=None,
)

In [17]:
from multiword import masked_token_predictor_bert
reload(masked_token_predictor_bert)
from multiword.masked_token_predictor_bert import MaskedTokenPredictorBert

In [18]:
predictor = MaskedTokenPredictorBert(model, tokenizer, max_len=250, device=device, label=0, contrast_penalty=0.0)
editor.predictor = predictor

def adjust_logits(logits, label):
    return logits - editor.token_toxicities * 3

predictor.logits_postprocessor = adjust_logits

print(editor.replacement_loop('! Jacintooo...!, !!!te voy a sacar la mierda...!!!', verbose=False))

! jacintooo . . . ! , ! ! ! te voy a sacar la mi . . . ! ! !


In [19]:
%%time
print(editor.replacement_loop('! Jacintooo...!, !!!te voy a sacar la mierda...!!!', verbose=False, n_units=1))

! jacintooo . . . ! , ! ! ! te voy a sacar la mi . . . ! ! !
CPU times: user 404 ms, sys: 314 μs, total: 404 ms
Wall time: 404 ms


In [20]:
%%time
print(editor.replacement_loop('! Jacintooo...!, !!!te voy a sacar la mierda...!!!', verbose=False, n_units=3))

! jacintooo . . . ! , ! ! ! te voy a sacar la mi . . . ! ! !
CPU times: user 3.85 s, sys: 328 ms, total: 4.18 s
Wall time: 4.18 s


In [21]:
%%time
print(editor.replacement_loop('! Jacintooo...!, !!!te voy a sacar la mierda...!!!', verbose=False, n_units=10))

! jacintooo . . . ! , ! ! ! te voy a sacar la mi . . . ! ! !
CPU times: user 4.07 s, sys: 328 ms, total: 4.4 s
Wall time: 4.4 s


In [22]:
import choosers
reload(choosers)
from choosers import EmbeddingSimilarityChooser

# Reproduction

In [23]:
import pandas as pd
dataTest = pd.read_csv('../../data/processed/test.txt', sep='\t', header=None, names=['text'])

In [24]:
test = []
for twt in dataTest['text']:
    test.append(editor.translate(twt, prnt=False))
test

['.o de poco vamos a tener malaria por todos estos congos sudacas de africa sudacas de africaaaaaaaaaaaaaaa',
 '@ milagrosmbarros @ agusq _ 99 jajajajajajaja callate que ni labial tenias zorra , no [UNK]',
 'te subis a un bondi y esta todo cerrado , abran una venta manga de bichos refugiados',
 'la " marcha negra " acelera el paso : mas de 40 . 000 subsaharianos esperan en marruecos para entrar ilegalmente en espana https : / / t . co / 0mhgjnwjoz ¡ ¡ okupa monclovita , . . . ¿ ¿ tienes pensado donde cobijar a toda esta gente . . . ? prepara barracas en moncloa , creo que hay terreno . . . ! !',
 '@ mariooovt esta pepe zorra callate',
 'el estado del bienestar para quien ? para inmigrantes ( mantero , violadores y demas chusma que estais trayendo ) pagas y se que tenemos que pagar los demas ? para cuando el bienestar de los espanoles , seguridad en el trabajo de policia y guardia civil , trabajo y ayuda xa parados . . ? https : / / t . co / yu4cczmpga',
 '@ lamambanegra17 @ breendlp @ 

In [25]:
test2 = []
for twt in dataTest['text']:
    test2.append(editor.replacement_loop(twt, verbose=False))
test2

['el diario de poco vamos a tener malaria por todos estos negro sudacas de africa sudacas de africaaaaaaaaaaaaaaa',
 '@ milagrosmbarros @ agusq _ 99 jajajajajajaja callate que ni labial tenias zorra , yo [UNK]',
 'te subis a un bondi y esta todo cerrado , abran una semana manga de bichos refugiados',
 'la " marcha negra " acelera el paso : mas de 40 . 000 subsaharianos esperan en el para entrar ilegalmente en espana https : / / t . co / 0mhgjnwjoz ¡ ¡ okupa monclovita , . . . ¿ que no tienes pensado donde cobijar a toda esta gente . . . ? prepara barracas en moncloa , creo que hay terreno . . . ! !',
 '@ mariooovt esta ? @ @ zorra callate',
 'el estado del bienestar para quien ? para inmigrantes ( mantero , violadores y demas chusma que estais trayendo ) pagas y espasa que tenemos que pagar los demas ? para cuando el bienestar de los espanoles , seguridad en el trabajo de policia y guardia civil , trabajo y ayuda xa parados . . ? https : / / t . co / yu4cczmpga',
 '@ lamambanegra17 @ b

In [26]:
with open('../../data/processed/inputcond1.txt', 'w') as input1, open('../../data/processed/inputcond2.txt', 'w') as input2:
    for twt in test:
        input1.writelines(f'{twt}\n')
    for twt in test2:
        input2.writelines(f'{twt}\n')

In [27]:
%cd ../..

/media/gabriel/Datos Linux/loncos/Text-Detoxification-in-Spanish


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [28]:

!python metric/metric.py --inputs data/processed/inputcond1.txt --preds data/processed/output.txt

Calculating style of predictions
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█████████████████████████████████████████████| 2/2 [00:02<00:00,  1.49s/it]
Calculating BLEU similarity
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates t

In [29]:
!python metric/metric.py --inputs data/processed/inputcond2.txt --preds data/processed/output.txt

Calculating style of predictions
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█████████████████████████████████████████████| 2/2 [00:03<00:00,  1.64s/it]
Calculating BLEU similarity
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates t

In [23]:
print(editor.translate("Con ese cuerpo es toda una marraqueta", prnt=False))
print(editor.replacement_loop("Con ese cuerpo es toda una marraqueta", verbose=True, n_tokens=(1, 2), n_top=10))   

con ese cuerpo es toda una marraqueta
['es'] -> ['es']
['mar', '##ra', '##quet', '##a'] -> ['vida', '.']
con ese cuerpo es toda una vida .


In [24]:
print(editor.translate("Esa chica ya se tropezó dos veces desde que entró, es una petaca", prnt=False))
print(editor.replacement_loop("Esa chica ya se tropezó dos veces desde que entró, es una petaca", verbose=False, n_tokens=(1, 2), n_top=10))  

esa chica . se tropezo dos veces desde que entro , es una petaca
esa chica que no se tropezo dos veces desde que entro , es una petaca


In [None]:
print(editor.translate("El huañuli ese qué va a poder cargar tantos sillares.", prnt=False))
print(editor.replacement_loop("El huañuli ese qué va a poder cargar tantos sillares.", verbose=False, n_tokens=(1, 2), n_top=10))  

el huanuli es era que va a poder cargar tantos sillares .
el huanuli es que va a poder cargar tantos sillares .
