In [1]:
import transformers

transformers.logging.set_verbosity_error()
transformers.set_seed(42)

In [2]:
import pandas as pd
from tqdm import tqdm
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoModelForSequenceClassification, AutoTokenizer
from t5_paraphraser_inference import t5_paraphrase, caif_t5_paraphrase
from rank_paraphrases import rank_paraphrases, convert_results_to_df
from sentence_transformers import SentenceTransformer
import pickle

In [9]:
dev_df = pd.read_csv('data/input/dev.tsv', sep='\t')
toxic_inputs = dev_df['toxic_comment'].tolist()

In [6]:
with open('data/toxic_vocab_extended.txt') as f:
    bad_words = f.readlines()
    bad_words = [word.strip() for word in bad_words]
    bad_words += [word.capitalize() for word in bad_words]

In [4]:
#constants
lm_checkpoint = 'cointegrated/rut5-base-paraphraser'
model_name = lm_checkpoint.split('/')[1]
cls_checkpoint = 'BunnyNoBugs/rubert-tiny2-russe-toxicity'
num_samples = 10

In [5]:
model = T5ForConditionalGeneration.from_pretrained(lm_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(lm_checkpoint)
model.cuda();

In [6]:
style_cls_model = AutoModelForSequenceClassification.from_pretrained(cls_checkpoint)
style_tokenizer = AutoTokenizer.from_pretrained(cls_checkpoint)
style_cls_model.cuda();

In [7]:
sim_model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')

In [7]:
bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

KeyboardInterrupt: 

In [10]:
origin = 'Пошел нахуй скотина тупая'
para_results = rut5_paraphrase(
    origin,
    model,
    tokenizer,
    beams=1,
    grams=4,
    do_sample=True,
    top_k=20,
    num_samples=10
)
rank_paraphrases(
    para_results,
    origin,
    style_cls_model,
    style_tokenizer,
    sim_model
)

NameError: name 'rut5_paraphrase' is not defined

In [29]:
%%time

origin = 'далбоебы путиных в запасе лет на 200'
para_results = caif_rut5_paraphrase(
    origin,
    lm_checkpoint,
    cls_checkpoint,
    fp16=False,
    alpha=-5,
    target_label_id=1,
    entropy_threshold=0,
    encoder_no_repeat_ngram_size=None,
    num_samples=10
)
rank_paraphrases(
    para_results,
    origin,
    style_cls_model,
    style_tokenizer,
    sim_model
)

Wall time: 8.77 s


{'best_candidate': (0.9115038514137268,
  0.9758521,
  'Далбоепа путиных в запасе год на 200'),
 'ranked_candidates': [(0.9115038514137268,
   0.9758521,
   'Далбоепа путиных в запасе год на 200'),
  (0.8755508065223694, 0.9957093, 'Дедбоепа путиных ушла на 200 лет на 200'),
  (0.8129284977912903, 0.9687927, 'В запасе далбоепов Путиных 200 лет'),
  (0.7860055565834045,
   0.9805802,
   'Несколько лет далбоесов в запасе Путина до 200 лет'),
  (0.7806413769721985, 0.9931212, 'Доклад далбоеby путиных упал на 200 лет'),
  (0.5562983751296997,
   0.9938664,
   'Данбоепа Верницкого умерла от болевых больных до 20 тысяч лет'),
  (0.5027222633361816,
   0.9903492,
   'На два года долголетие далбоеновых путиных'),
  (0.44007033109664917, 0.98821205, 'Данбоепы путиных долго вернутся в штат'),
  (0.40935221314430237, 0.9665256, 'Даллбоебой путиных в запасе лето'),
  (0.3596664369106293,
   0.9945479,
   'Далбоепы Владимира Путина будут хранить на долгую работу')]}

In [33]:
% % time

paraphrase(
    text=['Ты, бля, не мороси, служил, али как?	Ты служил, али как?', 'Залупа пенис член'],
    beams=5,
    grams=4,
    do_sample=False,
    bad_words_ids=bad_words_ids
)

Wall time: 1min 9s


['Ты, черт возьми, не морос, служила, а? Ты служила, а? Ты служила, а?',
 'Залупа члена пениса']

## Ordinary sampling

### No filtering

In [None]:
results = []
problematic_batches = []  #if something goes wrong you can track such batches
batch_size = 32

for i in tqdm(range(0, len(toxic_inputs), batch_size)):
    batch = toxic_inputs[i:i + batch_size]
    try:
        results.extend(paraphrase(
            batch,
            beams=5,
            grams=4,
            bad_words_ids=None
        )
        )
    except Exception as e:
        print(i)
        problematic_batches.append(toxic_inputs[i:i + batch_size])

  4%|▍         | 1/25 [48:25<19:22:00, 2905.04s/it]

In [None]:
with open(f'{model_name}-bad-words_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in results])

### Filter candidates

In [23]:
filter_cls_model = AutoModelForSequenceClassification.from_pretrained(cls_checkpoint)
filter_tokenizer = AutoTokenizer.from_pretrained(cls_checkpoint)
filter_cls_model.cuda();

In [90]:
rut5_kwargs = {
    'beams': 1,
    'grams': 0,
    'bad_words_ids': None,
    'do_sample': True,
    'top_k': 20
}

In [97]:
para_results = []

for i in tqdm(toxic_inputs):
    rut5_kwargs['text'] = [i]
    para_result = filter_paraphrases(
        paraphrase_func=paraphrase,
        paraphrase_kwargs=rut5_kwargs,
        filter_cls_model=filter_cls_model,
        filter_tokenizer=filter_tokenizer,
        max_tries=max_tries
    )
    para_results.append(para_result)

100%|██████████| 800/800 [19:02<00:00,  1.43s/it]


In [98]:
results_path = f'data/filter_paraphrases_results/{model_name}-max-{max_tries}'
results_df = convert_results_to_df(para_results)
results_df.to_csv(f'{results_path}_dev.csv', index=False)
with open(f'{results_path}_dev.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [99]:
results_df['num_tries'].mean()

3.39

In [100]:
with open(f'data/output/{model_name}-max-10_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in results_df['best_para_text']])

## CAIF sampling

In [10]:
para_results = []

for i in tqdm(toxic_inputs):
    para_result = caif_rut5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=False,
        alpha=0,
        target_label_id=1,
        entropy_threshold=0,
        encoder_no_repeat_ngram_size=0
    )
    para_results.append(para_result)

100%|██████████| 800/800 [18:22<00:00,  1.38s/it]


In [11]:
with open(f'data/output/caif-alpha-0-{model_name}_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in para_results])

### Rank candidates

In [12]:
para_results = []
best_candidates = []

for i in tqdm(toxic_inputs):
    candidates = caif_t5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=False,
        alpha=-2,
        target_label_id=1,
        entropy_threshold=0,
        encoder_no_repeat_ngram_size=None,
        num_samples=10
    )
    ranked_candidates = rank_paraphrases(
        candidates,
        i,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.99
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])

100%|██████████| 800/800 [3:34:09<00:00, 16.06s/it]  


In [13]:
with open(f'data/rank_candidates_results/caif-alpha--2-{model_name}-10-samples.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [14]:
with open(f'data/output/caif-alpha--2-{model_name}-10-samples_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in best_candidates])

In [15]:
para_results = []
best_candidates = []

for i in tqdm(toxic_inputs):
    candidates = caif_t5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=False,
        alpha=-1,
        target_label_id=1,
        entropy_threshold=0,
        encoder_no_repeat_ngram_size=None,
        num_samples=10
    )
    ranked_candidates = rank_paraphrases(
        candidates,
        i,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.99
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])

100%|██████████| 800/800 [3:22:48<00:00, 15.21s/it]  


In [16]:
with open(f'data/rank_candidates_results/caif-alpha--1-{model_name}-10-samples.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [17]:
with open(f'data/output/caif-alpha--1-{model_name}-10-samples_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in best_candidates])

In [11]:
para_results = []
best_candidates = []

for i in tqdm(toxic_inputs):
    candidates = caif_t5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=False,
        alpha=-5,
        target_label_id=1,
        entropy_threshold=0.5,
        encoder_no_repeat_ngram_size=None,
        num_samples=10
    )
    ranked_candidates = rank_paraphrases(
        candidates,
        i,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.99
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])

100%|██████████| 800/800 [3:08:48<00:00, 14.16s/it]  


In [12]:
with open(f'data/rank_candidates_results/caif-entropy-0,5-{model_name}-10-samples.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [13]:
with open(f'data/output/caif-entropy-0,5-{model_name}-10-samples_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in best_candidates])

In [14]:
para_results = []
best_candidates = []

for i in tqdm(toxic_inputs):
    candidates = caif_t5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=False,
        alpha=-5,
        target_label_id=1,
        entropy_threshold=1.5,
        encoder_no_repeat_ngram_size=None,
        num_samples=10
    )
    ranked_candidates = rank_paraphrases(
        candidates,
        i,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.99
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])

100%|██████████| 800/800 [2:33:15<00:00, 11.49s/it]  


In [15]:
with open(f'data/rank_candidates_results/caif-entropy-1,5-{model_name}-10-samples.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [16]:
with open(f'data/output/caif-entropy-1,5-{model_name}-10-samples_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in best_candidates])

In [17]:
para_results = []
best_candidates = []

for i in tqdm(toxic_inputs):
    candidates = caif_t5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=False,
        alpha=-5,
        target_label_id=1,
        entropy_threshold=3.2,
        encoder_no_repeat_ngram_size=None,
        num_samples=10
    )
    ranked_candidates = rank_paraphrases(
        candidates,
        i,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.99
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])

100%|██████████| 800/800 [1:48:21<00:00,  8.13s/it]  


In [18]:
with open(f'data/rank_candidates_results/caif-entropy-3,2-{model_name}-10-samples.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [19]:
with open(f'data/output/caif-entropy-3,2-{model_name}-10-samples_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in best_candidates])

In [20]:
para_results = []
best_candidates = []

for i in tqdm(toxic_inputs):
    candidates = caif_t5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=False,
        alpha=-5,
        target_label_id=1,
        entropy_threshold=5.0,
        encoder_no_repeat_ngram_size=None,
        num_samples=10
    )
    ranked_candidates = rank_paraphrases(
        candidates,
        i,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.99
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])

100%|██████████| 800/800 [1:17:34<00:00,  5.82s/it]


In [21]:
with open(f'data/rank_candidates_results/caif-entropy-5,0-{model_name}-10-samples.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [22]:
with open(f'data/output/caif-entropy-5,0-{model_name}-10-samples_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in best_candidates])

In [25]:
para_results = []
best_candidates = []

for i in tqdm(toxic_inputs):
    candidates = caif_t5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=False,
        alpha=-5,
        target_label_id=1,
        entropy_threshold=0,
        encoder_no_repeat_ngram_size=None,
        num_samples=10
    )
    ranked_candidates = rank_paraphrases(
        candidates,
        i,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.99
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])

100%|██████████| 875/875 [4:22:15<00:00, 17.98s/it]  


In [27]:
with open(f'data/rank_candidates_results/caif-{model_name}-10-samples_test.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [28]:
with open(f'data/output/caif-{model_name}-10-samples_test.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in best_candidates])