In [1]:
import transformers
transformers.logging.set_verbosity_error()
transformers.set_seed(42)

In [2]:
import pandas as pd
from tqdm import tqdm
from transformers import (GPT2LMHeadModel, GPT2Tokenizer, AutoModelForSequenceClassification, MT5Tokenizer, MT5ForConditionalGeneration,
                          AutoTokenizer)
from russian_paraphrasers_inference import rp_gpt_paraphrase, caif_rp_gpt_paraphrase, rp_mt5_paraphrase, caif_rp_mt5_paraphrase
from sentence_transformers import SentenceTransformer
from rank_paraphrases import rank_paraphrases
import pickle

In [3]:
dev_df = pd.read_csv('data/input/dev.tsv', sep='\t')
toxic_inputs = dev_df['toxic_comment'].tolist()

In [6]:
#constants
lm_checkpoint = 'alenusch/mt5large-ruparaphraser'
model_name = lm_checkpoint.split('/')[1]
cls_checkpoint = 'BunnyNoBugs/rubert-tiny2-russe-toxicity'

In [33]:
style_cls_model = AutoModelForSequenceClassification.from_pretrained(cls_checkpoint)
style_tokenizer = AutoTokenizer.from_pretrained(cls_checkpoint)
style_cls_model.cuda();

In [34]:
sim_model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')

## Ordinary sampling

In [12]:
model = GPT2LMHeadModel.from_pretrained(lm_checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(lm_checkpoint)
model.cuda();

In [22]:
rp_gpt_paraphrase(
    'блядь скотина сдохни нахуй',
    model,
    tokenizer,
    temperature=1,
    top_k=20,
    top_p=1.0,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=1
)

'сдохни нахуй отродье'

In [8]:
rp_kwargs = {
    'model': model,
    'tokenizer': tokenizer,
    'temperature': 1,
    'top_k': 20,
    'top_p': 1.0,
    'repetition_penalty': 1.0,
    'do_sample': True,
    'num_return_sequences': 1
}

In [9]:
para_results = []
for i in tqdm(toxic_inputs):
    rp_kwargs['text'] = i
    para_result = filter_paraphrases(
        paraphrase_func=rp_gpt_paraphrase,
        paraphrase_kwargs=rp_kwargs,
        filter_cls_model=filter_cls_model,
        filter_tokenizer=filter_tokenizer,
        max_tries=max_tries
    )
    para_results.append(para_result)

100%|██████████| 1/1 [00:02<00:00,  2.30s/it]


In [11]:
results_path = f'data/filter_paraphrases_results/{model_name}-max-{max_tries}'
results_df = convert_results_to_df(para_results)
results_df.to_csv(f'{results_path}_dev.csv')
with open(f'{results_path}_dev.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [12]:
results_df['num_tries'].mean()

4.0

In [None]:
with open(f'data/output/{model_name}-max-10_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in results_df['best_para_text']])

## CAIF sampling

In [39]:
%%time

caif_rp_gpt_paraphrase(
    'блядь скотина сдохни нахуй',
    lm_model_name=lm_checkpoint,
    cls_model_name=cls_checkpoint,
    fp16=True,
    alpha=5,
    target_label_id=0,
    entropy_threshold=0,
)

Wall time: 2.83 s


'бжиииииииииииите нааааас пришли наааас придавите нас'

In [9]:
%%time

caif_rp_gpt_paraphrase(
    'Иди нафиг.',
    lm_model_name=lm_checkpoint,
    cls_model_name=cls_checkpoint,
    fp16=True,
    alpha=-5,
    target_label_id=1,
    entropy_threshold=0,
    num_samples=10
)

RuntimeError: The size of tensor a (100) must match the size of tensor b (10) at non-singleton dimension 1

In [40]:
para_results = []
for i in tqdm(toxic_inputs):
    para_result = caif_rp_gpt_paraphrase(
        i,
        lm_model_name=lm_checkpoint,
        cls_model_name=cls_checkpoint,
        fp16=True,
        alpha=-5,
        target_label_id=1,
        entropy_threshold=0.5,
    )
    para_results.append(para_result)

100%|██████████| 800/800 [40:50<00:00,  3.06s/it]


In [41]:
with open(f'data/output/new-caif-alpha--5-entropy-0,5-{model_name}_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in para_results])

### Filter candidates

In [9]:
%%time

caif_rp_mt5_paraphrase(
    'Пошел нахуй',
    lm_checkpoint,
    cls_checkpoint,
    fp16=True,
    alpha=-5,
    target_label_id=1,
    entropy_threshold=0,
    encoder_no_repeat_ngram_size=None,
    num_samples=10,
    act_type='sigmoid'
)

Wall time: 5.93 s


['Ты уже пошел на покой',
 'Пошли к черту черт',
 'Ты пошёл на х',
 'Пошел пошел на этот аукцион',
 'Пошел на мели и ну',
 'Пошел на улицу мать твою',
 'Я имею в виду пошел на х',
 'Слушай пошел на увольнение',
 'Пошел черт возьми туда',
 'Слушайте мы просто пошли на улицу']

In [39]:
para_results = []
best_candidates = []

for i in tqdm(toxic_inputs):
    candidates = caif_rp_mt5_paraphrase(
        i,
        lm_checkpoint,
        cls_checkpoint,
        fp16=True,
        alpha=-5,
        target_label_id=1,
        entropy_threshold=0,
        encoder_no_repeat_ngram_size=None,
        num_samples=10
    )
    ranked_candidates = rank_paraphrases(
        candidates,
        i,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.99
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])
    break

  0%|          | 0/800 [00:07<?, ?it/s]


In [42]:
model_name

'mt5small-ruparaphraser'

In [None]:
with open(f'data/rank_candidates_results/caif-{model_name}-10-samples.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [None]:
with open(f'data/output/caif-{model_name}-10-samples_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence + '\n' for sentence in best_candidates])