In [1]:
import transformers
# Seed first, then quiet the library; the two calls do not depend on each other.
transformers.set_seed(42)
# Restrict transformers' own logging to ERROR-level messages for the rest of the notebook.
transformers.logging.set_verbosity_error()

In [2]:
import pickle
from pathlib import Path

import pandas as pd
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModelForSequenceClassification, AutoTokenizer

from russian_paraphrasers_inference import rp_gpt_paraphrase, caif_rp_gpt_paraphrase
from filter_paraphrases import filter_paraphrases, convert_results_to_df

In [20]:
# Load the tab-separated dev split (read_table uses '\t' as its default separator)
# and keep the 'toxic_comment' column as a plain Python list for the loops below.
dev_df = pd.read_table('data/input/dev.tsv')
toxic_inputs = dev_df.loc[:, 'toxic_comment'].tolist()

In [4]:
# Notebook-wide constants.
# Checkpoint identifiers passed to the from_pretrained() calls in later cells.
cls_checkpoint = 'BunnyNoBugs/rubert-tiny2-russe-toxicity'
lm_checkpoint = 'alenusch/rugpt3-paraphraser'
# Second '/'-separated component of the LM checkpoint id; embedded in result file names.
checkpoint_parts = lm_checkpoint.split('/')
model_name = checkpoint_parts[1]
# Number of tries per input; also embedded in result file names.
max_tries = 10

In [5]:
# Tokenizer and sequence classifier both come from cls_checkpoint; they are the
# filter_tokenizer / filter_cls_model arguments of filter_paraphrases below.
filter_tokenizer = AutoTokenizer.from_pretrained(cls_checkpoint)
# .cuda() moves the module to the GPU and returns the module itself, so loading
# and transfer are chained; the assignment also keeps the cell output empty.
filter_cls_model = AutoModelForSequenceClassification.from_pretrained(cls_checkpoint).cuda()

## Ordinary sampling

In [6]:
# Paraphraser LM and its tokenizer, both loaded from lm_checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(lm_checkpoint)
# .cuda() returns the module itself, so the GPU transfer is chained onto the load
# and the assignment keeps the cell output empty.
model = GPT2LMHeadModel.from_pretrained(lm_checkpoint).cuda()

In [7]:
# Single-sentence check of ordinary sampling before the full dev-set run.
# The call stays the last expression so its result is shown as the cell output.
single_run_options = dict(
    temperature=1,
    top_k=20,
    top_p=1.0,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=1,
)
rp_gpt_paraphrase('Иди нафиг.', model, tokenizer, **single_run_options)

'Иди от него подальше'

In [8]:
# Keyword arguments for rp_gpt_paraphrase in the dev-set loop: the loaded
# model/tokenizer plus the sampling settings. The per-sentence 'text' entry is
# supplied by the loop.
rp_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    temperature=1,
    top_k=20,
    top_p=1.0,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=1,
)

In [9]:
# Run filter_paraphrases with ordinary sampling on every dev sentence and
# collect one result per input, in input order.
para_results = []
for toxic_text in tqdm(toxic_inputs):
    # Build fresh kwargs per call instead of writing 'text' into the shared
    # rp_kwargs template, so no stale sentence is left behind in it after the
    # loop (or after an interrupted run).
    call_kwargs = {**rp_kwargs, 'text': toxic_text}
    para_result = filter_paraphrases(
        paraphrase_func=rp_gpt_paraphrase,
        paraphrase_kwargs=call_kwargs,
        filter_cls_model=filter_cls_model,
        filter_tokenizer=filter_tokenizer,
        # Use the notebook constant rather than a literal 10: the result file
        # names are built from max_tries, so the run must use the same value.
        max_tries=max_tries
    )
    para_results.append(para_result)

100%|██████████| 1/1 [00:02<00:00,  2.30s/it]


In [11]:
# Persist the ordinary-sampling run in two forms: a CSV built by
# convert_results_to_df and a pickle of the raw para_results list.
results_path = f'data/filter_paraphrases_results/{model_name}-max-{max_tries}'
# Create the target directory first; otherwise both writes fail on a checkout
# where it does not exist yet, after the generation loop has already finished.
Path(results_path).parent.mkdir(parents=True, exist_ok=True)
results_df = convert_results_to_df(para_results)
results_df.to_csv(f'{results_path}_dev.csv')
with open(f'{results_path}_dev.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [12]:
# Average of the 'num_tries' column over the ordinary-sampling results.
results_df.num_tries.mean()

4.0

## CAIF sampling

In [10]:
# Keyword arguments for caif_rp_gpt_paraphrase in the dev-set loop. CAIF gets
# checkpoint names (not loaded models) for both the LM and the classifier.
# NOTE(review): the meaning of alpha / target_label_id / entropy_threshold is
# defined in russian_paraphrasers_inference; confirm there before tuning.
caif_paraphrase_kwargs = dict(
    lm_model_name=lm_checkpoint,
    cls_model_name=cls_checkpoint,
    fp16=True,
    alpha=-5,
    target_label_id=1,
    entropy_threshold=0,
)

In [11]:
# Single-sentence check of CAIF sampling before the full dev-set run.
# The call stays the last expression so its result is shown as the cell output.
single_caif_options = dict(
    lm_model_name=lm_checkpoint,
    cls_model_name=cls_checkpoint,
    fp16=True,
    alpha=-5,
    target_label_id=1,
    entropy_threshold=0,
)
caif_rp_gpt_paraphrase('Иди нафиг.', **single_caif_options)

'Иди домой и в безопасности'

In [None]:
# Run filter_paraphrases with CAIF sampling on every dev sentence and collect
# one result per input, in input order.
para_results = []
for toxic_text in tqdm(toxic_inputs):
    # Build fresh kwargs per call instead of writing 'text' into the shared
    # caif_paraphrase_kwargs template, so no stale sentence is left behind in it
    # if this long-running loop is interrupted.
    call_kwargs = {**caif_paraphrase_kwargs, 'text': toxic_text}
    para_result = filter_paraphrases(
        paraphrase_func=caif_rp_gpt_paraphrase,
        paraphrase_kwargs=call_kwargs,
        filter_cls_model=filter_cls_model,
        filter_tokenizer=filter_tokenizer,
        # Use the notebook constant rather than a literal 10: the result file
        # names are built from max_tries, so the run must use the same value.
        max_tries=max_tries
    )
    para_results.append(para_result)

  8%|▊         | 65/800 [13:00<2:59:00, 14.61s/it]

In [None]:
# Persist the CAIF run in two forms: a CSV built by convert_results_to_df and a
# pickle of the raw para_results list.
results_path = f'data/filter_paraphrases_results/caif-{model_name}-max-{max_tries}'
# Create the target directory first; otherwise both writes fail on a checkout
# where it does not exist yet, after the multi-hour CAIF loop has finished.
Path(results_path).parent.mkdir(parents=True, exist_ok=True)
results_df = convert_results_to_df(para_results)
results_df.to_csv(f'{results_path}_dev.csv')
with open(f'{results_path}_dev.pickle', 'wb') as f:
    pickle.dump(para_results, f)

In [None]:
# Average of the 'num_tries' column over the CAIF results.
results_df.num_tries.mean()

In [None]:
# Export the 'best_para_text' column as UTF-8 text, one paraphrase per line.
# The "max-N" part of the file name comes from max_tries instead of a literal
# 10, so it cannot drift from the CSV/pickle names built from the same constant.
output_path = Path(f'data/output/caif-{model_name}-max-{max_tries}_dev.txt')
# Make sure the output directory exists before opening the file for writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in results_df['best_para_text'])