In [1]:
import pickle
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from rank_paraphrases import rank_paraphrases

In [2]:
# Constants
# Identifier of the paraphraser checkpoint; the part after the '/' is reused
# below as `model_name` when building result/output file names.
lm_checkpoint = "alenusch/rugpt3-paraphraser"
# Checkpoint the style classifier and its tokenizer are loaded from.
cls_checkpoint = "BunnyNoBugs/rubert-tiny2-russe-toxicity"
# Second '/'-separated component of the checkpoint id -> 'rugpt3-paraphraser'.
model_name = lm_checkpoint.split("/")[1]

In [3]:
# Read the tab-separated dev split and keep the `toxic_comment` column as a
# plain Python list (one source sentence per row).
dev_df = pd.read_csv('data/input/dev.tsv', delimiter='\t')
toxic_inputs = dev_df.loc[:, 'toxic_comment'].to_list()

In [4]:
# Style classifier and its tokenizer, both loaded from `cls_checkpoint`.
# NOTE(review): the model is moved to the GPU unconditionally, so this cell
# needs a CUDA device; there is no CPU fallback.
style_tokenizer = AutoTokenizer.from_pretrained(cls_checkpoint)
style_cls_model = AutoModelForSequenceClassification.from_pretrained(cls_checkpoint)
style_cls_model = style_cls_model.cuda()

In [5]:
# Sentence-embedding model handed to rank_paraphrases as its similarity model.
sim_model = SentenceTransformer(model_name_or_path='paraphrase-xlm-r-multilingual-v1')

In [6]:
# Load the previously stored candidate samples.
# NOTE(review): unpickling executes arbitrary code embedded in the file; only
# load pickles produced by this project.
with open('data/rank_candidates_results/caif-alpha-0-rugpt3-paraphraser-10-samples.pickle', 'rb') as samples_file:
    samples = pickle.Unpickler(samples_file).load()

In [7]:
# For every input, keep only element 2 of each stored candidate record.
# NOTE(review): presumably index 2 is the candidate text (the same index is
# used on 'best_candidate' further down) — confirm against the pickle schema.
new_samples = [
    [record[2] for record in sample_group]
    for sample_group in samples
]

In [10]:
# Re-rank the candidate paraphrases of every toxic input and collect
#   * para_results    - the 'ranked_candidates' entry returned for each input
#   * best_candidates - element 2 of the 'best_candidate' entry for each input
# NOTE(review): rank_paraphrases is project-local; the result keys and the
# meaning of index 2 (presumably the candidate text, since it is later written
# out as a line of text) should be confirmed against rank_paraphrases.py.

# zip() stops at the shorter sequence without any error, which would silently
# produce fewer results than there are inputs. Fail loudly instead.
if len(toxic_inputs) != len(new_samples):
    raise ValueError(
        f'Length mismatch: {len(toxic_inputs)} toxic inputs vs '
        f'{len(new_samples)} candidate lists'
    )

para_results = []
best_candidates = []

for toxic_input, candidates in tqdm(
    zip(toxic_inputs, new_samples),
    total=len(toxic_inputs),  # zip() has no len(); without it tqdm shows no bar/ETA
):
    ranked_candidates = rank_paraphrases(
        candidates,
        toxic_input,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.8
    )
    para_results.append(ranked_candidates['ranked_candidates'])
    best_candidates.append(ranked_candidates['best_candidate'][2])

0it [00:00, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
800it [00:29, 27.11it/s]


In [43]:
# Persist the ranked candidate lists for all inputs.
# NOTE(review): with model_name == 'rugpt3-paraphraser' this path is the very
# file the `samples` loading cell reads, so running this cell overwrites that
# input with the new ranking results — confirm this is intended.
with open(f'data/rank_candidates_results/caif-alpha-0-{model_name}-10-samples.pickle', 'wb') as results_file:
    pickle.Pickler(results_file).dump(para_results)

In [13]:
# Write the selected paraphrases to a UTF-8 text file, one per line, in input order.
with open(f'data/output/caif-alpha-0-{model_name}-10-samples-threshold-0,8_dev.txt', 'w', encoding='utf-8') as out_file:
    # Build the full text first, then emit it with a single write.
    out_file.write(''.join(sentence + '\n' for sentence in best_candidates))

['температура плавления какой стали хватит чтобы расплавить ее кх',
 'ты был там чмом служил ты служил во благо',
 'В каком месте вы кормитесь?',
 'этому сайту я давно не доверяю, пишут разную ерунду',
 'меня сегодня чуть не достали в аське вот такой звонок',
 'это ж место где у неё в россии,Путинская область?Неудовлетворенная еЄдина российская писательница,поэт и блогерша?',
 'Вы знаете что курите',
 'наказать и найти',
 'да надо врачей на место ставить этих врачей',
 '–∫—É–¥–∞ —Å–º–æ—Ç—Ä—è—Ç –º–æ–¥–µ—Ä–∞—Ç–æ—Ä—ã —Ñ–æ—Ç–æ–≥—Ä–∞—Ñ–∏–π? –∫–∞–∫ –±—ã–ª–æ –≤–æ–∑–º–æ–∂–Ω–æ, —á—Ç–æ–±—ã –ø—Ä–æ–ø—É—Å—Ç–∏–ª–∏ —ç—Ç—É —Ñ–æ—Ç–æ–≥—Ä–∞—Ñ–∏—é? —ç—