In [1]:
import pickle
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from rank_paraphrases import rank_paraphrases

In [2]:
# Checkpoint identifiers shared by the cells below.
cls_checkpoint = 's-nlp/roberta_toxicity_classifier'
lm_checkpoint = 's-nlp/t5-paraphrase-paws-msrp-opinosis-paranmt'
# Part of the LM checkpoint id after the '/' separator; reused in the result file name.
model_name = lm_checkpoint.partition('/')[2]

In [3]:
# Read the input sentences, one entry per line of the text file.
# The encoding is stated explicitly so the result does not depend on the platform
# default and agrees with the UTF-8 encoding used when the results are written out.
with open('test_1k_toxic.txt', encoding='utf-8') as f:
    toxic_inputs = f.read().splitlines()

In [5]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code during deserialization;
    # only point this at files produced by a trusted run.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Previously generated sample sets; they are merged into `samples` further down.
samples1 = _load_pickle('caif-t5-paraphrase-paws-msrp-opinosis-paranmt_test_1k.pickle')
samples2 = _load_pickle('caif-t5-paraphrase-paws-msrp-opinosis-paranmt_test_1k2.pickle')
samples2_1 = _load_pickle('caif-t5-paraphrase-paws-msrp-opinosis-paranmt_test_1k2_1.pickle')

In [6]:
# Preview only the first few entries instead of dumping the whole list into the output.
samples2_1[:3]

[["That's always the answer to you smoot that no tax and no tax pay but be an entrepreneur.",
  'Tax tax never works and you can earn yourself',
  'That will always be the answer to you, tax tax, tax tax never work and do it yourself',
  "Yeah That always answers you, Tax, Tax can't be done, just earn your money.",
  'That is always the answer for the naysayers, Tax tax never is tax evading, Tax evading work, Earn yourself'],
 ["When the call is about their sacrosanct and they call people 'crybabe'......but not if they're on their own.",
  "When they call it'saunt, they just call people's - 'Cause they know 'em a little scared.",
  "When they sigh with their shilling, they call everyone a cry, ''",
  "When they call out their bld, they merely call people's crys",
  "When a kid gets their money for his, 'do not use his stubs, just call the people to a compass."],
 ['When do you start on that sah mlb?',
  'When do you start the saft?',
  'When do you start on that sanity?',
  'When are t

In [10]:
# Build one candidate list per input: samples1 and samples2 are concatenated into a
# single sequence, which is then paired element-wise with samples2_1, and the two
# lists of each pair are joined.
first_run = samples1 + samples2

# zip() stops at the shorter sequence without complaint, which would silently drop
# entries and leave later inputs without their candidates; fail loudly instead.
assert len(first_run) == len(samples2_1), (
    f'sample sets are misaligned: {len(first_run)} vs {len(samples2_1)} entries'
)

samples = [first + second for first, second in zip(first_run, samples2_1)]

In [11]:
# Tokenizer and sequence-classification model are both loaded from `cls_checkpoint`;
# the model is moved to the GPU right after loading.
style_tokenizer = AutoTokenizer.from_pretrained(cls_checkpoint)
style_cls_model = AutoModelForSequenceClassification.from_pretrained(cls_checkpoint).cuda()

Some weights of the model checkpoint at s-nlp/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Sentence-embedding model handed to rank_paraphrases as `sim_model` below.
sim_checkpoint = 'paraphrase-xlm-r-multilingual-v1'
sim_model = SentenceTransformer(sim_checkpoint)

In [17]:
# For every input sentence, rank its candidate list with rank_paraphrases and keep
# both the full ranking and the selected best candidate.
para_results = []
best_candidates = []

# zip() has no length, so tqdm is told the number of pairs explicitly; without it
# only a bare iteration counter is shown (no progress bar, no ETA).
n_pairs = min(len(toxic_inputs), len(samples))

for toxic_text, candidates in tqdm(zip(toxic_inputs, samples), total=n_pairs):
    ranking = rank_paraphrases(
        candidates,
        toxic_text,
        style_cls_model,
        style_tokenizer,
        sim_model,
        style_score_threshold=0.8
    )
    para_results.append(ranking['ranked_candidates'])
    # NOTE(review): index 2 of 'best_candidate' is presumably the candidate text
    # (it is written to a text file later) - confirm against rank_paraphrases.
    best_candidates.append(ranking['best_candidate'][2])

1000it [00:47, 21.22it/s]


In [18]:
# Write the selected candidates to a UTF-8 text file, one per line.
output_path = f'caif-{model_name}-10-samples-threshold-0,8_test_1k.txt'
with open(output_path, 'w', encoding='utf-8') as out_file:
    payload = ''.join(sentence + '\n' for sentence in best_candidates)
    out_file.write(payload)