In [1]:
# Device configuration.
# To restrict the run to a specific GPU, uncomment the two lines below and
# put the ID of the device into CUDA_VISIBLE_DEVICES:
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '5'
# `use_cuda` is forwarded to every `load_model` call in this notebook.
use_cuda = True # True to use CUDA; otherwise, set it to False

In [2]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel

In [3]:
from ru_detoxification_evaluation import load_model

## Reading input dataset and results of model inference

In [4]:
# The input dataset is expected to be a TSV file holding the original toxic
# sentences and/or their neutral references. Missing values are replaced with
# empty strings right after loading.
df = pd.read_csv('../data/input/dev.tsv', sep='\t').fillna('')

In [5]:
# Preview the first two rows to check the loaded columns.
df.head(2)

Unnamed: 0,toxic_comment,neutral_comment1,neutral_comment2,neutral_comment3
0,пиздеж! температуры горения хватит чтобы её ра...,Враньё! Температуры горения хватит чтобы ее ра...,"неправда,температуры горения хватит чтобы расп...",Враньё! Температуры горения хватит на чтобы её...
1,а ты чмо там был.ты вообще служил.гандон,А ты там был? Ты вообще служил?,,


In [6]:
# Toxic source sentences, one per dataset row.
toxic_inputs = df['toxic_comment'].tolist()

# For every row, gather its three neutral reference columns into one list,
# in column order. Selecting the columns and converting once replaces the
# row-wise `iterrows()` loop (which also carried an unused `index` variable);
# the resulting list of lists is the same.
reference_columns = ['neutral_comment1', 'neutral_comment2', 'neutral_comment3']
neutral_references = df[reference_columns].values.tolist()

In [7]:
# Model outputs are expected in a .txt file with one prediction per line
# (separated with '\n').
with open('../data/output/new-caif-rugpt3-paraphraser_dev.txt', 'r', encoding='utf-8') as preds_file:
    # Strip surrounding whitespace, including the trailing newline, from every line.
    preds = [line.strip() for line in preds_file]

## Style Transfer Accuracy (STA)

In [8]:
from ru_detoxification_metrics import evaluate_style

In [9]:
# Load the classifier checkpoint used for the style score, together with its tokenizer.
style_model, style_tokenizer = load_model('s-nlp/russian_toxicity_classifier', use_cuda=use_cuda)

In [10]:
# Score the model outputs with the style classifier loaded above.
accuracy = evaluate_style(
    model=style_model,
    tokenizer=style_tokenizer,
    texts=preds,
    target_label=0,  # label convention: 1 is toxic, 0 is neutral
    batch_size=32,
    verbose=True,
)

  0%|          | 0/25 [00:00<?, ?it/s]

In [11]:
# Report the mean of the style scores over all predictions.
print(f'Style transfer accuracy (STA):  {np.mean(accuracy)}')

Style transfer accuracy (STA):  0.7680249810218811


## Meaning Preservation Score (SIM)

In [12]:
from ru_detoxification_metrics import evaluate_cosine_similarity

In [13]:
# Load the checkpoint used for the similarity score together with its tokenizer.
# `model_class=AutoModel` overrides whatever default class `load_model` uses.
meaning_model, meaning_tokenizer = load_model(
    'cointegrated/LaBSE-en-ru',
    use_cuda=use_cuda,
    model_class=AutoModel,
)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Compare every toxic input with the prediction at the same position.
similarity = evaluate_cosine_similarity(
    model=meaning_model,
    tokenizer=meaning_tokenizer,
    original_texts=toxic_inputs,
    rewritten_texts=preds,
    batch_size=32,
    verbose=True,
)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [15]:
# Report the mean of the similarity scores over all predictions.
print(f'Meaning preservation (SIM):  {np.mean(similarity)}')

Meaning preservation (SIM):  0.39833030104637146


## Fluency Score (FL)

In [16]:
from ru_detoxification_metrics import evaluate_cola_relative

In [17]:
# Load the checkpoint used for the fluency score, together with its tokenizer.
# NOTE(review): `cola_tolenizer` is a misspelling of "tokenizer"; the fluency
# evaluation cell reads the same name, so rename both occurrences together.
cola_model, cola_tolenizer = load_model('s-nlp/rubert-base-corruption-detector', use_cuda=use_cuda)

In [18]:
# Score the predictions relative to their toxic inputs with the model loaded above.
fluency = evaluate_cola_relative(
    model=cola_model,
    tokenizer=cola_tolenizer,
    original_texts=toxic_inputs,
    rewritten_texts=preds,
    target_label=1,
    batch_size=32,
    verbose=True,
)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [19]:
# Report the mean of the fluency scores over all predictions.
print(f'Fluency score (FL):  {np.mean(fluency)}')

Fluency score (FL):  0.8065676093101501


## Joint Score (J)

In [20]:
# Element-wise product of the three score arrays: an entry is zero whenever
# any of its three factors is zero.
# NOTE(review): assumes the three arrays are aligned by prediction — confirm.
joint = accuracy * similarity * fluency

In [21]:
# Show the joint scores.
# NOTE(review): this displays the whole array; a slice such as `joint[:10]`
# would keep the saved output short.
joint

array([2.46479645e-01, 5.07902682e-01, 1.96248759e-02, 6.55506074e-01,
       0.00000000e+00, 7.30593741e-01, 1.14290245e-01, 5.86284436e-02,
       8.55390504e-02, 1.08936131e-01, 5.35859346e-01, 5.46079397e-01,
       7.51483366e-02, 2.32018933e-01, 7.06034184e-01, 7.26914257e-02,
       2.04498872e-01, 3.83508801e-01, 3.28810960e-02, 1.10529391e-02,
       0.00000000e+00, 4.39830035e-01, 5.70685685e-01, 1.22965522e-01,
       3.50806005e-02, 6.48836553e-01, 1.31133467e-01, 1.10794321e-01,
       2.69820206e-02, 1.05388025e-02, 5.00120744e-02, 3.13538671e-01,
       2.24190444e-01, 2.78390050e-01, 3.65133733e-01, 1.25230089e-01,
       1.75598308e-01, 3.79498512e-01, 9.26436037e-02, 1.23347923e-01,
       7.41392493e-01, 6.35362029e-01, 2.58834869e-01, 3.71663332e-01,
       3.11604619e-01, 3.92805427e-01, 5.60602434e-02, 3.65320176e-01,
       2.39653349e-01, 5.99738836e-01, 4.04670760e-02, 1.56458125e-01,
       2.47989163e-01, 4.10046101e-01, 1.25206942e-02, 1.11533046e-01,
      

In [22]:
# Report the mean of the joint scores over all predictions.
print(f'Joint score (J):   {np.mean(joint)}')

Joint score (J):   0.2373957484960556


## Analysis

In [36]:
# Mean style score over the predictions whose style score is at most 0.5.
accuracy[accuracy <= 0.5].mean()

0.18001422

In [33]:
# Mean similarity over the predictions whose style score exceeds 0.5.
similarity[accuracy > 0.5].mean()

0.3699633

In [34]:
# Mean similarity over the predictions whose style score is at most 0.5.
similarity[accuracy <= 0.5].mean()

0.48400152

## ChrF1 with references

In [24]:
from nltk.translate.chrf_score import corpus_chrf

In [25]:
# chrF between the predictions and the neutral references.
# NOTE(review): every entry of `neutral_references` is a list of three strings
# (some of them empty after `fillna('')`), not a single string — confirm that
# `corpus_chrf` treats such multi-reference entries the way this metric intends.
corpus_chrf(neutral_references, preds)

0.2831523701734911