In [1]:
import pandas as pd
import sys
sys.path.append("..")
from mmice.utils import html_highlight_diffs
from mmice.edit_finder import EditEvaluator
from mmice.maskers.random_masker import RandomMasker
from transformers import MT5TokenizerFast
from IPython.display import display, HTML
import numpy as np
import spacy
from tqdm import tqdm

# spaCy pipeline handed to html_highlight_diffs when rendering edits.
nlp = spacy.load("en_core_web_sm")

# NOTE: this name shadows Python's builtin `eval` for the rest of the notebook;
# every later `eval.` / `eval(...)` refers to this EditEvaluator instance.
# The same 700 is used for the tokenizer's model_max_length and as the masker's
# third positional argument (presumably a max length as well; confirm in RandomMasker).
eval = EditEvaluator(
    fluency_model_name="google/mt5-small",
    fluency_masker=RandomMasker(
        None,
        MT5TokenizerFast.from_pretrained(
            "google/mt5-small",
            model_max_length=700,
            legacy=False,
        ),
        700,
    ),
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
# Task whose results are inspected, and the Stage-2 experiment that produced them.
TASK = "imdb"
STAGE2EXP = "mmice-test-editor-final"

# Results directory for this task/experiment; the trailing slash is kept so
# file names can be appended directly.
SAVE_PATH = "../results/" + TASK + "/edits/" + STAGE2EXP + "/"
# Tab-separated file holding every edit written for the experiment.
EDIT_PATH = f"{SAVE_PATH}edits.csv"

In [3]:
def read_edits(path):
    """Load a tab-separated edits file and normalise its prediction columns.

    The columns 'new_pred', 'orig_pred' and 'contrast_pred' are cleaned so that
    missing values become the empty string:

    * If 'new_pred' was parsed as float64 (numeric labels with gaps), every
      numeric prediction column is rewritten as integer strings ("1", "0", ...)
      with "" for missing values. Non-numeric prediction columns in the same
      file only get their missing values replaced by "".
    * Otherwise missing values in all three columns are replaced by "" and the
      remaining values are left untouched.

    Args:
        path: Location of the file; read with sep="\\t" and lineterminator="\\n".

    Returns:
        pandas.DataFrame with the three prediction columns normalised.
    """
    edits = pd.read_csv(path, sep="\t", lineterminator="\n")

    def _label_to_str(value):
        # 1.0 -> "1"; NaN -> "".
        return "" if pd.isna(value) else str(int(value))

    # Same trigger as before: numeric labels are detected through 'new_pred'.
    numeric_labels = edits['new_pred'].dtype == np.dtype('float64')

    for column in ('new_pred', 'orig_pred', 'contrast_pred'):
        values = edits[column]
        if numeric_labels and pd.api.types.is_numeric_dtype(values):
            edits[column] = values.map(_label_to_str)
        else:
            # Assign the result back instead of fillna(inplace=True) on a column
            # selection, which is not guaranteed to update the parent frame.
            edits[column] = values.fillna("")
    return edits

In [4]:
def get_best_edits(edits):
    """Keep only the top-ranked edit rows (those with ``sorted_idx == 0``).

    MiCE writes all edits that are found in Stage 2, but we only want to
    evaluate the smallest per input.

    Args:
        edits: DataFrame of edits with a 'sorted_idx' column.

    Returns:
        A new, independent DataFrame holding the rows whose 'sorted_idx' is 0.
        An explicit copy is returned so that callers can add columns to the
        result without pandas warning about writing into a slice of `edits`.
    """
    return edits[edits['sorted_idx'] == 0].copy()
    
def evaluate_edits(edits):
    """Compute, print and return summary metrics over the top-ranked edits.

    Only rows with ``sorted_idx == 0`` are considered. An edit counts as
    "flipped" when its 'new_pred' equals its 'contrast_pred' (compared as
    strings).

    Args:
        edits: DataFrame with the columns 'sorted_idx', 'data_idx', 'new_pred',
            'contrast_pred', 'minimality', 'fluency' and 'duration'.

    Returns:
        dict with the keys 'num_total' (distinct 'data_idx' values),
        'num_flipped', 'flip_rate' (num_flipped / num_total), and the means of
        'minimality', 'fluency' and 'duration'. When there are no top-ranked
        rows, 'flip_rate' is NaN (like the means) instead of raising
        ZeroDivisionError.
    """
    temp = edits[edits['sorted_idx'] == 0]
    flipped = temp[temp['new_pred'].astype(str) == temp['contrast_pred'].astype(str)]
    nunique = temp['data_idx'].nunique()
    # Guard the empty case: a rate over zero inputs is undefined.
    flip_rate = len(flipped) / nunique if nunique else float("nan")
    metrics = {
        "num_total": nunique,
        "num_flipped": len(flipped),
        "flip_rate": flip_rate,
        "minimality": temp['minimality'].mean(),
        "fluency": temp['fluency'].mean(),
        "duration": temp['duration'].mean(),
    }
    for k, v in metrics.items():
        print(f"{k}: \t{round(v, 3)}")
    return metrics

In [5]:
def display_edits(row):
    """Show one edit: its minimality score, then the original and edited text.

    Args:
        row: Mapping/Series providing 'orig_editable_seg',
            'edited_editable_seg' and 'minimality'.
    """
    # html_highlight_diffs returns one HTML string per side of the comparison.
    original_markup, edited_markup = html_highlight_diffs(
        row['orig_editable_seg'],
        row['edited_editable_seg'],
        nlp,
    )
    score = round(row['minimality'], 3)
    print(f"MINIMALITY: \t{score}")
    print("")
    # Render the original first, then the edited version.
    for markup in (original_markup, edited_markup):
        display(HTML(markup))

def display_classif_results(rows):
    """Print the label summary and highlighted diff for every row in `rows`.

    Args:
        rows: DataFrame whose rows provide 'orig_pred', 'contrast_pred',
            'new_pred', 'orig_contrast_prob_pred', 'new_contrast_prob_pred'
            and the columns read by display_edits.
    """
    for _, edit in rows.iterrows():
        # Probabilities are shown with three decimals.
        prob_before = round(edit['orig_contrast_prob_pred'], 3)
        prob_after = round(edit['new_contrast_prob_pred'], 3)

        print("-----------------------")
        print(f"ORIG LABEL: \t{edit['orig_pred']}")
        print(f"CONTR LABEL: \t{edit['contrast_pred']} (Orig Pred Prob: {prob_before})")
        print(f"NEW LABEL: \t{edit['new_pred']} (New Pred Prob: {prob_after})")
        print("")

        display_edits(edit)

def display_race_results(rows):
    """Print question, options, label summary and highlighted diff per row.

    Args:
        rows: DataFrame whose rows provide 'orig_input', 'orig_pred',
            'contrast_pred', 'new_pred', 'orig_contrast_prob_pred',
            'new_contrast_prob_pred' and the columns read by display_edits.
            'orig_input' is parsed as a Python literal and must yield a
            mapping with the keys 'question' and 'options' (assumed to be the
            string form of a dict as stored in the edits file; confirm).

    Raises:
        ValueError / SyntaxError: if 'orig_input' is not a valid Python literal.
    """
    # Imported locally on purpose: this notebook rebinds the name `eval` to an
    # EditEvaluator instance, so the builtin `eval` is not reachable here.
    # ast.literal_eval parses the stored literal without executing code.
    import ast

    for _, row in rows.iterrows():
        orig_contrast_prob_pred = round(row['orig_contrast_prob_pred'], 3)
        new_contrast_prob_pred = round(row['new_contrast_prob_pred'], 3)
        orig_input = ast.literal_eval(row['orig_input'])
        options = orig_input['options']
        print("-----------------------")
        print(f"QUESTION: {orig_input['question']}")
        print("\nOPTIONS:")
        for opt_idx, opt in enumerate(options):
            print(f"  ({opt_idx}) {opt}")
        print(f"\nORIG LABEL: \t{row['orig_pred']}")
        print(f"CONTR LABEL: \t{row['contrast_pred']} (Orig Pred Prob: {orig_contrast_prob_pred})")
        print(f"NEW LABEL: \t{row['new_pred']} (New Pred Prob: {new_contrast_prob_pred})")
        print("")
        display_edits(row)

In [7]:
# Load every edit from disk and keep only the top-ranked one per input.
edits = get_best_edits(read_edits(EDIT_PATH))

In [8]:
# Register tqdm's pandas integration so `progress_apply` shows a progress bar
# carrying this label.
tqdm.pandas(desc='original sequence loss!')
# Score every original (unedited) segment with the evaluator's score_fluency.
# `a` is the denominator of the fluency ratio (b / a) computed in a later cell.
# NOTE: here `eval` is the EditEvaluator instance, not the builtin.
a = edits["orig_editable_seg"].progress_apply(eval.score_fluency)

original sequence loss!:   5%|▌         | 25/500 [18:11<5:45:41, 43.67s/it]


KeyboardInterrupt: 

In [None]:
# Re-register tqdm's pandas integration with a label for the edited segments.
tqdm.pandas(desc='edited sequence loss!')
# Score every edited segment with the evaluator's score_fluency. Entries that
# are not strings (e.g. missing values) are not scored and get 0 instead.
# `b` is the numerator of the fluency ratio (b / a) computed in a later cell.
b = edits["edited_editable_seg"].progress_apply(lambda x: eval.score_fluency(x) if isinstance(x, str) else 0)

edited sequence loss!:   0%|          | 1/500 [00:05<42:53,  5.16s/it]


KeyboardInterrupt: 

In [None]:
# Fluency = score of the edited segment divided by the score of the original
# segment (b / a, aligned on the row index). assign() builds a new frame rather
# than writing a column into the filtered frame produced by get_best_edits,
# which avoids pandas' SettingWithCopyWarning.
edits = edits.assign(fluency=b / a)
# Persist the top-ranked edits, now including the fluency column.
edits.to_csv(SAVE_PATH + "best_edits.csv", sep="\t", lineterminator="\n")

In [None]:
# Reload the saved top-ranked edits (with their fluency column) and summarise them.
edits = get_best_edits(read_edits(f"{SAVE_PATH}best_edits.csv"))
metrics = evaluate_edits(edits)


num_total: 	500
num_flipped: 	498
flip_rate: 	0.996
minimality: 	0.194
fluency: 	0.989
duration: 	37.342


In [8]:
# Draw one edit at random (unseeded, so each run shows a different example)
# and render it. For RACE-style rows use display_race_results(random_rows) instead.
random_rows = edits.sample(n=1)
display_classif_results(random_rows)

-----------------------
ORIG LABEL: 	POSITIVE
CONTR LABEL: 	NEGATIVE (Orig Pred Prob: 0.052)
NEW LABEL: 	NEGATIVE (New Pred Prob: 0.533)

MINIMALITY: 	0.07

