In [None]:
import logging
import numpy as np
import os
import pandas as pd
import sys
import torch
import transformers

from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification

sys.path.extend(["../utils", "../evaluate"])
from metrics import compute_n_gram_entropy
from wikidata import load_wikidata_json

In [None]:
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disable tokenizer warning

# set up logging
formatter = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
# Drop handlers left over from an earlier execution of this cell. Without this,
# every re-run attaches one more StreamHandler to the same logger object and
# each message is emitted once per attached handler.
logger.handlers.clear()
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
# reduce transformers' own log output to errors and turn off its progress bars
transformers.utils.logging.set_verbosity_error()
transformers.utils.logging.disable_progress_bar()

In [None]:
# directory that is scanned for result files and that prefixes the evaluation CSV path
output_dir = "results/pipeline"

In [None]:
results = []

# every directory entry whose name contains "json", in sorted order
files = sorted(entry for entry in os.listdir(output_dir) if "json" in entry)
logger.info(f"Found {len(files)} files in {output_dir}")

In [None]:
# display the list of result files found above
files

In [None]:
# pick one result file by its position in `files`; the following cells evaluate only this file
n = 0
fname = files[n]
fname

In [None]:
logger.info(f"{n+1}/{len(files)} Evaluating {fname}")
# The part of the file name before the first "." must consist of exactly five
# "_"-separated fields, otherwise the unpacking raises ValueError. The first
# field is ignored.
_, dataset, split, method, model_name = fname.split(".")[0].split("_")

# load data
# both files are read as JSON Lines (one record per line)
df_gen = pd.read_json(f"{output_dir}/{fname}", lines=True)
df_gt = pd.read_json(f"../datasets/{dataset}/{dataset}_{split}.json", lines=True)
# NOTE(review): merge() is called with pandas' defaults (no `on=` / `how=`),
# i.e. presumably an inner join on all shared column names — confirm that
# dropping unmatched rows is intended.
df = df_gt.merge(df_gen)
# entity data; the accuracy helpers below read it through the global name `wikidata_dict`
wikidata_dict = load_wikidata_json("../datasets/wikidata_entity_data.json")
logger.info(f"Loaded dataframe with {len(df)} rows")

In [None]:
def accuracy(row):
    """Tell whether the original generation names the expected entity.

    Only the first sentence of ``row["gen"]`` is inspected, after cutting off
    its first ``len(row["prompt"])`` characters. The row counts as correct
    when ``row["<cf_entity_type>_retrieved"]`` or any alias stored in the
    global ``wikidata_dict`` for ``row["<cf_entity_type>_id"]`` occurs in
    that text as a substring.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``gen``, ``prompt``, ``cf_entity_type`` and the two
            ``<cf_entity_type>_id`` / ``<cf_entity_type>_retrieved`` keys.

    Returns:
        bool: True on a match, False otherwise.
    """
    continuation = sent_tokenize(row["gen"])[0][len(row["prompt"]):]
    entity_kind = row["cf_entity_type"]

    alias_names = wikidata_dict[row[f"{entity_kind}_id"]]["aliases"]
    if alias_names:
        # a stored entry may bundle several aliases separated by ", "
        alias_names = {alias for entry in alias_names for alias in entry.split(", ")}

    if row[f"{entity_kind}_retrieved"] in continuation:
        return True
    if not alias_names:
        return False
    return any(alias in continuation for alias in alias_names)

# flag every row, then show the share of flagged rows in percent
df["acc"] = df.apply(accuracy, axis=1)
np.round(100 * df["acc"].mean(), 2)

In [None]:
def accuracy_pipeline(row):
    """Tell whether the pipeline generation names the expected entity.

    Works on the first sentence of ``row["gen_pipeline"]`` with its first
    ``len(row["prompt"])`` characters removed. A hit is either
    ``row["<cf_entity_type>_retrieved"]`` or one of the aliases listed in the
    global ``wikidata_dict`` under ``row["<cf_entity_type>_id"]`` appearing
    in that text as a substring.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``gen_pipeline``, ``prompt``, ``cf_entity_type`` and the
            ``<cf_entity_type>_id`` / ``<cf_entity_type>_retrieved`` keys.

    Returns:
        bool: True on a hit, False otherwise.
    """
    first_sentence = sent_tokenize(row["gen_pipeline"])[0]
    answer_text = first_sentence[len(row["prompt"]):]
    kind = row["cf_entity_type"]

    candidates = wikidata_dict[row[f"{kind}_id"]]["aliases"]
    if candidates:
        # split ", "-joined entries into individual alias strings
        candidates = {part for joined in candidates for part in joined.split(", ")}

    label_found = row[f"{kind}_retrieved"] in answer_text
    if label_found:
        return True
    return bool(candidates) and any(part in answer_text for part in candidates)

# flag every row, then show the share of flagged rows in percent
df["acc_pipeline"] = df.apply(accuracy_pipeline, axis=1)
np.round(100 * df["acc_pipeline"].mean(), 2)

In [None]:
# acc_edit
# `~` binds tighter than `==`, so this compares the negated "acc" column with
# "correction": a row counts when "correction" is set exactly where "acc" is
# False (and unset exactly where "acc" is True). Shown in percent.
np.round((~df["acc"] == df["correction"]).mean() * 100, 2)

In [None]:
def accuracy_retrieval(row):
    """Tell whether the first retrieved entry carries the expected entity id.

    Args:
        row: Mapping-like record with ``triple_retrieved``,
            ``cf_entity_type`` and ``<cf_entity_type>_id``. When
            ``triple_retrieved`` is truthy, its ``"retrieved"`` list is read
            and element ``[0][1]`` is taken as the retrieved entity id.

    Returns:
        bool: True when that id equals ``row["<cf_entity_type>_id"]``;
        False otherwise, including when ``triple_retrieved`` is falsy.
    """
    if not row["triple_retrieved"]:
        return False
    top_entity_id = row["triple_retrieved"]["retrieved"][0][1]
    expected_id = row[f"{row['cf_entity_type']}_id"]
    return True if top_entity_id == expected_id else False
df["acc_retrieval"] = df.apply(accuracy_retrieval, axis=1)

# acc_retrieval_all
# restricted to rows where "correction" is set, in percent
np.round(100 * df[df["correction"]]["acc_retrieval"].mean(), 2)

In [None]:
def injection_accuracy(row):
    """Tell whether the pipeline generation names the first retrieved entity.

    The text inspected is the first sentence of ``row["gen_pipeline"]`` with
    its first ``len(row["prompt"])`` characters removed. The first entry of
    ``row["triple_retrieved"]["retrieved"]`` supplies the entity label
    (element 0) and entity id (element 1). Aliases for that id are looked up
    in the global ``wikidata_dict``; an unknown id simply yields no aliases.

    Args:
        row: Mapping-like record with ``gen_pipeline``, ``prompt`` and
            ``triple_retrieved``.

    Returns:
        bool: True when the label or one of the aliases occurs in the text
        as a substring; False otherwise, including when
        ``triple_retrieved`` is falsy.
    """
    answer_text = sent_tokenize(row["gen_pipeline"])[0][len(row["prompt"]):]
    if not row["triple_retrieved"]:
        return False

    entity_label = row["triple_retrieved"]["retrieved"][0][0]
    entity_id = row["triple_retrieved"]["retrieved"][0][1]

    alias_names = set()
    entity_record = wikidata_dict.get(entity_id)
    if entity_record:
        stored = entity_record["aliases"]
        if stored:
            # split ", "-joined entries into individual alias strings
            alias_names = {name for joined in stored for name in joined.split(", ")}

    if entity_label in answer_text:
        return True
    return any(name in answer_text for name in alias_names)

# flag every row, then show the share among rows where "correction" is set, in percent
df["acc_injection_all"] = df.apply(injection_accuracy, axis=1)
np.round(100 * df[df["correction"]]["acc_injection_all"].mean(), 2)

In [None]:
def fluency_ngram_entropy(df):
    """Mean relative change of the n-gram entropy from ``gen`` to ``gen_pipeline``.

    For every row, ``compute_n_gram_entropy`` is evaluated on the first
    sentence of each of the two columns; the per-row value is
    ``(post - pre) / pre``.

    Args:
        df: DataFrame with the text columns ``gen`` and ``gen_pipeline``.

    Returns:
        The mean of the per-row relative changes (a fraction, not a percentage).
    """
    def _first_sentence_entropy(column):
        # entropy of the first sentence of `column`, one value per row
        return df.apply(lambda record: compute_n_gram_entropy(sent_tokenize(record[column])[0]), axis=1)

    entropy_before = _first_sentence_entropy("gen")
    entropy_after = _first_sentence_entropy("gen_pipeline")
    relative_change = (entropy_after - entropy_before) / entropy_before
    return relative_change.mean()

# mean relative n-gram-entropy change, shown in percent
np.round(fluency_ngram_entropy(df) * 100, 2)

In [None]:
# Sequence-classification model and tokenizer that grammatical_correctness()
# picks up through the global names `tokenizer` and `model`.
# NOTE(review): this rebinds `model_name`, which other cells also assign from
# the last field of the result file name.
model_name = "textattack/roberta-base-CoLA"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, device_map="auto")

In [None]:
def predict_acceptability(sentences, tokenizer, model):
    """Predict grammatical-acceptability scores for a batch of sentences.

    Args:
        sentences: Text or list of texts handed to ``tokenizer``.
        tokenizer: Callable that tokenizes ``sentences`` and returns a
            mapping with a ``.to(device)`` method.
        model: Classifier that exposes ``.device`` and whose output has a
            ``.logits`` tensor of shape (batch, classes).

    Returns:
        numpy.ndarray: One value per input, the softmax probability of class
        index 1 (presumably the "acceptable" label — confirm against the
        model's label mapping).
    """
    # truncation=True cuts inputs down to the model's maximum length; without
    # it an over-long text (e.g. a complete generation) is passed on at full
    # length and the forward pass fails.
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    probas = torch.softmax(outputs.logits, dim=1).detach().cpu().numpy()
    return probas[:, 1]
    
def grammatical_correctness(df, full_gen=False, batch_size=64):
    """Mean relative change of the acceptability score from ``gen`` to ``gen_pipeline``.

    Texts are scored in batches with ``predict_acceptability`` using the
    global ``tokenizer`` and ``model``; the per-row value is
    ``(post - pre) / pre``.

    Args:
        df: DataFrame with the text columns ``gen`` and ``gen_pipeline``.
        full_gen: Score the complete generation when True, otherwise only
            its first sentence.
        batch_size: Maximum number of texts per ``predict_acceptability`` call.

    Returns:
        The mean of the per-row relative changes (a fraction, not a
        percentage); NaN for an empty frame.
    """
    n_rows = len(df)
    if n_rows == 0:
        # nothing to score; avoids a 0/0 division below
        return float("nan")

    if full_gen:
        texts_pre = df["gen"].tolist()
        texts_post = df["gen_pipeline"].tolist()
    else:
        texts_pre = [sent_tokenize(text)[0] for text in df["gen"]]
        texts_post = [sent_tokenize(text)[0] for text in df["gen_pipeline"]]

    # Batches are cut by position, not by index label, so the trailing partial
    # batch is always scored, whatever index the frame carries.
    total = 0.0
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        grammar_pre = predict_acceptability(texts_pre[start:stop], tokenizer, model)
        grammar_post = predict_acceptability(texts_post[start:stop], tokenizer, model)
        total += np.sum((grammar_post - grammar_pre) / grammar_pre)
    return total / n_rows

# relative acceptability change on the first sentences (full_gen defaults to False), shown in percent
np.round(grammatical_correctness(df) * 100, 2)

In [None]:
# num_edits: number of rows whose "correction" flag is set
df["correction"].sum()

In [None]:
# fixing edits: rows with "correction" set where "acc" is False and "acc_pipeline" is True
len(df[~df["acc"] & df["correction"] & df["acc_pipeline"]])

In [None]:
# breaking edits: rows with "correction" set where "acc" is True and "acc_pipeline" is False
len(df[df["acc"] & df["correction"] & ~df["acc_pipeline"]])

In [None]:
# mean of each timing column, rounded to two decimals, one line per column
for timing_column in ("t_gen", "t_extraction", "t_retrieval", "t_injection", "t_pipeline"):
    print(timing_column, df[timing_column].mean().round(2))

In [None]:
# analyzing pipeline results
# number of corrected rows where the retrieval was wrong but the pipeline generation is still correct
len(df[df["correction"] & ~df["acc_retrieval"] & df["acc_pipeline"]])
#df[df["correction"] & ~df["acc_retrieval"] & df["acc_pipeline"]].to_excel("pipeline_wrong_retr_correct_pipe_ici.xlsx", index=False)

In [None]:
# Evaluate every result file in `output_dir` and write one summary row per file to a CSV.
results = []

files = sorted([file for file in os.listdir(output_dir) if "json" in file])
logger.info(f"Found {len(files)} files in {output_dir}")

# The entity data is the same for every result file, so it is loaded once up
# front instead of being re-read on every iteration. The accuracy helpers read
# it through the global name `wikidata_dict`.
wikidata_dict = load_wikidata_json("../datasets/wikidata_entity_data.json")

for n, fname in enumerate(files):
    logger.info(f"{n+1}/{len(files)} Evaluating {fname}")
    # The part of the file name before the first "." must hold exactly five
    # "_"-separated fields (the first is ignored). The last field is bound to
    # `gen_model_name` so it does not overwrite the `model_name` that names
    # the grammar classifier.
    _, dataset, split, method, gen_model_name = fname.split(".")[0].split("_")

    # load data
    df_gen = pd.read_json(f"{output_dir}/{fname}", lines=True)
    df_gt = pd.read_json(f"../datasets/{dataset}/{dataset}_{split}.json", lines=True)
    df = df_gt.merge(df_gen)
    logger.info(f"Loaded dataframe with {len(df)} rows")

    # per-row flags used by the aggregate metrics below
    df["acc"] = df.apply(accuracy, axis=1)
    df["acc_pipeline"] = df.apply(accuracy_pipeline, axis=1)
    df["acc_retrieval"] = df.apply(accuracy_retrieval, axis=1)

    results.append(
        {
            "output_dir": output_dir,
            "fname": fname,
            "dataset": dataset,
            "split": split,
            "method": method,
            "model_name": gen_model_name,
            "n_samples": len(df),
            "acc": df["acc"].mean(),
            "acc_pipeline": df["acc_pipeline"].mean(),
            # `~` binds tighter than `==`: negated "acc" compared with "correction"
            "acc_edit": (~df["acc"] == df["correction"]).mean(),
            "acc_retrieval": df[~df["acc"] & df["correction"]]["acc_retrieval"].mean(),
            "acc_retrieval_all": df[df["correction"]]["acc_retrieval"].mean(),
            "acc_injection": df[~df["acc"] & df["correction"] & df["acc_retrieval"]]["acc_pipeline"].mean(),
            # NOTE(review): the exploratory cell computes a column of the same
            # name with injection_accuracy(); here it is derived from
            # "acc_pipeline". Confirm which definition is intended.
            "acc_injection_all": df[df["correction"]]["acc_pipeline"].mean(),
            "acc_retrieval_n_samples": len(df[~df["acc"] & df["correction"] & df["acc_retrieval"]]),
            "num_edits": df["correction"].sum(),
            "num_correct_edits": len(df[df["correction"] & df["acc_pipeline"]]),
            "num_breaking_edits": len(df[df["acc"] & df["correction"] & ~df["acc_pipeline"]]),
            "fluency": fluency_ngram_entropy(df),
            "grammar": grammatical_correctness(df, full_gen=False),
            "t_gen": df["t_gen"].mean().round(2),
            "t_extraction": df["t_extraction"].mean().round(2),
            "t_retrieval": df["t_retrieval"].mean().round(2),
            "t_injection": df[df["t_injection"] != 0]["t_injection"].mean().round(2), # do not consider cases with no injection
            "t_injection_all": df["t_injection"].mean().round(2),
            "t_pipeline": df["t_pipeline"].mean().round(2)
        }
    )
    logger.info(f"Finished evaluation for file '{fname}'")

logger.info(f"Finished evaluation for {output_dir}")
# despite its name, `results_dir` is the path of the CSV file that is written
results_dir = f"{output_dir}_eval.csv"
logger.info(f"Writing results to {results_dir}")
pd.DataFrame(results).to_csv(results_dir, index=False)