# Injection Accuracy Evaluation

In [1]:
import numpy as np
import torch
import pandas as pd
import requests

from scipy.spatial.distance import cosine
from tqdm import tqdm
from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

import sys
sys.path.append("../utils")
from wikidata import load_wikidata_json

In [2]:
df = pd.read_json("../datasets/fewrel/fewrel_val.json", lines=True)
wikidata_dict = load_wikidata_json("../datasets/wikidata_entity_data.json")

In [3]:
# prepare dataset and add necessary columns
# injected retrieved entity
df["entity_retrieved"] = df.apply(lambda row: row[row["cf_entity_type"] + "_retrieved"], axis=1)
df["id_entity_retrieved"] = df.apply(lambda row: row[row["cf_entity_type"] + "_id"], axis=1)
# replace entity in full_text with injected retrieved entity
df["full_text_retrieved"] = df.apply(
    lambda row: row["full_text"].replace(row[row["cf_entity_type"]], row[row["cf_entity_type"] + "_retrieved"]), 
    axis=1)
# retrieved == entity name in text
df["match"] = df.apply(lambda row: row[row["cf_entity_type"]] == row[f"{row['cf_entity_type']}_retrieved"], axis=1)

pos_samples = df[["full_text", "prompt", "h_retrieved", "r", "t_retrieved", "entity_retrieved", "id_entity_retrieved", "full_text_retrieved", "match"]]
pos_samples = pos_samples.rename(columns={
    "h_retrieved": "h",
    "t_retrieved": "t",
    "entity_retrieved": "injected_entity", 
    "id_entity_retrieved": "injected_entity_id", 
    "full_text_retrieved": "injected_entity_text"})
pos_samples["label"] = 1

neg_samples = df[["full_text", "prompt", "h_retrieved", "r", "t_retrieved", "cf_entity", "cf_id", "cf_full_text", "cf_entity_type", "match"]]
neg_samples = neg_samples.rename(columns={
    "h_retrieved": "h",
    "t_retrieved": "t",
    "cf_entity": "injected_entity", 
    "cf_id": "injected_entity_id", 
    "cf_full_text": "injected_entity_text"})
# replace correct triple entity with counterfactual entity
neg_samples.loc[neg_samples["cf_entity_type"] == "t", "t"] = neg_samples.loc[neg_samples["cf_entity_type"] == "t", "injected_entity"]
neg_samples.loc[neg_samples["cf_entity_type"] == "h", "h"] = neg_samples.loc[neg_samples["cf_entity_type"] == "h", "injected_entity"]
neg_samples = neg_samples.drop(columns="cf_entity_type", axis=1)
neg_samples["label"] = 0

# 500 pos samples where entity exactly matches
# 500 pos samples where entity are aliases
# 500 neg samples with cf
# 500 neg samples with no entity
pos_samples_match = pos_samples[pos_samples["match"]].head(500)
pos_samples_alias = pos_samples[~pos_samples["match"]].head(500)
pos_samples = pd.concat([pos_samples_match, pos_samples_alias])

neg_samples_cf = neg_samples.head(500)
neg_samples_missing = neg_samples.iloc[500:1000].copy()
neg_samples_missing["injected_entity_text"] = neg_samples.apply(lambda row: row["injected_entity_text"].replace(row["injected_entity"], ""), axis=1)
neg_samples = pd.concat([neg_samples_cf, neg_samples_missing])

df_samples = pd.concat([pos_samples, neg_samples])
df_samples = df_samples.reset_index(drop=True).reset_index().rename(columns={"index": "id"})

## Matching

In [4]:
def injection_acc_match(df, use_aliases=True):
    correct_pred = 0
    for _, row in df.iterrows():
        gen = row["full_text"][len(row["prompt"]):]

        if use_aliases:
            aliases = wikidata_dict[row["injected_entity_id"]]["aliases"]
            if aliases:
                aliases = set(sum([x.split(", ") for x in aliases], []))
        else:
            aliases = None
        
        #if row["injected_entity"].lower() in gen.lower() or (aliases and any([a.lower() in gen.lower() for a in aliases])):
        # problem if case insensitive: e.g., Germany as alias DE, -> de might be machted in text because it appears in other words
        if row["injected_entity"] in gen or (aliases and any([a in gen for a in aliases])):
            pred = 1
        else:
            pred = 0
        correct_pred += pred == row["label"]
    return np.round(correct_pred / len(df), 4)

In [5]:
print(injection_acc_match(df_samples))
print(injection_acc_match(pos_samples, use_aliases=True))
print(injection_acc_match(pos_samples_match, use_aliases=True))
print(injection_acc_match(pos_samples_alias, use_aliases=True))
print(injection_acc_match(neg_samples, use_aliases=True))
print(injection_acc_match(neg_samples_cf, use_aliases=True))
print(injection_acc_match(neg_samples_missing, use_aliases=True))

0.87
0.744
1.0
0.488
0.996
0.992
1.0


## LLM prompt

In [6]:
model_name = "meta-llama/Llama-2-13b-chat-hf"
hf_token = "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"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, padding_side="left")
tokenizer.pad_token = tokenizer.bos_token
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True, token=hf_token, bnb_4bit_compute_dtype=torch.float16) 

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
llama_chat_template = """<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>

{user_message} [/INST] Answer:"""

system_prompt = """You are given a fact in the form of a (subject, predicate, object) triple and a sentence. Your task is to check if the given fact is present in the sentence.
Answer only with 'Yes' or 'No'."""

user_message_template = """Fact: ({h}, {r}, {t})
Sentence: {sentence}"""

In [9]:
def injection_acc_llm(df, batch_size=32):
    correct_pred = 0
    total_rows = len(df)
    prompts = []
    batch_indices = []
    idx_pos = tokenizer.encode("Yes", add_special_tokens=False)[0]
    idx_neg = tokenizer.encode("No", add_special_tokens=False)[0]
    
    for n, row in df.reset_index(drop=True).iterrows():
        user_message = user_message_template.format(h=row["h"], r=row["r"], t=row["t"], sentence=row["full_text"])
        prompt = llama_chat_template.format(system_prompt=system_prompt, user_message=user_message)
        prompts.append(prompt)
        batch_indices.append(n)

        if len(prompts) == batch_size or n == total_rows - 1:
            inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
            with torch.no_grad():
                outputs = model(**inputs).logits

            for i, logits in enumerate(outputs):
                # proba for Yes > proba for No
                pred = 1 if logits[-1, idx_pos] > logits[-1, idx_neg] else 0
                correct_pred += pred == df.iloc[batch_indices[i]]["label"]

            print(f"Steps: {n+1} - Acc: {correct_pred/(n+1):.4f}")
            prompts = []
            batch_indices=[]
    return correct_pred / total_rows

In [10]:
injection_acc_llm(df_samples)

Steps: 32 - Acc: 0.9062
Steps: 64 - Acc: 0.8594
Steps: 96 - Acc: 0.8438
Steps: 128 - Acc: 0.8594
Steps: 160 - Acc: 0.8812
Steps: 192 - Acc: 0.8802
Steps: 224 - Acc: 0.8973
Steps: 256 - Acc: 0.9062
Steps: 288 - Acc: 0.9132
Steps: 320 - Acc: 0.9187
Steps: 352 - Acc: 0.9176
Steps: 384 - Acc: 0.9167
Steps: 416 - Acc: 0.9207
Steps: 448 - Acc: 0.9129
Steps: 480 - Acc: 0.9167
Steps: 512 - Acc: 0.9180
Steps: 544 - Acc: 0.9136
Steps: 576 - Acc: 0.9115
Steps: 608 - Acc: 0.9079
Steps: 640 - Acc: 0.9062
Steps: 672 - Acc: 0.9062
Steps: 704 - Acc: 0.9020
Steps: 736 - Acc: 0.8940
Steps: 768 - Acc: 0.8893
Steps: 800 - Acc: 0.8862
Steps: 832 - Acc: 0.8870
Steps: 864 - Acc: 0.8819
Steps: 896 - Acc: 0.8817
Steps: 928 - Acc: 0.8750
Steps: 960 - Acc: 0.8729
Steps: 992 - Acc: 0.8710
Steps: 1024 - Acc: 0.8633
Steps: 1056 - Acc: 0.8475
Steps: 1088 - Acc: 0.8483
Steps: 1120 - Acc: 0.8500
Steps: 1152 - Acc: 0.8524
Steps: 1184 - Acc: 0.8547
Steps: 1216 - Acc: 0.8462
Steps: 1248 - Acc: 0.8373
Steps: 1280 - Acc: 0

0.8105

In [11]:
# llama-2-13b-chat-hf 4bit: 0.7955
# - bnb_4bit_compute_dtype=torch.float16: 0.7915 but waaaaay faster
# - logits: 0.80875
# llama-2-13b-chat-hf full precision: 0.8133
# llama-2-70b-chat-hf 4bit: 0.7990

In [12]:
injection_acc_llm(pos_samples)

Steps: 32 - Acc: 0.9062
Steps: 64 - Acc: 0.8594
Steps: 96 - Acc: 0.8438
Steps: 128 - Acc: 0.8594
Steps: 160 - Acc: 0.8812
Steps: 192 - Acc: 0.8802
Steps: 224 - Acc: 0.8973
Steps: 256 - Acc: 0.9062
Steps: 288 - Acc: 0.9132
Steps: 320 - Acc: 0.9187
Steps: 352 - Acc: 0.9176
Steps: 384 - Acc: 0.9167
Steps: 416 - Acc: 0.9207
Steps: 448 - Acc: 0.9129
Steps: 480 - Acc: 0.9167
Steps: 512 - Acc: 0.9180
Steps: 544 - Acc: 0.9136
Steps: 576 - Acc: 0.9115
Steps: 608 - Acc: 0.9079
Steps: 640 - Acc: 0.9062
Steps: 672 - Acc: 0.9062
Steps: 704 - Acc: 0.9020
Steps: 736 - Acc: 0.8940
Steps: 768 - Acc: 0.8893
Steps: 800 - Acc: 0.8862
Steps: 832 - Acc: 0.8870
Steps: 864 - Acc: 0.8819
Steps: 896 - Acc: 0.8817
Steps: 928 - Acc: 0.8750
Steps: 960 - Acc: 0.8729
Steps: 992 - Acc: 0.8710
Steps: 1000 - Acc: 0.8720


0.872

In [13]:
injection_acc_llm(pos_samples_match)

Steps: 32 - Acc: 0.9062
Steps: 64 - Acc: 0.8594
Steps: 96 - Acc: 0.8438
Steps: 128 - Acc: 0.8594
Steps: 160 - Acc: 0.8812
Steps: 192 - Acc: 0.8802
Steps: 224 - Acc: 0.8973
Steps: 256 - Acc: 0.9062
Steps: 288 - Acc: 0.9132
Steps: 320 - Acc: 0.9187
Steps: 352 - Acc: 0.9176
Steps: 384 - Acc: 0.9167
Steps: 416 - Acc: 0.9207
Steps: 448 - Acc: 0.9129
Steps: 480 - Acc: 0.9167
Steps: 500 - Acc: 0.9180


0.918

In [14]:
injection_acc_llm(pos_samples_alias)

Steps: 32 - Acc: 0.8750
Steps: 64 - Acc: 0.8438
Steps: 96 - Acc: 0.8750
Steps: 128 - Acc: 0.8672
Steps: 160 - Acc: 0.8688
Steps: 192 - Acc: 0.8646
Steps: 224 - Acc: 0.8438
Steps: 256 - Acc: 0.8477
Steps: 288 - Acc: 0.8333
Steps: 320 - Acc: 0.8375
Steps: 352 - Acc: 0.8381
Steps: 384 - Acc: 0.8333
Steps: 416 - Acc: 0.8245
Steps: 448 - Acc: 0.8281
Steps: 480 - Acc: 0.8250
Steps: 500 - Acc: 0.8260


0.826

In [15]:
injection_acc_llm(neg_samples)

Steps: 32 - Acc: 0.3750
Steps: 64 - Acc: 0.4844
Steps: 96 - Acc: 0.6146
Steps: 128 - Acc: 0.6797
Steps: 160 - Acc: 0.7375
Steps: 192 - Acc: 0.7552
Steps: 224 - Acc: 0.7143
Steps: 256 - Acc: 0.6953
Steps: 288 - Acc: 0.7257
Steps: 320 - Acc: 0.7406
Steps: 352 - Acc: 0.7415
Steps: 384 - Acc: 0.7422
Steps: 416 - Acc: 0.7500
Steps: 448 - Acc: 0.7277
Steps: 480 - Acc: 0.7208
Steps: 512 - Acc: 0.7148
Steps: 544 - Acc: 0.7188
Steps: 576 - Acc: 0.7153
Steps: 608 - Acc: 0.7220
Steps: 640 - Acc: 0.7203
Steps: 672 - Acc: 0.7262
Steps: 704 - Acc: 0.7259
Steps: 736 - Acc: 0.7242
Steps: 768 - Acc: 0.7240
Steps: 800 - Acc: 0.7262
Steps: 832 - Acc: 0.7296
Steps: 864 - Acc: 0.7315
Steps: 896 - Acc: 0.7400
Steps: 928 - Acc: 0.7468
Steps: 960 - Acc: 0.7510
Steps: 992 - Acc: 0.7500
Steps: 1000 - Acc: 0.7490


0.749

In [16]:
injection_acc_llm(neg_samples_cf)

Steps: 32 - Acc: 0.3750
Steps: 64 - Acc: 0.4844
Steps: 96 - Acc: 0.6146
Steps: 128 - Acc: 0.6797
Steps: 160 - Acc: 0.7375
Steps: 192 - Acc: 0.7552
Steps: 224 - Acc: 0.7143
Steps: 256 - Acc: 0.6953
Steps: 288 - Acc: 0.7257
Steps: 320 - Acc: 0.7406
Steps: 352 - Acc: 0.7415
Steps: 384 - Acc: 0.7422
Steps: 416 - Acc: 0.7500
Steps: 448 - Acc: 0.7277
Steps: 480 - Acc: 0.7208
Steps: 500 - Acc: 0.7200


0.72

In [17]:
injection_acc_llm(neg_samples_missing)

Steps: 32 - Acc: 0.5938
Steps: 64 - Acc: 0.6875
Steps: 96 - Acc: 0.7396
Steps: 128 - Acc: 0.6953
Steps: 160 - Acc: 0.7438
Steps: 192 - Acc: 0.7448
Steps: 224 - Acc: 0.7321
Steps: 256 - Acc: 0.7305
Steps: 288 - Acc: 0.7361
Steps: 320 - Acc: 0.7469
Steps: 352 - Acc: 0.7415
Steps: 384 - Acc: 0.7578
Steps: 416 - Acc: 0.7740
Steps: 448 - Acc: 0.7835
Steps: 480 - Acc: 0.7854
Steps: 500 - Acc: 0.7780


0.778

## Embedding similartiy

In [19]:
def mean_pooling(model_output, attention_mask):
    # Mean Pooling - Take attention mask into account for correct averaging
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def cosine_sim(sentences):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to("cuda")
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1).cpu()
    return 1 - cosine(embeddings[0], embeddings[1])

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2").to("cuda")

In [20]:
def injection_acc_embd(df):
    score = 0
    for _, row in tqdm(df.iterrows(), total=len(df)):
        #pred = cosine_sim([row["full_text"], row["injected_entity_text"]])
        pred = cosine_sim([row["full_text"][len(row["prompt"]):], row["injected_entity_text"][len(row["prompt"]):]])
        pred = 0 if pred < 0.8 else 1 # threshold
        score += 1 if pred == row["label"] else 0
    return score / len(df)

In [21]:
injection_acc_embd(df_samples)

100%|██████████| 2000/2000 [00:16<00:00, 123.67it/s]


0.8215

In [22]:
injection_acc_embd(pos_samples)

100%|██████████| 1000/1000 [00:06<00:00, 144.08it/s]


0.86

In [23]:
injection_acc_embd(pos_samples_match)

100%|██████████| 500/500 [00:03<00:00, 144.56it/s]


1.0

In [24]:
injection_acc_embd(pos_samples_alias)

100%|██████████| 500/500 [00:03<00:00, 143.41it/s]


0.72

In [25]:
injection_acc_embd(neg_samples)

100%|██████████| 1000/1000 [00:06<00:00, 144.18it/s]


0.783

In [26]:
injection_acc_embd(neg_samples_cf)

100%|██████████| 500/500 [00:03<00:00, 143.94it/s]


0.852

In [27]:
injection_acc_embd(neg_samples_missing)

100%|██████████| 500/500 [00:03<00:00, 145.21it/s]


0.714

## Entailment Model

In [28]:
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli").to("cuda")

def entailment(text):
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model(**inputs)
    # 1 if proba for entailment > proba for contradiction
    return 1 if output.logits[0][0] < output.logits[0][2] else 0 

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
def injection_acc_entail(df):
    correct_pred = 0
    for _, row in tqdm(df.iterrows(), total=len(df)):
        sentences = row["full_text"] + " " + row["injected_entity_text"]
        pred = entailment(sentences)
        correct_pred += pred == row["label"]
    return correct_pred / len(df)

In [30]:
injection_acc_entail(df_samples)

100%|██████████| 2000/2000 [00:25<00:00, 78.12it/s]


0.721

In [31]:
injection_acc_entail(pos_samples)

100%|██████████| 1000/1000 [00:12<00:00, 78.51it/s]


0.95

In [32]:
injection_acc_entail(pos_samples_match)

100%|██████████| 500/500 [00:06<00:00, 78.43it/s]


1.0

In [33]:
injection_acc_entail(pos_samples_alias)

100%|██████████| 500/500 [00:06<00:00, 77.56it/s]


0.9

In [34]:
injection_acc_entail(neg_samples)

100%|██████████| 1000/1000 [00:12<00:00, 77.76it/s]


0.492

In [35]:
injection_acc_entail(neg_samples_cf)

100%|██████████| 500/500 [00:06<00:00, 77.54it/s]


0.94

In [36]:
injection_acc_entail(neg_samples_missing)

100%|██████████| 500/500 [00:06<00:00, 78.02it/s]


0.044