### 0. Import libraries

In [1]:
# Uncomment and run these two commands if the packages are not installed yet
# !pip install transformers
# !pip install accelerate

In [2]:
# import libraries
import torch
import numpy as np
import transformers
import pandas as pd

# import local scripts
from data_loader import DatasetLoader

# produce repeatable results: seed NumPy's global RNG and call the
# transformers seeding helper with the same fixed seed (42)
np.random.seed(seed=42)
transformers.set_seed(42)

# enable CUDNN deterministic mode and switch its benchmark mode off
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

2024-06-27 18:40:28.437431: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-27 18:40:28.574966: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-27 18:40:29.277123: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-06-27 18:40:29.277193

### 1. Load the data

Below are the names of the datasets used by the authors to check if contrastive explanations identify linguistically appropriate evidence.

In [3]:
# dataset names, grouped into one list per linguistic phenomenon
anaphor_agreement_datasets = ["anaphor_gender_agreement", "anaphor_number_agreement"]

# NOTE(review): not sure "drop_argument" is the right dataset for this
# group — confirm before relying on it
argument_structure_datasets = ["drop_argument"]

determiner_noun_agreement_datasets = [
    "determiner_noun_agreement_1",
    "determiner_noun_agreement_2",
    "determiner_noun_agreement_irregular_1",
    "determiner_noun_agreement_irregular_2",
    "determiner_noun_agreement_with_adj_2",
    "determiner_noun_agreement_with_adj_irregular_1",
    "determiner_noun_agreement_with_adj_irregular_2",
    "determiner_noun_agreement_with_adjective_1",
]

# NOTE(review): the name looks like a typo for "npi_licensing" and, like
# `subject_verb_agreement` below, lacks the `_datasets` suffix used by the
# lists above; left unchanged here because other cells may reference it
npi_licesing = [
    "matrix_question_npi_licensor_present",
    "only_npi_licensor_present",
    "sentential_negation_npi_licensor_present",
]

subject_verb_agreement = [
    "irregular_plural_subject_verb_agreement_1",
    "irregular_plural_subject_verb_agreement_2",
    "regular_plural_subject_verb_agreement_1",
    "regular_plural_subject_verb_agreement_2",
]

In [4]:
# load the first anaphor-agreement dataset ("anaphor_gender_agreement")
# through the project's DatasetLoader
data_loader = DatasetLoader()
data = data_loader.load_data(anaphor_agreement_datasets[0])

In [5]:
# wrap the "train" split in a DataFrame; as the cell's last expression it
# is rendered as a table
pd.DataFrame(data["train"])

Unnamed: 0,sentence_good,sentence_bad,field,linguistics_term,UID,simple_LM_method,one_prefix_method,two_prefix_method,lexically_identical,pair_id
0,Katherine can't help herself.,Katherine can't help himself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,0
1,Karla could listen to herself.,Karla could listen to himself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,1
2,Marie won't think about herself.,Marie won't think about itself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,2
3,Mark hasn't discussed himself.,Mark hasn't discussed itself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,3
4,Stephen impressed himself.,Stephen impressed itself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,4
...,...,...,...,...,...,...,...,...,...,...
995,Carlos complained about himself.,Carlos complained about itself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,995
996,Benjamin is firing himself.,Benjamin is firing itself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,996
997,Chad kisses himself.,Chad kisses herself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,997
998,Paula was arguing about herself.,Paula was arguing about himself.,morphology,anaphor_agreement,anaphor_gender_agreement,True,True,False,False,998


### 2. Develop the alignment metrics

In [6]:
# toy saliency map used to try out the metrics: 10 random scores divided
# by their total, so the scores sum to 1
random_floats = np.random.rand(10)
explanation = random_floats / sum(random_floats)
explanation

array([0.07200801, 0.18278161, 0.14073106, 0.11509637, 0.0299957 ,
       0.02999106, 0.01116699, 0.16652855, 0.11556865, 0.13613201])

In [7]:
# toy known-evidence mask with 10 entries: 1 marks an evidence position
# (indices 5 and 6 here), 0 marks every other position
known_evidence = np.zeros(10)
known_evidence[5] = 1
known_evidence[6] = 1
known_evidence

array([0., 0., 0., 0., 0., 1., 1., 0., 0., 0.])

In [8]:
def compute_mean_dot_product(explanations, known_evidences):
    """
    Averages, over all sentences, the dot product of each explanation
    with its known evidence.

    Args:
        explanations: sized collection with one saliency vector per sentence.
        known_evidences: one evidence vector per sentence, paired with
            `explanations` by position.

    Returns:
        The sum of the per-sentence dot products divided by
        `len(explanations)`.
    """

    # one dot product per (explanation, evidence) pair, in sentence order
    per_sentence_scores = (
        np.dot(saliency_map, evidence_mask)
        for saliency_map, evidence_mask in zip(explanations, known_evidences)
    )

    return sum(per_sentence_scores) / len(explanations)

In [9]:
# try the metric on the single toy explanation / known-evidence pair
compute_mean_dot_product([explanation], [known_evidence])

0.04115804897193038

In [10]:
def compute_probes_needed(explanation, known_evidence):
    """
    Computes the number of words we need to probe (look at) based on
    the saliency map to find a word that is part of the known evidence.
    Basically, we get the 1-based rank of the best-ranked evidence word
    after words were sorted descending by the saliency value.

    Args:
        explanation: sequence with one saliency value per word.
        known_evidence: sequence paired with `explanation` by position;
            a non-zero entry marks the word as known evidence.

    Returns:
        int: rank of the first evidence word in the descending saliency
        order. Words with equal saliency keep their original order. If no
        word is marked as evidence, the result is the number of words
        plus one.
    """

    # Keep one (saliency, flag) pair per word. Building a dict keyed by the
    # saliency value would merge words that share the same saliency and
    # drop their evidence flags.
    word_infos = list(zip(explanation, known_evidence))

    # Sort on the saliency value only; the sort is stable, so tied words
    # stay in their original order instead of being ordered by their flag.
    ranked_word_infos = sorted(
        word_infos, key=lambda word_info: word_info[0], reverse=True
    )

    for rank, (_, is_known_evidence) in enumerate(ranked_word_infos, start=1):
        if is_known_evidence != 0:
            return rank

    # no evidence word found: every word was probed without success
    return len(ranked_word_infos) + 1

In [11]:
def compute_mean_probes_needed(explanations, known_evidences):
    """
    Averages the result of `compute_probes_needed` over all sentences,
    i.e. the mean number of words that have to be probed, going down the
    saliency ranking, before a known-evidence word is reached.

    Args:
        explanations: sized collection with one saliency vector per sentence.
        known_evidences: one evidence vector per sentence, paired with
            `explanations` by position.

    Returns:
        The sum of the per-sentence probe counts divided by
        `len(explanations)`.
    """

    # probe count for every (explanation, evidence) pair, in sentence order
    probes_per_sentence = (
        compute_probes_needed(saliency_map, evidence_mask)
        for saliency_map, evidence_mask in zip(explanations, known_evidences)
    )

    return sum(probes_per_sentence) / len(explanations)

In [12]:
# try the metric on the single toy explanation / known-evidence pair
compute_mean_probes_needed([explanation], [known_evidence])

9.0

In [13]:
def compute_mean_reciprocal_rank(explanations, known_evidences):
    """
    Calculates the average of the inverse rank of the first token that
    is part of the known evidence.

    For every sentence, only the first entry equal to 1 in the known
    evidence (first by position) is kept; the rank of that token in the
    descending saliency order is taken from `compute_probes_needed` and
    inverted.

    NOTE(review): "first" is positional here, not "best ranked" — confirm
    that this is the intended reading of the metric.

    Args:
        explanations: sized collection with one saliency vector per sentence.
        known_evidences: one evidence vector per sentence, paired with
            `explanations` by position. The inputs are not modified.

    Returns:
        The sum of the per-sentence inverse ranks divided by
        `len(explanations)`.
    """

    sum_of_inverse_ranks = 0

    for explanation, known_evidence in zip(explanations, known_evidences):
        # Work on a private copy: zeroing entries of the caller's array
        # would corrupt it for every metric computed afterwards. The
        # conversion also lets plain lists be compared element-wise below.
        first_evidence_only = np.array(known_evidence)

        # consider only the appearance of the first token of interest
        later_evidence_positions = np.where(first_evidence_only == 1)[0][1:]
        first_evidence_only[later_evidence_positions] = 0

        inverse_ranking = 1 / compute_probes_needed(explanation, first_evidence_only)
        sum_of_inverse_ranks += inverse_ranking

    mean_inverse_rank = sum_of_inverse_ranks / len(explanations)

    return mean_inverse_rank

In [14]:
# try the metric on the single toy explanation / known-evidence pair
compute_mean_reciprocal_rank([explanation], [known_evidence])

0.1111111111111111