In [1]:
import numpy as np
import pandas as pd
RESPONSES_NO = 10

### 1. Preprocess the input form data

In [2]:
form_data = pd.read_json("../results/form_data.json").drop(["explanation", "wrong word"], axis=1)
form_data = form_data.set_index("input sentence")
print(f"Shape of the form data: {form_data.shape}")
form_data.head()

Shape of the form data: (32, 3)


Unnamed: 0_level_0,phenomenon,correct word,contrastive function
input sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Vases were broken by the,argument structure,boy,gradient norm
Helen is observed by the,argument structure,doctor,input x gradient
Curtis is concealed by some,argument structure,teenager,input erasure
A public park is biked to by the,argument structure,child,input erasure
Malls are not talked about by some,argument structure,teenager,input erasure


### 2. Preprocess the answers

In [3]:
form_results = pd.read_csv("../results/form_results.csv").drop(["Marcaj de timp"], axis=1).dropna().T
print(f"Shape of the form results: {form_results.shape}")
form_results.head()

Shape of the form results: (64, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Vases were broken by the ...,boy,boy,boy,boy,boy,boy,boy,boy,boy,boy
Is the provided explanation useful?,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
Helen is observed by the ...,doctor,doctor,doctor,doctor,doctor,doctor,doctor,doctor,doctor,doctor
Is the provided explanation useful?,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
Curtis is concealed by some ...,paralysis,teenager,teenager,teenager,teenager,teenager,teenager,teenager,teenager,teenager


#### 2.1. Get the chosen predicted word

In [4]:
chosen_predicted_word = form_results[~form_results.index.str.contains("Is the provided explanation useful?")]
chosen_predicted_word.index = chosen_predicted_word.index.map(lambda x: x[:-4])
chosen_predicted_word = chosen_predicted_word.rename({index: "pred_" + str(index) for index in range(RESPONSES_NO)}, axis=1)
print(f"Shape of data that contains the chosen predicted word: {chosen_predicted_word.shape}")
chosen_predicted_word.head()

Shape of data that contains the chosen predicted word: (32, 10)


Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9
Vases were broken by the,boy,boy,boy,boy,boy,boy,boy,boy,boy,boy
Helen is observed by the,doctor,doctor,doctor,doctor,doctor,doctor,doctor,doctor,doctor,doctor
Curtis is concealed by some,paralysis,teenager,teenager,teenager,teenager,teenager,teenager,teenager,teenager,teenager
A public park is biked to by the,child,child,child,child,child,child,child,child,child,child
Malls are not talked about by some,teenager,dog,dog,teenager,teenager,teenager,teenager,teenager,teenager,teenager


#### 2.2. Get the chosen usefulness of the explanations

In [5]:
explanation_usefulness = form_results[form_results.index.str.contains("Is the provided explanation useful?")]
explanation_usefulness = explanation_usefulness.reset_index().drop(["index"], axis=1)
explanation_usefulness = explanation_usefulness.rename({index: "usefulness_" + str(index) for index in range(RESPONSES_NO)}, axis=1)
explanation_usefulness.index = chosen_predicted_word.index
print(f"Shape of data that contains the chosen usefulness of the explanation: {explanation_usefulness.shape}")
explanation_usefulness.head()

Shape of data that contains the chosen usefulness of the explanation: (32, 10)


Unnamed: 0,usefulness_0,usefulness_1,usefulness_2,usefulness_3,usefulness_4,usefulness_5,usefulness_6,usefulness_7,usefulness_8,usefulness_9
Vases were broken by the,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
Helen is observed by the,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
Curtis is concealed by some,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes
A public park is biked to by the,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes
Malls are not talked about by some,Yes,No,No,No,Yes,Yes,No,Yes,Yes,Yes


### 3. Join datasets

In [6]:
merged_form_data = pd.concat([form_data.join(chosen_predicted_word), explanation_usefulness], axis=1) 
print(f"Shape of the merged form data: {merged_form_data.shape}")
assert merged_form_data.isna().sum().sum() == 0

pred_columns = ["pred_" + str(index) for index in range(RESPONSES_NO)]
usefulness_columns = ["usefulness_" + str(index) for index in range(RESPONSES_NO)]

for pred_column in pred_columns:
    merged_form_data.loc[:, pred_column] = (merged_form_data[pred_column] == merged_form_data["correct word"]).values.astype("int")

for usefulness_column in usefulness_columns:
    merged_form_data.loc[:, usefulness_column] = (merged_form_data[usefulness_column] == "Yes").values.astype("int")

merged_form_data.head()

Shape of the merged form data: (32, 23)


Unnamed: 0,phenomenon,correct word,contrastive function,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,...,usefulness_0,usefulness_1,usefulness_2,usefulness_3,usefulness_4,usefulness_5,usefulness_6,usefulness_7,usefulness_8,usefulness_9
Vases were broken by the,argument structure,boy,gradient norm,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Helen is observed by the,argument structure,doctor,input x gradient,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Curtis is concealed by some,argument structure,teenager,input erasure,0,1,1,1,1,1,1,...,1,0,1,0,1,1,1,1,1,1
A public park is biked to by the,argument structure,child,input erasure,1,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,1,1
Malls are not talked about by some,argument structure,teenager,input erasure,1,0,0,1,1,1,1,...,1,0,0,0,1,1,0,1,1,1


### 4. Compute statistics

#### 4.1. What is the usefulness per phenomenon and overall?

In [7]:
usefulness_per_phenomenon = merged_form_data.groupby("phenomenon")[usefulness_columns].sum().sum(axis=1) / 80
usefulness_per_phenomenon

phenomenon
argument structure           0.8375
determiner noun agreement    0.9125
npi licensing                 0.675
subject verb agreement        0.875
dtype: object

In [8]:
overall_usefulness = merged_form_data[usefulness_columns].sum().sum() / 320
overall_usefulness

0.825

#### 4.2. What is the accuracy per phenomenon and overall?

In [9]:
accuracy_per_phenomenon = merged_form_data.groupby("phenomenon")[pred_columns].sum().sum(axis=1) / 80
accuracy_per_phenomenon

phenomenon
argument structure           0.8875
determiner noun agreement    0.8125
npi licensing                  0.95
subject verb agreement       0.7625
dtype: object

In [10]:
overall_accuracy = merged_form_data[pred_columns].sum().sum() / 320
overall_accuracy

0.853125

#### 4.3. What is the overall accuracy when the explanation was found useful?

In [11]:
flattened_predictions = merged_form_data[pred_columns].to_numpy().flatten()
indices_useful = np.where(merged_form_data[usefulness_columns].to_numpy().flatten() == 1)[0]
indices_not_useful = np.where(merged_form_data[usefulness_columns].to_numpy().flatten() == 0)[0]

In [12]:
predictions_useful_explanation = flattened_predictions[indices_useful]
predictions_not_useful_explanation = flattened_predictions[indices_not_useful]

In [13]:
accuracy_when_useful_explanation =  predictions_useful_explanation.sum() / len(predictions_useful_explanation)
accuracy_when_useful_explanation

0.8977272727272727

In [14]:
accuracy_when_not_useful_explanation =  predictions_not_useful_explanation.sum() / len(predictions_not_useful_explanation)
accuracy_when_not_useful_explanation

0.6428571428571429