# Qualitative Evaluation Picker

In this notebook, instances of the produced results are sampled for qualitative evaluation, and the scores collected in that evaluation are then analysed.

In [32]:
import pandas as pd
import os
from scipy import stats

In [33]:
# Read the result tables of the original and of the custom models. In both
# tables the 'index' column is renamed to 'id' directly after loading.
all_original_results = (
    pd.read_csv('../results/allResults.csv')
    .rename(columns={'index': 'id'})
)
all_custom_results = (
    pd.read_csv('../results/allResults-custom.csv')
    .rename(columns={'index': 'id'})
)

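There are 6 models (three original models that use the full dataset, and their three `-custom` counterparts that use the cleaned dataset):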
1. explanation
2. label
3. label-explanation

4. explanation-custom
5. label-custom
6. label-explanation-custom

In [34]:
def exportSpecificRows(results, ids):
    '''Select the rows whose 'id' is in ``ids``, ordered by ascending 'id'.

    Despite its name, this function does not write anything to disk: it only
    returns the selection, and exporting it is left to the caller.

    Args:
        results: pandas dataframe with the results; must contain an 'id' column
        ids: list of ids to be selected

    Returns:
        A pandas dataframe with the matching rows of ``results``, sorted by
        'id'. Ids that do not occur in ``results`` are silently skipped.
    '''
    return results[results['id'].isin(ids)].sort_values(by=['id'])
    

In [35]:
# Manually added the indexes here after they were generated by the sampler
# NOTE(review): id 9656 occurs in both sets below - confirm that this overlap is intended.
ids_set_Amber_Leo = [259, 602, 1122, 1127, 1155, 1228, 1292, 1612, 1865, 2424, 2489, 3043, 3254, 3441, 3858, 4252, 4537, 4731, 4757,
                     4833, 5839, 5880, 5899, 6071, 6191, 6395, 6560, 7044, 7072, 7208, 7212, 7321, 7562, 7714, 8004, 8127, 8714, 8825, 9656, 9754]
ids_set_Lorenzo_Phillip = [148, 691, 793, 1289, 1296, 1545, 1625, 1780, 1847, 2187, 2316, 2553, 2622, 2646, 2670, 2676, 3230, 3272,
                           3401, 3550, 4211, 4584, 4906, 5015, 5374, 6278, 6670, 6878, 7060, 7465, 7480, 7743, 8347, 8770, 8827, 9295, 9488, 9575, 9656, 9714]

# exist_ok=True makes the directory creation idempotent, so no separate
# existence check is needed before creating the output folder.
os.makedirs('../results/qualitativeEvaluation', exist_ok=True)

# Select the rows of each id set from the results of the original models ...
amber_leo_original_set = exportSpecificRows(
    all_original_results, ids_set_Amber_Leo)
lorenzo_phillip_original_set = exportSpecificRows(
    all_original_results, ids_set_Lorenzo_Phillip)

# ... and from the results of the custom models.
amber_leo_custom_set = exportSpecificRows(
    all_custom_results, ids_set_Amber_Leo)
lorenzo_phillip_custom_set = exportSpecificRows(
    all_custom_results, ids_set_Lorenzo_Phillip)

# Write one Excel sheet per (id set, model family) combination.
amber_leo_original_set.to_excel(
    '../results/qualitativeEvaluation/Amber_Leo_original.xlsx', index=False)
lorenzo_phillip_original_set.to_excel(
    '../results/qualitativeEvaluation/Lorenzo_Phillip_original.xlsx', index=False)

amber_leo_custom_set.to_excel(
    '../results/qualitativeEvaluation/Amber_Leo_custom.xlsx', index=False)
lorenzo_phillip_custom_set.to_excel(
    '../results/qualitativeEvaluation/Lorenzo_Phillip_custom.xlsx', index=False)

In [36]:
def sampler(dataset, column_to_sort, samples_per_quartile = 2, random_state = None):
    '''Sample examples from each quartile of a dataset based on a column_to_sort

    The rows are split into four groups using the 25%, 50% and 75% quantiles
    of ``column_to_sort``; a row whose value equals a cut point belongs to the
    lower group. From every group ``samples_per_quartile`` rows are drawn
    without replacement.

    Args:
        dataset: pandas dataframe with the results
        column_to_sort: column to sort the dataset by to use for quartile sampling
        samples_per_quartile: number of samples to take from each quartile
        random_state: optional seed (or random generator) that is passed to
            ``DataFrame.sample`` for every quartile. With the default ``None``
            the draw differs between runs; pass an int to make it reproducible.

    Returns:
        pandas dataframe with the sampled examples, quartile 1 first and
        quartile 4 last

    Raises:
        ValueError: if a quartile holds fewer than ``samples_per_quartile`` rows
    '''
    values = dataset[column_to_sort]

    # The cut points do not change between quartiles, so compute them once.
    q25, q50, q75 = values.quantile([0.25, 0.5, 0.75]).tolist()

    quartile_masks = [
        values <= q25,                      # Quartile 1
        (values > q25) & (values <= q50),   # Quartile 2
        (values > q50) & (values <= q75),   # Quartile 3
        values > q75,                       # Quartile 4
    ]

    samples = []
    for quartile_number, mask in enumerate(quartile_masks, start=1):
        quartile = dataset[mask]
        # Fail with an explicit message instead of the generic pandas error.
        if len(quartile) < samples_per_quartile:
            raise ValueError(
                'Quartile {} of column {!r} has only {} rows, cannot sample {}'.format(
                    quartile_number, column_to_sort, len(quartile), samples_per_quartile))
        samples.append(quartile.sample(n=samples_per_quartile, random_state=random_state))

    # Return concatenated dataframe
    return pd.concat(samples)
    

In [37]:
# Draw 10 examples from each quartile of the neural score of the custom explanation model.
# We use the explanation model instead of label-explanation. Hopefully that does not have a big impact for the sorting.
custom_sampled_results = sampler(
    all_custom_results,
    'rug-nlp-nli/flan-base-nli-explanation-custom_neural_score',
    samples_per_quartile = 10,
)
# Pick the examples with the same id from the original results
original_sampled_results = all_original_results[
    all_original_results['id'].isin(custom_sampled_results['id'])
]

# For both tables: keep only the columns shown to the annotators, rename
# 'label' to 'correct_label' for clarity, hide the model names behind neutral
# 'prediction_N' headers (to reduce bias in the qualitative evaluation), and
# order by id so that the two tables are easy to compare.
masked_original_results = (
    original_sampled_results[[
        'id', 'premise', 'hypothesis',
        'label',
        'rug-nlp-nli/flan-base-nli-explanation_prediction',
        'rug-nlp-nli/flan-base-nli-label-explanation_prediction',
    ]]
    .rename(columns={
        'label': 'correct_label',
        'rug-nlp-nli/flan-base-nli-explanation_prediction': 'prediction_1',
        'rug-nlp-nli/flan-base-nli-label-explanation_prediction': 'prediction_2',
    })
    .sort_values(by=['id'])
)

masked_custom_results = (
    custom_sampled_results[[
        'id', 'premise', 'hypothesis',
        'label',
        'rug-nlp-nli/flan-base-nli-explanation-custom_prediction',
        'rug-nlp-nli/flan-base-nli-label-explanation-custom_prediction',
    ]]
    .rename(columns={
        'label': 'correct_label',
        'rug-nlp-nli/flan-base-nli-explanation-custom_prediction': 'prediction_3',
        'rug-nlp-nli/flan-base-nli-label-explanation-custom_prediction': 'prediction_4',
    })
    .sort_values(by=['id'])
)

In [38]:
# Make a copy of the original results to add the custom results to.
all_qualitative_results = masked_original_results.copy()

# Copy prediction_3 and prediction_4 to the original results. The values are
# looked up through 'id', so the result does not depend on both tables having
# the same number of rows in the same order. An id without a custom prediction
# yields NaN; duplicated ids in the custom table raise an error.
custom_predictions_by_id = masked_custom_results.set_index('id')
for prediction_column in ['prediction_3', 'prediction_4']:
    all_qualitative_results[prediction_column] = all_qualitative_results['id'].map(
        custom_predictions_by_id[prediction_column])

In [39]:
# Export the combined dataframe (original and custom predictions side by side) to a single csv.
# to_csv is called without index=False, so the dataframe index is written as the first, unnamed column.
all_qualitative_results.to_csv('../results/results_for_qual_evaluation.csv')

In [40]:
# Display the masked results from the original models, that use the full dataset
# (prediction_1 = explanation model, prediction_2 = label-explanation model)
masked_original_results

Unnamed: 0,id,premise,hypothesis,correct_label,prediction_1,prediction_2
7356,170,A man wearing a red uniform and helmet stands ...,a man standing on his motorbike.,entailment,A man wearing a red uniform and helmet stands ...,entailment: A man wearing a red uniform and he...
6369,285,A group of people gathered at night watching a...,A group of humans are looking at the same dire...,neutral,Just because a group of people are watching an...,entailment: People are humans and watching an ...
6385,665,Man holding torch of fire.,The man is holding a bag.,contradiction,The man cannot be holding a torch of fire and ...,contradiction: A man cannot be holding a torch...
5440,1098,A bare chested smiling child plays in water.,There is a child getting naked in the water.,neutral,Just because a child is bare chested doesn't m...,entailment: A bare chested smiling child is a ...
9055,1125,A woman is in the middle of hitting a tennis b...,A woman is sleeping.,contradiction,A woman cannot be hitting a tennis ball and sl...,contradiction: A woman cannot be hitting a ten...
7409,1155,Man in red jacket is opening the door while ho...,The man was holding the door for his girlfreind,neutral,Just because the man is opening the door does ...,neutral: Just because a man is opening the doo...
6147,1351,Mom and little boy having fun & eating by the ...,A mom and a son spending time together on thei...,neutral,A mom and little boy having fun & eating by th...,neutral: Mom and little boy having fun & eatin...
383,1451,A man surfs on a medium sized wave while holdi...,An extreme kayaker steers the treacherous waters,neutral,Just because a man surfs on a medium sized wav...,contradiction: The man can't surf on a medium ...
681,1606,A man in a red shirt is sitting on top of a ro...,A rugged man sits atop a mountain.,neutral,Not all men are rugged.,neutral: Not all men are rugged.
5335,1737,A woman is lying on her stomach on a white pil...,A woman cries because her husband left.,neutral,Just because a woman is lying on her stomach o...,neutral: A woman is lying on her stomach on a ...


In [41]:
# Display the masked results from the custom models, that use the cleaned dataset
# (prediction_3 = explanation-custom model, prediction_4 = label-explanation-custom model)
masked_custom_results

Unnamed: 0,id,premise,hypothesis,correct_label,prediction_3,prediction_4
7354,170,A man wearing a red uniform and helmet stands ...,a man standing on his motorbike.,entailment,A man wearing a red uniform and helmet stands ...,entailment: A man wearing a red uniform and he...
6338,285,A group of people gathered at night watching a...,A group of humans are looking at the same dire...,neutral,Just because a group of people are watching an...,"entailment: People are humans, and watching an..."
6661,665,Man holding torch of fire.,The man is holding a bag.,contradiction,The man is either holding a torch of fire or a...,contradiction: The man cannot be holding a tor...
7252,1098,A bare chested smiling child plays in water.,There is a child getting naked in the water.,neutral,A child can be bare chested without getting na...,entailment: A bare chested child is a child ge...
8965,1125,A woman is in the middle of hitting a tennis b...,A woman is sleeping.,contradiction,The woman cannot be hitting a tennis ball and ...,contradiction: The woman can't be hitting a te...
8534,1155,Man in red jacket is opening the door while ho...,The man was holding the door for his girlfreind,neutral,Just because the man is opening the door does ...,neutral: Just because the man is opening the d...
5505,1351,Mom and little boy having fun & eating by the ...,A mom and a son spending time together on thei...,neutral,Just because a mom and little boy are having f...,neutral: Just because a mom and little boy are...
387,1451,A man surfs on a medium sized wave while holdi...,An extreme kayaker steers the treacherous waters,neutral,A man surfs on a medium sized wave while holdi...,contradiction: The man cannot surf on a medium...
666,1606,A man in a red shirt is sitting on top of a ro...,A rugged man sits atop a mountain.,neutral,A man in a red shirt is sitting on top of a ro...,neutral: Not all men are rugged.
6237,1737,A woman is lying on her stomach on a white pil...,A woman cries because her husband left.,neutral,Just because a woman is lying on her stomach o...,neutral: Just because a woman is lying on her ...


In [42]:
# Import the results from the qualitative evaluation
# NOTE(review): combined.csv is not written by any cell visible in this notebook -
# presumably it is assembled outside of it from the annotators' sheets; confirm.
qualitative_results = pd.read_csv('../results/qualitativeEvaluation/combined.csv')

qualitative_results

Unnamed: 0.1,Unnamed: 0,id,premise,hypothesis,correct_label,explanation_original,prediction_1_score,Correct explanation_1?,explanation-label_original,prediction_2_score,...,explanation-custom_rouge_1_max,explanation-custom_rouge_2_max,explanation-custom_rouge_L_max,label-explanation-custom_neural_score,label-explanation-custom_rouge_1_max,label-explanation-custom_rouge_2_max,label-explanation-custom_rouge_L_max,label-explanation-custom_correct_label?,label-explanation-custom_label_difference,Unnamed: 37
0,7459,259,A person in orange clothing rests above a metr...,A person is waiting for a train.,neutral,Just because a person rests above a metro entr...,2,1,neutral: Resting above a metro entrance does n...,2,...,0.750000,0.636364,0.750000,0.168663,0.720000,0.608696,0.720000,True,"predicted: neutral, target: neutral",
1,6326,602,The little boy gets ready to kick the soccer b...,the boy is sleeping at home,contradiction,The boy cannot be sleeping and getting ready t...,5,1,contradiction: The boy cannot be sleeping and ...,5,...,0.785714,0.384615,0.642857,0.138307,0.740741,0.320000,0.592593,True,"predicted: contradiction, target: contradiction",
2,7302,1122,Kids play in water coming up in streams out of...,Kids are playing in water.,entailment,Kids are playing in water is a rephrasing of k...,1,1,entailment: Kids are playing in water is a rep...,1,...,0.622222,0.511628,0.444444,0.161489,0.693878,0.638298,0.489796,True,"predicted: entailment, target: entailment",
3,7113,1127,A woman with short blond-hair rises from a cha...,A woman is getting ready to box,neutral,Just because a woman rises from a chair does n...,3,1,neutral: A woman with short blond-hair rises f...,2,...,0.830189,0.705882,0.830189,0.127005,0.562500,0.400000,0.562500,False,"predicted: contradiction, target: neutral",
4,7409,1155,Man in red jacket is opening the door while ho...,The man was holding the door for his girlfreind,neutral,Just because the man is opening the door does ...,5,1,neutral: Just because a man is opening the doo...,5,...,0.685714,0.520000,0.685714,0.232764,0.666667,0.509804,0.666667,True,"predicted: neutral, target: neutral",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,7568,9295,Smilling and laughing baby in a walker with fi...,Many toys strewn about on the floor.,entailment,Smilling and laughing baby in a walker with fi...,3,1,entailment: Smilling and laughing baby in a wa...,3,...,0.511628,0.390244,0.511628,0.146193,0.536585,0.410256,0.536585,True,"predicted: entailment, target: entailment",
76,2001,9488,Three children in a black dog kennel.,There are dogs in the kennel.,contradiction,There can either be three children in a dog ke...,3,1,entailment: There are dogs in the kennel is a ...,1,...,0.521739,0.285714,0.434783,0.031745,0.500000,0.307692,0.484848,False,"predicted: entailment, target: contradiction",
77,4557,9575,Man balding with a mustache and semi beard sta...,The man knows how to play the guitar.,entailment,"If a man is playing the guitar, he knows how t...",5,1,entailment: The man knows how to play the guit...,1,...,0.595745,0.355556,0.510638,0.086345,0.583333,0.347826,0.500000,True,"predicted: entailment, target: entailment",
78,4476,9656,An African-American male youth is riding a gre...,A boy is walking on a sidewalk in his neighbor...,contradiction,One cannot be riding a bicycle and walking at ...,5,1,contradiction: The boy cannot be riding a bicy...,5,...,0.866667,0.571429,0.733333,0.052103,0.666667,0.400000,0.518519,True,"predicted: contradiction, target: contradiction",


In [43]:
# Run the Wilcoxon signed-rank test on the qualitative scores of every pair of
# models, to see if there is a significant difference between them.
# NOTE(review): six pairwise tests are run and the raw p-values are printed;
# no multiple-comparison correction is applied here.

# Guide:
# prediction_1 = explanation-original
# prediction_2 = label-explanation-original
# prediction_3 = explanation-custom
# prediction_4 = label-explanation-custom

for first in range(1, 5):
    # Only pair a model with the models that come after it, so each pair is tested once.
    for second in range(first + 1, 5):
        first_scores = qualitative_results[f'prediction_{first}_score']
        second_scores = qualitative_results[f'prediction_{second}_score']
        print(f'Comparing prediction_{first} vs prediction_{second}')
        print(stats.wilcoxon(first_scores, second_scores))


Comparing prediction_1 vs prediction_2
WilcoxonResult(statistic=95.5, pvalue=0.038436633737780265)
Comparing prediction_1 vs prediction_3
WilcoxonResult(statistic=115.5, pvalue=0.11192989811532168)
Comparing prediction_1 vs prediction_4
WilcoxonResult(statistic=96.5, pvalue=0.012918616378731767)
Comparing prediction_2 vs prediction_3
WilcoxonResult(statistic=181.5, pvalue=0.42564679450053244)
Comparing prediction_2 vs prediction_4
WilcoxonResult(statistic=71.5, pvalue=0.808750855238056)
Comparing prediction_3 vs prediction_4
WilcoxonResult(statistic=201.5, pvalue=0.3509945701597773)


In [44]:
# Now we can try to find the threshold of the neural score that gives correct explanations, based on the qualitative evaluation
import plotly.express as px

# Checking the explanation-original model
column_name = 'Correct explanation_1?'
model_name = "explanation-original_neural_score"

# A notebook cell only renders the value of its last expression, so this first
# figure has to be shown explicitly; otherwise it is created and then discarded.
px.scatter(qualitative_results,x=model_name, y=column_name, color=column_name, hover_name=column_name).show()

# Checking the label-explanation-custom model
# NOTE(review): this column name ends with a space, unlike 'Correct explanation_1?' -
# it has to match the csv header exactly; confirm.
column_name = 'Correct explanation_4? '
model_name = "label-explanation-custom_neural_score"

px.scatter(qualitative_results,x=model_name, y=column_name, color=column_name, hover_name=column_name)


In [45]:
# Checking the label-explanation-original model
# NOTE(review): this column name ends with a space, unlike 'Correct explanation_1?' -
# it has to match the csv header exactly; confirm.
column_name = 'Correct explanation_2? '
model_name = "label-explanation-original_neural_score"

# Scatter the neural score (x) against the annotators' correctness judgement (y, also used as colour).
px.scatter(qualitative_results,x=model_name, y=column_name, color=column_name, hover_name=column_name)


In [46]:
# Checking the explanation-custom model
column_name = 'Correct explanation_3?'
model_name = "explanation-custom_neural_score"

# Scatter the neural score (x) against the annotators' correctness judgement (y, also used as colour).
px.scatter(qualitative_results,x=model_name, y=column_name, color=column_name, hover_name=column_name)
