In [248]:
# Uncomment the lines below to install any of these packages that are missing from your environment.
# !pip install -U nltk
# !pip install -U rouge_score
# !pip install -U datasets
# !pip install -U bert_score

In [249]:
import pandas as pd
import pickle
from nltk.translate import bleu, chrf_score, nist_score
from rouge_score import rouge_scorer
from datasets import load_metric
# Load the BERTScore metric once at module level; calc_bertscore() calls
# `bertscore.compute(...)` on this shared object for every row.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases (metrics
# moved to the separate `evaluate` package) — confirm the installed version still
# provides it.
bertscore = load_metric("bertscore")

In [250]:
# Read the semicolon-separated evaluation file; its first column becomes the index.
EVALUATION_CSV = '/content/evaluation.csv'
df = pd.read_csv(EVALUATION_CSV, sep=';', index_col=0)

In [251]:
def bleu_score(x):
    """Return the best BLEU score of the prediction over the three references.

    Each reference explanation is scored separately against the prediction
    with weights ``(1,)`` (only unigram precision contributes), and the
    largest of the three scores is returned.

    Args:
        x: Row-like mapping with string values under 'explanation_1',
            'explanation_2', 'explanation_3' and 'Predictions'.

    Returns:
        The highest of the three per-reference BLEU scores.
    """
    # The hypothesis is identical for all three comparisons, so tokenize it once.
    hypothesis_tokens = x['Predictions'].split()
    return max(
        bleu([x[reference_column].split()], hypothesis_tokens, (1,))
        for reference_column in ('explanation_1', 'explanation_2', 'explanation_3')
    )

In [252]:
def calc_chrf(x):
    """Return the best sentence-level chrF score over the three references.

    Args:
        x: Row-like mapping holding 'explanation_1', 'explanation_2',
            'explanation_3' (references) and 'Predictions' (hypothesis).

    Returns:
        The largest of the three per-reference chrF scores.
    """
    hypothesis = x['Predictions']
    return max(
        chrf_score.sentence_chrf(x[reference_column], hypothesis)
        for reference_column in ('explanation_1', 'explanation_2', 'explanation_3')
    )

In [253]:
def rouge_score(x, metric):
    """Return the ROUGE scores of the best-matching reference explanation.

    The prediction is scored against each of the three references with the
    requested ROUGE variant (stemming enabled). The reference with the highest
    F-measure wins; on a tie the earliest reference is kept.

    Args:
        x: Row-like mapping holding 'explanation_1', 'explanation_2',
            'explanation_3' (references) and 'Predictions' (hypothesis).
        metric: ROUGE type name handed to ``RougeScorer`` and used to look up
            the result, e.g. 'rouge1', 'rouge2' or 'rougeL'.

    Returns:
        A ``(precision, recall, f1)`` tuple taken from the winning reference.
    """
    scorer = rouge_scorer.RougeScorer([metric], use_stemmer=True)
    prediction = x['Predictions']
    per_reference = [
        scorer.score(x[reference_column], prediction)[metric]
        for reference_column in ('explanation_1', 'explanation_2', 'explanation_3')
    ]
    # max() keeps the first maximal element, i.e. the earliest reference on ties.
    best = max(per_reference, key=lambda score: score.fmeasure)
    return best.precision, best.recall, best.fmeasure

In [254]:
def calc_bertscore(x):
    """Return the BERTScore of the best-matching reference explanation.

    The prediction is compared with each reference in its own
    ``bertscore.compute`` call (one reference/prediction pair per call, model
    "distilbert-base-uncased"). The reference with the highest F1 wins; on a
    tie the earliest reference is kept.

    Args:
        x: Row-like mapping holding 'explanation_1', 'explanation_2',
            'explanation_3' (references) and 'Predictions' (hypothesis).

    Returns:
        A ``(precision, recall, f1)`` tuple taken from the winning reference.
    """
    prediction = x['Predictions']
    per_reference = [
        bertscore.compute(references=[x[reference_column]],
                          predictions=[prediction],
                          model_type="distilbert-base-uncased",
                          lang="en")
        for reference_column in ('explanation_1', 'explanation_2', 'explanation_3')
    ]
    # Each result holds one-element lists because a single pair was scored.
    best = max(per_reference, key=lambda result: result['f1'][0])
    return best['precision'][0], best['recall'][0], best['f1'][0]

In [255]:
# Only the three reference explanations and the prediction are passed to the scorers.
sub_df = df[['explanation_1', 'explanation_2', 'explanation_3', 'Predictions']]

# Metrics that yield a single value per row.
df['bleu'] = sub_df.apply(bleu_score, axis=1)
df['chrf'] = sub_df.apply(calc_chrf, axis=1)

# Each ROUGE variant returns a (precision, recall, f1) triple per row, which
# result_type='expand' spreads over three new columns.
for rouge_type in ('rouge1', 'rouge2', 'rougeL'):
    rouge_columns = [rouge_type + '-precision', rouge_type + '-recall', rouge_type + '-f1']
    df[rouge_columns] = sub_df.apply(rouge_score, args=(rouge_type,), axis=1, result_type='expand')

# BERTScore is expanded into three columns the same way.
bertscore_columns = ['bertscore-precision', 'bertscore-recall', 'bertscore-f1']
df[bertscore_columns] = sub_df.apply(calc_bertscore, axis=1, result_type='expand')

In [256]:
# Row label of the summary table -> per-example score column that is averaged.
summary_columns = {
    'BLEU': 'bleu',
    'chrF': 'chrf',
    'Precision (ROUGE-1)': 'rouge1-precision',
    'Recall (ROUGE-1)': 'rouge1-recall',
    'F1-score (ROUGE-1)': 'rouge1-f1',
    'Precision (ROUGE-2)': 'rouge2-precision',
    'Recall (ROUGE-2)': 'rouge2-recall',
    'F1-score (ROUGE-2)': 'rouge2-f1',
    'Precision (ROUGE-L)': 'rougeL-precision',
    'Recall (ROUGE-L)': 'rougeL-recall',
    'F1-score (ROUGE-L)': 'rougeL-f1',
    'Precision (BERTScore)': 'bertscore-precision',
    'Recall (BERTScore)': 'bertscore-recall',
    'F1-score (BERTScore)': 'bertscore-f1',
}

# One single-element list per label: the column mean rounded to 4 decimals.
score_dict = {
    label: [round(df[column].mean(), 4)]
    for label, column in summary_columns.items()
}

# Transposing turns the labels into the index; the lone column 0 is renamed 'Score'.
df_score = pd.DataFrame.from_dict(score_dict).transpose().rename(columns={0: 'Score'})
print(df_score.to_latex())

\begin{tabular}{lr}
\toprule
{} &   Score \\
\midrule
BLEU                  &  0.4353 \\
chrF                  &  0.5068 \\
Precision (ROUGE-1)   &  0.6095 \\
Recall (ROUGE-1)      &  0.5821 \\
F1-score (ROUGE-1)    &  0.5732 \\
Precision (ROUGE-2)   &  0.3787 \\
Recall (ROUGE-2)      &  0.3602 \\
F1-score (ROUGE-2)    &  0.3547 \\
Precision (ROUGE-L)   &  0.5487 \\
Recall (ROUGE-L)      &  0.5277 \\
F1-score (ROUGE-L)    &  0.5189 \\
Precision (BERTScore) &  0.8628 \\
Recall (BERTScore)    &  0.8585 \\
F1-score (BERTScore)  &  0.8599 \\
\bottomrule
\end{tabular}



Error Analysis

In [268]:
# Per-example score columns inspected in the error analysis. The BERTScore entry
# is spelled 'bertscore-f1' (hyphen) to match the column name created when the
# scores were added to `df`; the former 'bertscore_f1' matched no column.
metrics = ['bleu', 'chrf', 'rouge1-f1', 'rouge2-f1', 'rougeL-f1', 'bertscore-f1']
# Ascending sort, so the rows with the lowest ROUGE-1 F1 come first.
df2 = df.sort_values('rouge1-f1')
# df2 = df2[['premise', 'hypothesis', 'label', 'Predictions']]
# Show full cell contents instead of truncating long explanation strings.
pd.set_option('display.max_colwidth', None)
df2.head(10)

# Labels:
#       0 = entailment
#       1 = neutral
#       2 = contradiction


Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,Predictions,Target,bleu,chrf,...,rouge1-f1,rouge2-precision,rouge2-recall,rouge2-f1,rougeL-precision,rougeL-recall,rougeL-f1,bertscore-precision,bertscore-recall,bertscore-f1
0,This church choir sings to the masses as they sing joyous songs from the book at a church.,The church has cracks in the ceiling.,1,Not all churches have cracks in the ceiling,There is no indication that there are cracks in the ceiling of the church.,Not all churches have cracks in the ceiling.,church choir sings to the masses as they sing joyous songs from the book at a church.,neutral Not all churches have cracks in the ceiling,0.176471,0.260447,...,0.193548,0.0,0.0,0.0,0.176471,0.214286,0.193548,0.724079,0.736961,0.730463
93,People are conversing at a dining table under a canopy.,People are talking underneath a covering.,0,People conversing must be talking.,Canopy is a kind of covering,Conversing and talking mean the same thins as canopy and covering mean the same thing.,People are conversing at a dining table under a canopy.,entailment People conversing must be talking.,0.2,0.481583,...,0.266667,0.0,0.0,0.0,0.2,0.4,0.266667,0.812103,0.885219,0.847086
80,"Three people sit on a bench at a station, the man looks oddly at the two women, the redheaded women looks up and forward in an awkward position, and the yellow blond girl twiddles with her hair.",Some people stand around.,2,The people either sit or stand.,People can either sit or stand.,people can't stand if they sit,three people sit on a bench at a station,contradiction The people either sit or stand.,0.222222,0.349547,...,0.266667,0.0,0.0,0.0,0.222222,0.333333,0.266667,0.751901,0.791321,0.771107
57,A man standing in front of a building on the phone as two men to the side pain on the side.,two girls walk through a hall,2,It must be either a man or two girls,A man is not a girl. You cannot be standing and walk at the same time.,There is either one man or two girls.,A man standing in front of a building on the phone as two men to the side pain on the side.,contradiction It must be either a man or two girls,0.238095,0.267063,...,0.27027,0.05,0.125,0.071429,0.190476,0.25,0.216216,0.693725,0.739181,0.715732
30,3 young man in hoods standing in the middle of a quiet street facing the camera.,Three people sit by a busy street bareheaded.,2,One cannot be standing and sitting at the same time. There can either be a quiet street or a busy street.,People can not sit and be standing simultaneously while being in hoods means they are not bareheaded.,There cannot be a quiet street and busy street at the same time.,Three people sit by a busy street bareheaded.,contradiction One cannot be standing and sitting at the same time. There can either be a quiet street or a busy street.,0.200723,0.257634,...,0.285714,0.285714,0.1,0.148148,0.375,0.230769,0.285714,0.805641,0.774545,0.789787
81,"Three people sit on a bench at a station, the man looks oddly at the two women, the redheaded women looks up and forward in an awkward position, and the yellow blond girl twiddles with her hair.",People run together.,2,The people cannot both sit and run simultaneously,"The poeple sit on a bench, which means that they don't run together.",All three people are sit ting on the bench and not running,"three people sit on a bench, the man looks oddly at the two women, the redheaded women looks up and forward in an awkward position, and the yellow blond girl twiddles with her hair.",contradiction The people cannot both sit and run simultaneously,0.176471,0.374494,...,0.304348,0.090909,0.230769,0.130435,0.176471,0.5,0.26087,0.752477,0.797035,0.774115
19,A blond-haired doctor and her African american assistant looking threw new medical manuals.,A man is eating pb and j,2,Looking through medical manuals implies that man is not eating pb and j.,A doctor and assistant are two people looking through manuals is different than a man is eating a sandwich.,There cannot be one man if there is a doctor and an assistant.,blond-haired doctor and her African american assistant looking threw new medical manuals.,contradiction Looking through medical manuals implies that man is not eating pb and j.,0.186012,0.433081,...,0.3125,0.083333,0.083333,0.083333,0.384615,0.263158,0.3125,0.772808,0.767695,0.770243
3,"A woman with a green headscarf, blue shirt and a very big grin.",The woman is young.,1,the woman could've been old rather than young,There is no indication that the woman is young.,Not all women are young.,A woman with a big grin is not necessarily young.,neutral the woman could've been old rather than young,0.3,0.304097,...,0.315789,0.0,0.0,0.0,0.3,0.333333,0.315789,0.800086,0.888378,0.841923
43,Male in a blue jacket decides to lay in the grass.,The guy wearing a blue jacket is laying on the green grass,0,Male is guy.,GRASSES WILL BE GREEN AND THE MAN IN A BLUE JACKET IS LAYING IN IT,A male is a guy in the grass,Grass is usually green.,entailment Male is guy.,0.25,0.188842,...,0.333333,0.0,0.0,0.0,0.25,0.333333,0.285714,0.821753,0.708215,0.760771
23,"One tan girl with a wool hat is running and leaning over an object, while another person in a wool hat is sitting on the ground.",A man watches his daughter leap,1,The man and girl are in relation to each other and not strangers.,Just because he's sitting on the ground doesn't mean the man watches his daughter.,Just because the man and a girl are wearing matching wool hats doesn't mean that she is the man's daughter.,"girl is running and leaning over an object, while another person in a wool hat is sitting on the ground.",neutral The man and girl are in relation to each other and not strangers.,0.3,0.346162,...,0.333333,0.157895,0.2,0.176471,0.3,0.272727,0.285714,0.791718,0.756674,0.7738


In [259]:
# Row with the lowest BLEU score (kept as a one-row DataFrame).
df.loc[[df['bleu'].idxmin()]]

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,Predictions,Target,bleu,chrf,...,rouge1-f1,rouge2-precision,rouge2-recall,rouge2-f1,rougeL-precision,rougeL-recall,rougeL-f1,bertscore-precision,bertscore-recall,bertscore-f1
89,A dog jumping for a Frisbee in the snow.,A pet is enjoying a game of fetch with his owner.,1,The pet can also play a game in the snow.,"A dog need not be a pet. A dog jumping for a Frisbee need niether be participating in willful play, nor enjoying it. Anyone can toss a Frisbee in the direction of an animal and need not be its owner.",Not all dogs have an owner.,A dog jumping for a Frisbee is not necessarily enjoying a game of fetch with his owner.,neutral The pet can also play a game in the snow.,0.167251,0.260185,...,0.385965,0.3125,0.128205,0.181818,0.588235,0.25,0.350877,0.88396,0.820019,0.85079


In [260]:
# Row with the lowest chrF score (kept as a one-row DataFrame).
df.loc[[df['chrf'].idxmin()]]

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,Predictions,Target,bleu,chrf,...,rouge1-f1,rouge2-precision,rouge2-recall,rouge2-f1,rougeL-precision,rougeL-recall,rougeL-f1,bertscore-precision,bertscore-recall,bertscore-f1
51,A woman wearing a ball cap squats down to touch the cracked earth.,A squatting woman wearing a hat touching the ground.,0,"Cracked earth is a form of the ground, and a ball cap is a type of hat.","Squats is the same as squatting, and cracked earth is the same as the ground.","Squatting woman is a type of woman, and ball cap is a type of hat.",A ball cap is not a hat.,"entailment Cracked earth is a form of the ground, and a ball cap is a type of hat.",0.22779,0.188563,...,0.545455,0.5,0.1875,0.272727,0.857143,0.4,0.545455,0.917971,0.832028,0.872889


In [261]:
# Row with the lowest ROUGE-1 F1 (kept as a one-row DataFrame).
df.loc[[df['rouge1-f1'].idxmin()]]

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,Predictions,Target,bleu,chrf,...,rouge1-f1,rouge2-precision,rouge2-recall,rouge2-f1,rougeL-precision,rougeL-recall,rougeL-f1,bertscore-precision,bertscore-recall,bertscore-f1
0,This church choir sings to the masses as they sing joyous songs from the book at a church.,The church has cracks in the ceiling.,1,Not all churches have cracks in the ceiling,There is no indication that there are cracks in the ceiling of the church.,Not all churches have cracks in the ceiling.,church choir sings to the masses as they sing joyous songs from the book at a church.,neutral Not all churches have cracks in the ceiling,0.176471,0.260447,...,0.193548,0.0,0.0,0.0,0.176471,0.214286,0.193548,0.724079,0.736961,0.730463


In [262]:
# Row with the lowest ROUGE-2 F1 (kept as a one-row DataFrame); idxmin returns
# the first such row when several share the minimum.
df.loc[[df['rouge2-f1'].idxmin()]]

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,Predictions,Target,bleu,chrf,...,rouge1-f1,rouge2-precision,rouge2-recall,rouge2-f1,rougeL-precision,rougeL-recall,rougeL-f1,bertscore-precision,bertscore-recall,bertscore-f1
0,This church choir sings to the masses as they sing joyous songs from the book at a church.,The church has cracks in the ceiling.,1,Not all churches have cracks in the ceiling,There is no indication that there are cracks in the ceiling of the church.,Not all churches have cracks in the ceiling.,church choir sings to the masses as they sing joyous songs from the book at a church.,neutral Not all churches have cracks in the ceiling,0.176471,0.260447,...,0.193548,0.0,0.0,0.0,0.176471,0.214286,0.193548,0.724079,0.736961,0.730463


In [263]:
# Row with the lowest ROUGE-L F1 (kept as a one-row DataFrame).
df.loc[[df['rougeL-f1'].idxmin()]]

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,Predictions,Target,bleu,chrf,...,rouge1-f1,rouge2-precision,rouge2-recall,rouge2-f1,rougeL-precision,rougeL-recall,rougeL-f1,bertscore-precision,bertscore-recall,bertscore-f1
0,This church choir sings to the masses as they sing joyous songs from the book at a church.,The church has cracks in the ceiling.,1,Not all churches have cracks in the ceiling,There is no indication that there are cracks in the ceiling of the church.,Not all churches have cracks in the ceiling.,church choir sings to the masses as they sing joyous songs from the book at a church.,neutral Not all churches have cracks in the ceiling,0.176471,0.260447,...,0.193548,0.0,0.0,0.0,0.176471,0.214286,0.193548,0.724079,0.736961,0.730463


In [264]:
# Row with the lowest BERTScore F1 (kept as a one-row DataFrame).
df.loc[[df['bertscore-f1'].idxmin()]]

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,Predictions,Target,bleu,chrf,...,rouge1-f1,rouge2-precision,rouge2-recall,rouge2-f1,rougeL-precision,rougeL-recall,rougeL-f1,bertscore-precision,bertscore-recall,bertscore-f1
57,A man standing in front of a building on the phone as two men to the side pain on the side.,two girls walk through a hall,2,It must be either a man or two girls,A man is not a girl. You cannot be standing and walk at the same time.,There is either one man or two girls.,A man standing in front of a building on the phone as two men to the side pain on the side.,contradiction It must be either a man or two girls,0.238095,0.267063,...,0.27027,0.05,0.125,0.071429,0.190476,0.25,0.216216,0.693725,0.739181,0.715732
