# Data visualisation

In the Notebook we are looking to explore the HANNA data base.

In [11]:
# importations
import csv
import json
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pingouin as pg 
from scipy.special import binom
import scipy.stats as st
from scipy.stats import pearsonr, spearmanr, kendalltau
import seaborn as sns
from sklearn.metrics import f1_score
from utils import *

## Loading the data

In [14]:
data_annotations = pd.read_csv("hanna_stories_annotations.csv")

There are 96 different prompts and thus for each model there are 96 stories.

In [15]:
data_annotations.head()

Unnamed: 0,Story ID,Prompt,Human,Story,Model,Relevance,Coherence,Empathy,Surprise,Engagement,Complexity,Worker ID,Assignment ID,Work time in seconds,Name
0,0,When you die the afterlife is an arena where y...,"3,000 years have I been fighting. Every mornin...","3,000 years have I been fighting. Every mornin...",Human,4,4,3,2,4,4,A2VE5IV9OD2SK1,3X87C8JFVHIT235KQ4UTS8264I6SQJ,579.0,
1,0,When you die the afterlife is an arena where y...,"3,000 years have I been fighting. Every mornin...","3,000 years have I been fighting. Every mornin...",Human,5,5,1,3,4,1,A1IZ4NX41GKU4X,3DR23U6WEGL5K0SU6D4J8W9EM9LTE7,82.0,none
2,0,When you die the afterlife is an arena where y...,"3,000 years have I been fighting. Every mornin...","3,000 years have I been fighting. Every mornin...",Human,2,2,3,2,2,3,A264NN7JBX4UDQ,3UJ1CZ6IZSW49HMM6C6QUX7F7UV5SA,273.0,none
3,1,A new law is enacted that erases soldiers memo...,"“Dad, you 're on TV again !” I heard Eric 's v...","“Dad, you 're on TV again !” I heard Eric 's v...",Human,5,5,3,4,4,4,A3CFNUD7VR2E1E,317HQ483IIZJ5SPW508YKC1EP6RINX,117.0,Eric
4,1,A new law is enacted that erases soldiers memo...,"“Dad, you 're on TV again !” I heard Eric 's v...","“Dad, you 're on TV again !” I heard Eric 's v...",Human,5,4,4,4,4,4,A2VE5IV9OD2SK1,3T3IWE1XGHUUH3IZF4ZJ2DYS57WQTT,751.0,Eric


The different model used are as follow:

In [17]:
data_annotations['Model'].unique()

array(['Human', 'BertGeneration', 'CTRL', 'GPT', 'GPT-2 (tag)', 'GPT-2',
       'RoBERTa', 'XLNet', 'Fusion', 'HINT', 'TD-VAE'], dtype=object)

In [18]:
data_scores = pd.read_csv("hanna_metric_scores.csv")

In [20]:
data_scores.head()

Unnamed: 0,Model,Relevance,Coherence,Empathy,Surprise,Engagement,Complexity,BLEU Ξ§,ROUGE-1 Recall Ξ§,ROUGE-1 Precision Ξ§,...,Repetition-2 ¤§,Repetition-3 ¤§,SUPERT-PS ¤ε,SUPERT-SS ¤ε,BLANC-Tune-PS ¤Δ,BLANC-Help-PS ¤Δ,BLANC-Tune-SS ¤Δ,BLANC-Help-SS ¤Δ,BARTScore-PS ¤Δ,BARTScore-SP ¤Δ
0,Human,"[3.6666666666666665, 5.0, 4.666666666666667, 3...","[3.6666666666666665, 4.666666666666667, 4.6666...","[2.3333333333333335, 4.0, 4.0, 3.0, 3.0, 3.333...","[2.3333333333333335, 3.6666666666666665, 4.333...","[3.3333333333333335, 3.6666666666666665, 4.0, ...","[2.6666666666666665, 4.0, 4.333333333333333, 4...","[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",...,"[0.10407239819004525, 0.1198501872659176, 0.07...","[0.029288702928870293, 0.03691275167785235, 0....","[0.36060585433665243, 0.30589759827771107, 0.2...","[0.3513956792291503, 0.3298352311450941, 0.273...","[0.0, 0.040983606557377046, -0.011111111111111...","[-0.02608695652173913, 0.0, -0.011111111111111...","[0.058823529411764705, -0.14285714285714285, 0...","[-0.23529411764705882, -0.14285714285714285, 0...","[-4.046356201171875, -4.839603424072266, -4.85...","[-4.624497413635254, -6.575280666351318, -4.77..."
1,BertGeneration,"[1.6666666666666667, 3.0, 1.3333333333333333, ...","[3.6666666666666665, 2.6666666666666665, 2.333...","[2.6666666666666665, 2.6666666666666665, 1.0, ...","[1.3333333333333333, 2.0, 1.6666666666666667, ...","[2.6666666666666665, 2.6666666666666665, 1.333...","[2.6666666666666665, 2.6666666666666665, 1.333...","[1.4336111822, 1.4493826988, 0.1779090484, 0.1...","[0.38249, 0.24074, 0.16285, 0.13276, 0.40645, ...","[0.37054, 0.26639, 0.59259, 0.56771, 0.26923, ...",...,"[0.08888888888888889, 0.0546875, 0.05676855895...","[0.044534412955465584, 0.022641509433962263, 0...","[0.2597054082262463, 0.20473734339818345, 0.24...","[0.3097425349207481, 0.20473734339818345, 0.24...","[-0.01834862385321101, -0.014184397163120567, ...","[-0.03669724770642202, -0.02127659574468085, -...","[-0.058823529411764705, -0.2857142857142857, 0...","[-0.058823529411764705, -0.14285714285714285, ...","[-3.913217067718506, -5.507814407348633, -4.62...","[-4.915536880493164, -6.616286277770996, -5.14..."
2,CTRL,"[2.3333333333333335, 3.6666666666666665, 2.666...","[3.0, 3.3333333333333335, 3.3333333333333335, ...","[2.3333333333333335, 3.3333333333333335, 3.333...","[2.6666666666666665, 1.3333333333333333, 3.0, ...","[2.6666666666666665, 3.3333333333333335, 3.333...","[2.6666666666666665, 3.0, 3.0, 2.3333333333333...","[1.2558030838, 2.775216509, 0.400491848, 0.271...","[0.29954, 0.30741, 0.18702, 0.19367, 0.23871, ...","[0.38922, 0.3547, 0.54444, 0.60456, 0.30833, 0...",...,"[0.042328042328042326, 0.09154929577464789, 0....","[0.01020408163265306, 0.01910828025477707, 0.0...","[0.25552435405490376, 0.1820114012358586, 0.30...","[0.25552435405490376, 0.14368366145385317, 0.2...","[0.0, 0.008333333333333333, -0.011904761904761...","[-0.012345679012345678, -0.008333333333333333,...","[-0.058823529411764705, 0.0, 0.0, 0.0, 0.0, 0....","[-0.058823529411764705, -0.14285714285714285, ...","[-5.204783916473389, -4.947543144226074, -4.69...","[-4.883750915527344, -6.809321403503418, -4.91..."
3,GPT,"[3.6666666666666665, 3.0, 2.6666666666666665, ...","[3.6666666666666665, 3.0, 2.3333333333333335, ...","[3.0, 2.3333333333333335, 2.6666666666666665, ...","[3.0, 2.3333333333333335, 1.3333333333333333, ...","[3.0, 3.0, 2.0, 2.3333333333333335, 3.0, 3.0, ...","[2.6666666666666665, 3.0, 2.3333333333333335, ...","[0.9493520114, 2.3140339956, 0.1505541131, 0.2...","[0.24424, 0.3037, 0.1374, 0.05359, 0.34194, 0....","[0.29944, 0.36771, 0.5567, 0.72131, 0.26633, 0...",...,"[0.075, 0.07623318385650224, 0.137055837563451...","[0.018779342723004695, 0.012195121951219513, 0...","[0.3434531769248544, 0.2605565346424716, 0.309...","[0.3892346136855479, 0.2520803726283153, 0.310...","[0.010638297872340425, -0.017543859649122806, ...","[0.02127659574468085, -0.02631578947368421, 0....","[-0.11764705882352941, -0.14285714285714285, 0...","[-0.058823529411764705, -0.14285714285714285, ...","[-4.507972240447998, -4.7182230949401855, -4.2...","[-4.6574273109436035, -6.157495021820068, -4.5..."
4,GPT-2 (tag),"[3.0, 3.3333333333333335, 2.6666666666666665, ...","[3.6666666666666665, 3.0, 5.0, 2.3333333333333...","[2.3333333333333335, 3.3333333333333335, 3.666...","[1.6666666666666667, 1.3333333333333333, 3.333...","[3.6666666666666665, 2.6666666666666665, 4.333...","[2.6666666666666665, 3.3333333333333335, 4.333...","[1.3324076258, 1.7180351642, 0.8185746681, 0.3...","[0.48387, 0.4, 0.28499, 0.21072, 0.3871, 0.344...","[0.2861, 0.28571, 0.53589, 0.50291, 0.20478, 0...",...,"[0.08115183246073299, 0.09814323607427056, 0.0...","[0.014563106796116505, 0.021176470588235293, 0...","[0.22186669336747747, 0.3216385713738949, 0.39...","[0.2316344289888393, 0.3330455481876535, 0.394...","[-0.026595744680851064, -0.03125, -0.019047619...","[-0.03723404255319149, -0.026041666666666668, ...","[0.058823529411764705, -0.14285714285714285, 0...","[0.058823529411764705, -0.2857142857142857, -0...","[-4.3018059730529785, -4.345739364624023, -4.2...","[-5.022391319274902, -5.645732879638672, -4.76..."


In [64]:
relevance_data_human =data_scores[['Relevance']].values[:1]
relevance_data_bert = data_scores[['Relevance']].values[1:2]


We indeed have the relevance metric for 96 stories.

In [73]:
len(json.loads(relevance_data_human.tolist()[0][0]))

IndexError: list index out of range

In order to benchmark the correlation of existing metrics with human scores we have the following steps: 

1.	Collect a dataset: You will need to collect a dataset of texts or documents that you will use for your NLP task. This dataset should be diverse enough to capture different variations of the language and the task at hand.

2.	Annotate the dataset: You will need to annotate the dataset with human scores. This involves having human annotators rate or evaluate the same task that your NLP model is performing. For example, if your NLP task is sentiment analysis, you can have human annotators rate the sentiment of each text or document on a scale of 1 to 5.

3.	Train your NLP model: You will need to train your NLP model on the annotated dataset using the existing metrics that you want to benchmark. For example, you can use accuracy, precision, recall, and F1 score as your evaluation metrics.

4.	Evaluate your NLP model: Once your model is trained, you can evaluate its performance on the same dataset using the existing metrics. This will give you an idea of how well your model is performing according to the current evaluation standards.

5.	Calculate the correlation: You can then calculate the correlation between the human scores and the existing metrics. You can use statistical methods such as Pearson correlation coefficient or Spearman's rank correlation coefficient to calculate the correlation.

6.	Analyze the results: Analyze the results to understand how well the existing metrics are correlated with human scores. If there is a high correlation, it indicates that the existing metrics are effective in evaluating the NLP model's performance. If there is a low correlation, it suggests that the existing metrics may not be sufficient and may need to be improved or replaced.





Here we are trying to evaluate the steps 5 and 6.

### Using the Pearson method

Calculate the covariance between the human scores and the existing metric. This can be done using the formula:

covariance = Σ[(human score - mean human score) * (metric - mean metric)] / (n - 1)

Where Σ represents the sum of the values for each story, n is the number of stories, and mean human score and mean metric are the means of the human scores and existing metric, respectively.

Calculate the Pearson correlation coefficient between the human scores and the existing metric. This can be done using the formula:

r = covariance / (standard deviation of human scores * standard deviation of existing metric)

In [66]:
# calculate the Pearson correlation coefficient and p-value
corr_coef, p_value = pearsonr(json.loads(relevance_data_human.tolist()[0][0]), json.loads(relevance_data_bert.tolist()[0][0]))


In [68]:
# print the results
print("Pearson correlation coefficient for the BertGeneration model and the relevance metric:", corr_coef)
print("p-value:", p_value)

Pearson correlation coefficient for the BertGeneration model and the relevance metric: -0.029086742829064545
p-value: 0.7784672746915751


### story-level correlation

In [69]:
# list to store the story level correlations
story_correlations = []
human_scores= json.loads(relevance_data_human.tolist()[0][0])
existing_metric= json.loads(relevance_data_bert.tolist()[0][0])

# iterate over each story
for i in range(len(human_scores)):
    # calculate the Pearson correlation coefficient and p-value for this story
    corr_coef, p_value = pearsonr(human_scores[i], existing_metric[i])
    # append the correlation coefficient to the list of story level correlations
    story_correlations.append(corr_coef)

# print the story level correlations
print("Story level correlations:", story_correlations)

TypeError: object of type 'float' has no len()