# Dataset Load

In [1]:
import json 
import copy

# Read the annotation records from disk.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# default text encoding (JSON files are conventionally UTF-8).
with open('fed_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [8]:
# Display the first loaded record to inspect its keys
# (the captured output shows 'context', 'response', 'system' and 'annotations').
data[0]

{'context': "User: Hi!\nSystem: Hi! What's up?\nUser: Nothing much, how about you\nSystem: Not much either.\nUser: What are you doing\nSystem: Playing Terraria. What about you?\nUser: Sitting in a meeting\nSystem: What kind of meeting?\nUser: Can't say",
 'response': "System: It's probably boring, isn't it?",
 'system': 'Meena',
 'annotations': {'Interesting': [2, 1, 1, 1, 1],
  'Engaging': [2, 1, 2, 2, 2],
  'Specific': [1, 1, 2, 2, 1],
  'Relevant': [2, 1, 2, 2, 2],
  'Correct': [2, 2, 1, 2, 2],
  'Semantically appropriate': [2, 1, 2, 2, 2],
  'Understandable': [1, 1, 1, 1, 1],
  'Fluent': [2, 1, 2, 2, 2],
  'Overall': [2, 1, 3, 3, 4]}}

In [7]:
# Split dataset into turn-level and dialog-level
# A record that has a 'response' key is treated as a turn-level sample; a record
# without one is treated as a dialog-level sample. The key is tested explicitly
# instead of catching a broad Exception, so unrelated errors are not silently
# misread as "this is a dialog-level record".
Meena_cnt=0
Human_cnt=0
Mitsuki_cnt=0
Meena_session_cnt=0
Human_session_cnt=0
Mitsuki_session_cnt=0
dialog_cnt=0
turn_level = []
dialog_level = []

for x in data:
    system = x['system']

    if 'response' in x:
        # Turn-level sample: count it under the system that produced the response.
        if system == 'Meena':
            Meena_cnt += 1
        elif system == 'Human':
            Human_cnt += 1
        elif system == 'Mitsuku':
            Mitsuki_cnt += 1
        else:
            # Unknown system label: show the record so it can be inspected.
            print(x)
        turn_level.append(x)
    else:
        # Dialog-level sample (whole conversation, no single response).
        dialog_cnt += 1
        if system == 'Meena':
            Meena_session_cnt += 1
        elif system == 'Human':
            Human_session_cnt += 1
        elif system == 'Mitsuku':
            Mitsuki_session_cnt += 1
        else:
            print('session cnt', x)
        dialog_level.append(x)

print('Turn-level', Human_cnt, Meena_cnt, Mitsuki_cnt)
print('Dialog-level', Human_session_cnt, Meena_session_cnt, Mitsuki_session_cnt)

Turn-level 123 120 132
Dialog-level 41 40 44


# Performance of the systems based on annotators' evaluations

In [156]:
# Prepare the empty result containers: one list per (quality, system) pair
turn_attributes = [name for name in turn_level[0]['annotations']]
turn_evaluation = {
    quality: {system: [] for system in ("Human", "Meena", "Mitsuku")}
    for quality in turn_attributes
}
print(turn_attributes)
print()
print(turn_evaluation)

['Interesting', 'Engaging', 'Specific', 'Relevant', 'Correct', 'Semantically appropriate', 'Understandable', 'Fluent', 'Overall']

{'Interesting': {'Human': [], 'Meena': [], 'Mitsuku': []}, 'Engaging': {'Human': [], 'Meena': [], 'Mitsuku': []}, 'Specific': {'Human': [], 'Meena': [], 'Mitsuku': []}, 'Relevant': {'Human': [], 'Meena': [], 'Mitsuku': []}, 'Correct': {'Human': [], 'Meena': [], 'Mitsuku': []}, 'Semantically appropriate': {'Human': [], 'Meena': [], 'Mitsuku': []}, 'Understandable': {'Human': [], 'Meena': [], 'Mitsuku': []}, 'Fluent': {'Human': [], 'Meena': [], 'Mitsuku': []}, 'Overall': {'Human': [], 'Meena': [], 'Mitsuku': []}}


In [None]:
# Data preprocessing
import numpy as np 
import copy

def remove_furthest(scores):
    """Drop the single score that lies furthest from the mean, if it is an outlier.

    The mean and the (population) standard deviation of ``scores`` are computed
    and rounded to three decimals. The scores are then ordered by decreasing
    distance from the mean; if the furthest one is more than half a standard
    deviation away from the mean it is removed.

    Args:
        scores: sequence of numeric scores. It is never modified.

    Returns:
        A new list holding the scores ordered from furthest to closest to the
        mean (ties keep their input order), with at most one element removed.
        An empty input yields an empty list.
    """
    # Empty input: nothing to remove. Returning early also avoids calling
    # np.mean/np.std on an empty sequence, which yields NaN plus a RuntimeWarning.
    if len(scores) == 0:
        return []

    mean = round(np.mean(scores), 3)
    std = round(np.std(scores), 3)

    # sorted() builds a new list, so the caller's sequence stays untouched.
    by_distance = sorted(scores, key=lambda x: abs(x - mean), reverse=True)

    # The list is ordered by decreasing distance, so only its first element can
    # be "the furthest"; if it is within the threshold, every other score is too.
    if abs(by_distance[0] - mean) > std / 2:
        del by_distance[0]
    return by_distance


# Rebuild the empty containers, then fill them with one aggregated score per turn.
turn_attributes = [name for name in turn_level[0]['annotations']]
turn_evaluation = {
    quality: {system: [] for system in ("Human", "Meena", "Mitsuku")}
    for quality in turn_attributes
}

for turn in turn_level:
    for attr in turn_attributes:
        # Keep only integer labels; annotators sometimes answered with free text.
        numeric_scores = [label for label in turn['annotations'][attr] if isinstance(label, int)]
        # Discard the label furthest from the mean (if it is an outlier), then average.
        kept_scores = remove_furthest(numeric_scores)
        turn_evaluation[attr][turn['system']].append(np.mean(kept_scores))

turn_evaluation

In [45]:
# Sanity check: how many turn scores were collected per quality and system
for quality in turn_attributes:
    per_system = turn_evaluation[quality]
    for system_name in ("Human", "Meena", "Mitsuku"):
        print(quality, system_name, len(per_system[system_name]))

Interesting Human 123
Interesting Meena 120
Interesting Mitsuku 132
Engaging Human 123
Engaging Meena 120
Engaging Mitsuku 132
Specific Human 123
Specific Meena 120
Specific Mitsuku 132
Relevant Human 123
Relevant Meena 120
Relevant Mitsuku 132
Correct Human 123
Correct Meena 120
Correct Mitsuku 132
Semantically appropriate Human 123
Semantically appropriate Meena 120
Semantically appropriate Mitsuku 132
Understandable Human 123
Understandable Meena 120
Understandable Mitsuku 132
Fluent Human 123
Fluent Meena 120
Fluent Mitsuku 132
Overall Human 123
Overall Meena 120
Overall Mitsuku 132


In [132]:
# Performance (Avg score of annotators' scores)
# Turn-level
import pandas as pd

# Build one table row per quality holding the mean score of each system.
# The systems are looked up by name so each value is guaranteed to land in the
# matching column, and the loop no longer binds a variable called `model`
# (the scoring cells use a global `model` for the FED model; rebinding it here
# would break them when cells are re-run).
df = pd.DataFrame(columns=['Quality', 'Human', 'Meena', 'Mitsuku'])
for idx, (attr, system_scores) in enumerate(turn_evaluation.items(), start=1):
    # Every quality except 'Understandable' and 'Consistent' is shifted up by one.
    # NOTE(review): presumably this moves 0-based labels onto a 1-based scale — confirm.
    offset = 0 if attr in ('Understandable', 'Consistent') else 1
    df.loc[idx] = [attr] + [
        np.mean(system_scores[system_name]) + offset
        for system_name in ('Human', 'Meena', 'Mitsuku')
    ]

df[['Quality', 'Mitsuku', 'Meena', 'Human']]


Unnamed: 0,Quality,Mitsuku,Meena,Human
1,Interesting,2.121212,2.41875,2.713415
2,Engaging,2.331439,2.58125,2.837398
3,Specific,2.371843,2.558333,2.821138
4,Relevant,2.535354,2.922917,2.898374
5,Correct,2.419192,2.884722,2.894309
6,Semantically appropriate,2.679924,2.952083,2.896341
7,Understandable,0.958333,1.0,0.987805
8,Fluent,2.8125,2.95625,2.832656
9,Overall,3.285985,4.179167,4.321138


In [133]:
# Performance (Avg score of annotators' scores)
# Turn-level
import pandas as pd

# NOTE(review): this cell repeats the turn-level table cell directly above it
# verbatim and produces the same output; consider deleting one of the two.
#
# Build one table row per quality holding the mean score of each system.
# The systems are looked up by name so each value is guaranteed to land in the
# matching column, and the loop no longer binds a variable called `model`
# (the scoring cells use a global `model` for the FED model; rebinding it here
# would break them when cells are re-run).
df = pd.DataFrame(columns=['Quality', 'Human', 'Meena', 'Mitsuku'])
for idx, (attr, system_scores) in enumerate(turn_evaluation.items(), start=1):
    # Every quality except 'Understandable' and 'Consistent' is shifted up by one.
    # NOTE(review): presumably this moves 0-based labels onto a 1-based scale — confirm.
    offset = 0 if attr in ('Understandable', 'Consistent') else 1
    df.loc[idx] = [attr] + [
        np.mean(system_scores[system_name]) + offset
        for system_name in ('Human', 'Meena', 'Mitsuku')
    ]

df[['Quality', 'Mitsuku', 'Meena', 'Human']]


Unnamed: 0,Quality,Mitsuku,Meena,Human
1,Interesting,2.121212,2.41875,2.713415
2,Engaging,2.331439,2.58125,2.837398
3,Specific,2.371843,2.558333,2.821138
4,Relevant,2.535354,2.922917,2.898374
5,Correct,2.419192,2.884722,2.894309
6,Semantically appropriate,2.679924,2.952083,2.896341
7,Understandable,0.958333,1.0,0.987805
8,Fluent,2.8125,2.95625,2.832656
9,Overall,3.285985,4.179167,4.321138


In [136]:
# Calculate performance (Avg score of annotators' scores)
# Dialog -level

import numpy as np 
# One list of aggregated scores per (dialog quality, system) pair
dialog_attributes = [name for name in dialog_level[0]['annotations']]
dialog_evaluation = {
    quality: {system: [] for system in ("Human", "Meena", "Mitsuku")}
    for quality in dialog_attributes
}

for dialog in dialog_level:
    for attr in dialog_attributes:
        scores = dialog['annotations'][attr]
        # Keep only integer labels; free-text answers are dropped.
        numeric_scores = [label for label in scores if isinstance(label, int)]
        removed_furthest = remove_furthest(numeric_scores)
        if not removed_furthest:
            # Every annotator answered with text only: report it and fall back to a score of 2.
            print('No score case: ', scores, removed_furthest)
            removed_furthest = [2]
        dialog_evaluation[attr][dialog['system']].append(np.mean(removed_furthest))





No score case:  ['N/A (no mistakes made)', 'N/A (The system did not make any errors.)', 'N/A (There were no errors)', 'N/A (No real errors to recover from)', 'N/A (no errors)'] []


In [137]:
# Finalize outputs
# Build one table row per dialog quality holding the mean score of each system.
# The systems are looked up by name so each value is guaranteed to land in the
# matching column, and the loop no longer binds a variable called `model`
# (the scoring cells use a global `model` for the FED model; rebinding it here
# would break them when cells are re-run).
df = pd.DataFrame(columns=['Quality', 'Human', 'Meena', 'Mitsuku'])
for idx, (attr, system_scores) in enumerate(dialog_evaluation.items(), start=1):
    # adjust score range (0~2 -> 1~3) for every quality except
    # 'Understandable' and 'Consistent', which are left unchanged
    offset = 0 if attr in ('Understandable', 'Consistent') else 1
    df.loc[idx] = [attr] + [
        np.mean(system_scores[system_name]) + offset
        for system_name in ('Human', 'Meena', 'Mitsuku')
    ]

df[['Quality', 'Mitsuku', 'Meena', 'Human']]

Unnamed: 0,Quality,Mitsuku,Meena,Human
1,Coherent,2.193182,2.8875,2.945122
2,Error recovery,2.219697,2.697917,2.869919
3,Consistent,0.886364,0.96875,0.987805
4,Diverse,2.255682,2.48125,2.884146
5,Depth,1.795455,2.25,2.780488
6,Likeable,2.130682,2.6375,2.969512
7,Understanding,2.227273,2.8625,2.981707
8,Flexible,2.238636,2.70625,2.969512
9,Informative,2.085227,2.5875,2.853659
10,Inquisitive,2.335227,2.76875,2.884146


# Correlation 

In [None]:
# Preprocessing Human annotations

import numpy as np 
# One aggregated annotator score per turn for each quality (no per-system split).
# Note that this rebinds `turn_evaluation` to a flat {quality: [scores]} mapping.
turn_attributes = [name for name in turn_level[0]['annotations']]
turn_evaluation = {quality: [] for quality in turn_attributes}

for turn in turn_level:
    for quality, collected in turn_evaluation.items():
        # Integer labels only, minus the label furthest from the mean.
        numeric_scores = [label for label in turn['annotations'][quality] if isinstance(label, int)]
        kept_scores = remove_furthest(numeric_scores)
        # All qualities except 'Understandable' and 'Consistent' are shifted up by one.
        shift = 0 if quality in ('Understandable', 'Consistent') else 1
        collected.append(np.mean(kept_scores) + shift)
turn_evaluation


In [59]:
# Load model
import fed

# Call fed.load_models with the DialoGPT-large checkpoint name and unpack the
# result into the `model` and `tokenizer` globals, which the scoring cells pass
# on to fed.evaluate.
# NOTE(review): presumably this loads pretrained weights and may be slow — confirm.
model, tokenizer = fed.load_models("microsoft/DialoGPT-large")




In [None]:
# Model FU Score
from tqdm import tqdm

def preprocessing_conversation(context: dict) -> str:
    """Flatten one turn-level record into a single string for scoring.

    Args:
        context: a record with a 'context' entry (utterances separated by
            newlines, each prefixed with 'User:' or 'System:') and a
            'response' entry (prefixed with 'System:').

    Returns:
        The context utterances joined by single spaces, followed by a space and
        the response, with every speaker prefix replaced by '<|endoftext|>'.
        In the response only the 'System:' prefix is replaced.
    """
    separator = "<|endoftext|>"

    marked_utterances = []
    for utter in context['context'].split('\n'):
        marked_utterances.append(utter.replace('User:', separator).replace('System:', separator))
    preproc_context = " ".join(marked_utterances)

    response = context['response'].replace('System:', separator)

    return f"{preproc_context} {response}"

def save(f, kwarg):
    """Append one record to an open JSON Lines file.

    The record is serialized compactly on a single line, followed by a newline,
    so the resulting ``.jsonl`` file holds exactly one JSON value per line and
    can be parsed line by line. Non-ASCII characters are written as-is
    (``ensure_ascii=False``).

    Args:
        f: writable text file object.
        kwarg: JSON-serializable record; must be non-empty.

    Raises:
        AssertionError: if ``kwarg`` is empty or otherwise falsy.
    """
    # Raised explicitly (rather than via `assert`) so the check also runs when
    # Python is started with -O; the exception type is unchanged.
    if not kwarg:
        raise AssertionError("No contents")

    f.write(json.dumps(kwarg, ensure_ascii=False))
    f.write("\n")

# get fed scores
# One list of model scores per quality; 'Overall' is excluded from the model scores.
turn_model_scores = {attribute: [] for attribute in turn_attributes if attribute != 'Overall'}
output_file = 'turn_level_FU_scores.jsonl'

# The log is opened as UTF-8 explicitly: save() writes with ensure_ascii=False,
# so a non-ASCII character in a conversation could otherwise fail to encode
# under a platform-default encoding.
with open(output_file, "w", encoding="utf-8") as ofile:

    for idx, t in enumerate(tqdm(turn_level)):
        # Reuse the helper defined above instead of repeating its replace/join logic inline.
        conversation_for_eval = preprocessing_conversation(t)
        scores = fed.evaluate(conversation_for_eval,
                              model,
                              tokenizer)

        # The score dict is indexed by the lower-cased quality name.
        for attr in turn_model_scores.keys():
            turn_model_scores[attr].append(scores[attr.lower()])

        # logs for checking inputs & outputs
        save(ofile, {'idx': idx, 'level': 'Turn-level', 'input': conversation_for_eval, "score": scores})
turn_model_scores

In [151]:
# Save result
import json
# Model scores per quality for the turn-level samples.
with open("turn_level_model_score_list.json", "w") as json_file:
    json.dump(turn_model_scores, json_file)

# Annotator scores for the turn-level samples. This writes `turn_evaluation`;
# the previous code dumped `dialog_evaluation` here, which put dialog-level
# data into the turn-level file.
with open("turn_level_annotators_scores.json", "w") as json_file2:
    json.dump(turn_evaluation, json_file2, indent=4)


In [147]:
# Calculate correlation and preprocess it for an output format
import pandas as pd 
from scipy import stats

# One row per quality: Pearson and Spearman correlation between the aggregated
# annotator scores and the model scores.
df = pd.DataFrame(columns=['Quality', 'Pearson-Corr', 'Spearman-Corr'])
for idx, attr in enumerate(turn_model_scores):
    human_scores = turn_evaluation[attr]
    model_scores = turn_model_scores[attr]

    spearman_result = stats.spearmanr(human_scores, model_scores)
    print(spearman_result)  # full result, including the p-value

    p_corr, _ = stats.pearsonr(human_scores, model_scores)
    s_corr, _ = spearman_result
    df.loc[idx] = [attr, p_corr, s_corr]
df

SpearmanrResult(correlation=0.3126494294346008, pvalue=6.013519364186905e-10)
SpearmanrResult(correlation=0.15131290893596858, pvalue=0.0033107299575460295)
SpearmanrResult(correlation=0.1933698301956955, pvalue=0.0001647519959582658)
SpearmanrResult(correlation=0.15254449277498106, pvalue=0.003061442563010448)
SpearmanrResult(correlation=0.16028888712516604, pvalue=0.0018471022318570082)
SpearmanrResult(correlation=0.04570597460483709, pvalue=0.37745303519693696)
SpearmanrResult(correlation=0.024830238839847352, pvalue=0.6317222591672215)
SpearmanrResult(correlation=0.011663739703123023, pvalue=0.8218851773786329)


Unnamed: 0,Quality,Pearson-Corr,Spearman-Corr
0,Interesting,0.279838,0.312649
1,Engaging,0.143199,0.151313
2,Specific,0.182277,0.19337
3,Relevant,0.146903,0.152544
4,Correct,0.175289,0.160289
5,Semantically appropriate,0.083279,0.045706
6,Understandable,-0.027571,0.02483
7,Fluent,0.038842,0.011664


In [None]:
# Dialog level
import numpy as np 

# One aggregated annotator score per dialog for each quality (no per-system split).
# Note that this rebinds `dialog_evaluation` to a flat {quality: [scores]} mapping.
dialog_attributes = [name for name in dialog_level[0]['annotations']]
dialog_evaluation = {quality: [] for quality in dialog_attributes}

for dialog in dialog_level:
    for quality, collected in dialog_evaluation.items():
        numeric_scores = [label for label in dialog['annotations'][quality] if isinstance(label, int)]
        # When only text answers were given nothing is left; fall back to a score of 2.
        kept_scores = remove_furthest(numeric_scores) or [2]
        # All qualities except 'Understandable' and 'Consistent' are shifted up by one.
        shift = 0 if quality in ('Understandable', 'Consistent') else 1
        collected.append(np.mean(kept_scores) + shift)
dialog_evaluation


In [None]:
# Model FU Score
from tqdm import tqdm

# One list of model scores per dialog quality; 'Overall' is excluded.
dialog_model_scores = {attribute: [] for attribute in dialog_attributes if attribute != 'Overall'}
output_file = 'dialog_level_FU_scores.jsonl'

# The log is opened as UTF-8 explicitly: save() writes with ensure_ascii=False,
# so a non-ASCII character in a conversation could otherwise fail to encode
# under a platform-default encoding.
with open(output_file, "w", encoding="utf-8") as ofile:

    for idx, d in enumerate(tqdm(dialog_level)):
        # Replace the speaker prefixes with the end-of-text token and join the utterances with spaces.
        context = " ".join([utter.replace('User:', "<|endoftext|>").replace('System:', "<|endoftext|>") for utter in d['context'].split('\n')])
        scores = fed.evaluate(context,
                              model,
                              tokenizer)

        for attr in dialog_model_scores.keys():
            # The score dict is indexed by the lower-cased quality name, except
            # 'Understanding', which is read from the 'understand' entry.
            score_key = 'understand' if attr == 'Understanding' else attr.lower()
            dialog_model_scores[attr].append(scores[score_key])

        # logs for checking inputs & outputs
        save(ofile, {'idx': idx, 'level': 'Dialog-level', 'input': context, "score": scores})

dialog_model_scores

In [150]:
#save results
import json

# Model scores per quality for the dialog-level samples.
with open("dialog_level_model_scores.json", "w") as model_out:
    json.dump(obj=dialog_model_scores, fp=model_out)

# Aggregated annotator scores for the dialog-level samples (pretty-printed).
with open("dialog_level_annotators_scores.json", "w") as annotator_out:
    json.dump(obj=dialog_evaluation, fp=annotator_out, indent=4)

In [142]:
# Calculate correlation
import pandas as pd 
from scipy import stats

# One row per dialog quality: Pearson and Spearman correlation between the
# aggregated annotator scores and the model scores.
df2 = pd.DataFrame(columns=['Quality', 'Pearson-Corr', 'Spearman-Corr'])
for idx, attr in enumerate(dialog_model_scores):
    human_scores = dialog_evaluation[attr]
    model_scores = dialog_model_scores[attr]
    p_corr, _ = stats.pearsonr(human_scores, model_scores)
    s_corr, _ = stats.spearmanr(human_scores, model_scores)
    df2.loc[idx] = [attr, p_corr, s_corr]
df2

Unnamed: 0,Quality,Pearson-Corr,Spearman-Corr
0,Coherent,0.073547,0.171332
1,Error recovery,0.025022,0.026153
2,Consistent,-0.043399,0.034835
3,Diverse,0.122052,0.079674
4,Depth,0.098246,0.087264
5,Likeable,0.204696,0.264306
6,Understanding,0.177501,0.173499
7,Flexible,0.042839,0.112024
8,Informative,0.190083,0.249932
9,Inquisitive,0.176394,0.150219
