In [3]:
import matplotlib.pyplot as plt
from data import load_data, save_to_json, load_from_json
from matplotlib.gridspec import GridSpec
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
import pandas as pd
from itertools import zip_longest
from pandas import MultiIndex
import altair as alt

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from math import log
from scipy.stats import kendalltau

# Palette of named colours. No cell visible in this part of the notebook
# references it, so the order is kept exactly as it was.
COLORS = [
    'red',
    'blue',
    'green',
    'orange',
    'purple',
    'cyan',
    'magenta',
    'yellow',
    'black',
    'grey',
    'pink',
]

def avg(l):
    """Return the arithmetic mean of the values in `l`.

    Args:
        l: any iterable of numbers (booleans count as 0/1). Sized containers
            are used as-is; generators and other one-shot iterables are
            materialised first so they can be both summed and counted.

    Returns:
        sum of the values divided by their count (true division).

    Raises:
        ZeroDivisionError: if `l` is empty.
    """
    values = l if hasattr(l, '__len__') else list(l)
    count = len(values)
    if count == 0:
        # Same exception type as the bare division would raise, with a clearer message.
        raise ZeroDivisionError('avg() of an empty sequence')
    return sum(values) / count

def brier_score(pred):
    """Return the mean squared distance between each value in `pred` and 1."""
    squared_errors = []
    for probability in pred:
        squared_errors.append((probability - 1) ** 2)
    return avg(squared_errors)

def log_loss(pred):
    """Return the negated mean natural logarithm of the values in `pred`."""
    log_values = list(map(log, pred))
    mean_log = avg(log_values)
    return -1 * mean_log

def kendall_tau_for_results(model_results):
    """Kendall's tau between 'detection_score' and 'self_preference'.

    `model_results` is a sequence of dicts that each carry both keys; the
    return value is the `correlation` field of scipy's `kendalltau` result.
    """
    recognition = [record['detection_score'] for record in model_results]
    preference = [record['self_preference'] for record in model_results]

    tau_result = kendalltau(recognition, preference)
    return tau_result.correlation

def kendall_tau(x, y):
    """Return the `correlation` field of scipy's `kendalltau(x, y)` result."""
    tau_result = kendalltau(x, y)
    return tau_result.correlation

# Human-readable display names for the model identifiers used as dictionary
# keys and file-name prefixes in this notebook (e.g. for subplot titles).
MODEL_TO_STRING = {
    # Base models and reference authors.
    'claude': 'Claude 2.1',
    'llama': 'LLaMA-2-7b-chat',
    'human': 'Human',
    'gpt4': 'GPT-4 11/06',
    # Previously mislabelled as 'Llama-2-7b-chat'; the name now matches the
    # "FT GPT-3.5 Turbo 11/06" entries below.
    'gpt35': 'GPT-3.5 Turbo 11/06',

    # GPT-3.5 fine-tuned on XSUM.
    'xsum_500_ft_gpt35': '[XSUM] FT GPT-3.5 Turbo 11/06 (500 examples)',
    'xsum_10_ft_gpt35': '[XSUM] FT GPT-3.5 Turbo 11/06 (10 examples)',
    'xsum_2_ft_gpt35': '[XSUM] FT GPT-3.5 Turbo 11/06 (2 examples)',
    'xsum_always_1_ft_gpt35': '[XSUM] FT GPT-3.5 Turbo 11/06 (answers always 1)',
    'xsum_random_ft_gpt35': '[XSUM] FT GPT-3.5 Turbo 11/06 (random answers)',
    'xsum_readability_ft_gpt35': '[XSUM] FT GPT-3.5 Turbo 11/06 (readability)',
    'xsum_length_ft_gpt35': '[XSUM] FT GPT-3.5 Turbo 11/06 (length)',
    'xsum_vowelcount_ft_gpt35': '[XSUM] FT GPT-3.5 Turbo 11/06 (vowel count)',

    # GPT-3.5 fine-tuned on CNN.
    'cnn_500_ft_gpt35': '[CNN] FT GPT-3.5 Turbo 11/06 (500 examples)',
    'cnn_10_ft_gpt35': '[CNN] FT GPT-3.5 Turbo 11/06 (10 examples)',
    'cnn_2_ft_gpt35': '[CNN] FT GPT-3.5 Turbo 11/06 (2 examples)',
    'cnn_always_1_ft_gpt35': '[CNN] FT GPT-3.5 Turbo 11/06 (answers always 1)',
    'cnn_random_ft_gpt35': '[CNN] FT GPT-3.5 Turbo 11/06 (random answers)',
    'cnn_readability_ft_gpt35': '[CNN] FT GPT-3.5 Turbo 11/06 (readability)',
    'cnn_length_ft_gpt35': '[CNN] FT GPT-3.5 Turbo 11/06 (length)',
    'cnn_vowelcount_ft_gpt35': '[CNN] FT GPT-3.5 Turbo 11/06 (vowel count)',

    # Llama-2 fine-tuned on XSUM.
    'xsum_500_ft_llama': '[XSUM] FT Llama-2-7b-chat (500 examples)',
    'xsum_10_ft_llama': '[XSUM] FT Llama-2-7b-chat (10 examples)',
    'xsum_2_ft_llama': '[XSUM] FT Llama-2-7b-chat (2 examples)',
    'xsum_always_1_ft_llama': '[XSUM] FT Llama-2-7b-chat (answers always 1)',
    'xsum_random_ft_llama': '[XSUM] FT Llama-2-7b-chat (random answers)',
    'xsum_readability_ft_llama': '[XSUM] FT Llama-2-7b-chat (readability)',
    'xsum_length_ft_llama': '[XSUM] FT Llama-2-7b-chat (length)',
    'xsum_vowelcount_ft_llama': '[XSUM] FT Llama-2-7b-chat (vowel count)',

    # Llama-2 fine-tuned on CNN.
    'cnn_500_ft_llama': '[CNN] FT Llama-2-7b-chat (500 examples)',
    'cnn_10_ft_llama': '[CNN] FT Llama-2-7b-chat (10 examples)',
    'cnn_2_ft_llama': '[CNN] FT Llama-2-7b-chat (2 examples)',
    'cnn_always_1_ft_llama': '[CNN] FT Llama-2-7b-chat (answers always 1)',
    'cnn_random_ft_llama': '[CNN] FT Llama-2-7b-chat (random answers)',
    'cnn_readability_ft_llama': '[CNN] FT Llama-2-7b-chat (readability)',
    'cnn_length_ft_llama': '[CNN] FT Llama-2-7b-chat (length)',
    'cnn_vowelcount_ft_llama': '[CNN] FT Llama-2-7b-chat (vowel count)',
}

In [5]:
# Base models whose results are used without any filtering.
main_models = ['gpt4', 'gpt35', 'llama']

# Fine-tuned variants, grouped by fine-tuning dataset and base model.
xsum_models_gpt35 = [
    'xsum_2_ft_gpt35',
    'xsum_10_ft_gpt35',
    'xsum_500_ft_gpt35',
    'xsum_always_1_ft_gpt35',
    'xsum_random_ft_gpt35',
    'xsum_readability_ft_gpt35',
    'xsum_length_ft_gpt35',
    'xsum_vowelcount_ft_gpt35',
]
cnn_models_gpt35 = [
    'cnn_2_ft_gpt35',
    'cnn_10_ft_gpt35',
    'cnn_500_ft_gpt35',
    'cnn_always_1_ft_gpt35',
    'cnn_random_ft_gpt35',
    'cnn_readability_ft_gpt35',
    'cnn_length_ft_gpt35',
    'cnn_vowelcount_ft_gpt35',
]

xsum_models_llama = [
    'xsum_2_ft_llama',
    'xsum_10_ft_llama',
    'xsum_500_ft_llama',
    'xsum_always_1_ft_llama',
    'xsum_random_ft_llama',
    'xsum_readability_ft_llama',
    'xsum_length_ft_llama',
    'xsum_vowelcount_ft_llama',
]
cnn_models_llama = [
    'cnn_2_ft_llama',
    'cnn_10_ft_llama',
    'cnn_500_ft_llama',
    'cnn_always_1_ft_llama',
    'cnn_random_ft_llama',
    'cnn_readability_ft_llama',
    'cnn_length_ft_llama',
    'cnn_vowelcount_ft_llama',
]

# Every model, base models first, in the order the tables below list them.
models = (
    main_models
    + xsum_models_gpt35
    + cnn_models_gpt35
    + xsum_models_llama
    + cnn_models_llama
)

xsum_responses, xsum_articles, xsum_keys = load_data('xsum')
cnn_responses, cnn_articles, cnn_keys = load_data('cnn')


def _num_leading_keys_to_skip(model):
    """Return how many leading dataset keys are excluded from `model`'s results.

    Returns None for the base models in `main_models`, whose results are kept
    unfiltered. Models named `*_2_ft_*` / `*_10_ft_*` skip the first 2 / 10
    keys; every other fine-tuned model skips the first 500.
    NOTE(review): presumably the skipped keys are the examples each model was
    fine-tuned on -- confirm against the fine-tuning scripts.
    """
    if model in main_models:
        return None
    if '_2_ft_' in model:
        return 2
    if '_10_ft_' in model:
        return 10
    return 500


def _restrict_to_later_keys(records, keys, num_skipped):
    """Keep the records whose 'key' appears in `keys[num_skipped:]`.

    `num_skipped=None` returns `records` untouched. The allowed keys are put
    into a set once, so each membership test is O(1) instead of re-slicing and
    scanning the key list for every record. Assumes keys are hashable (e.g.
    strings) -- TODO confirm against `load_data`.
    """
    if num_skipped is None:
        return records
    allowed_keys = set(keys[num_skipped:])
    return [record for record in records if record['key'] in allowed_keys]


# Pairwise results
xsum_results = {}
cnn_results = {}
for model in models:
    num_skipped = _num_leading_keys_to_skip(model)
    xsum_results[model] = _restrict_to_later_keys(
        load_from_json(f'results/xsum/{model}_results.json'), xsum_keys, num_skipped)
    cnn_results[model] = _restrict_to_later_keys(
        load_from_json(f'results/cnn/{model}_results.json'), cnn_keys, num_skipped)

# For the label result data, "self_preference" is the model's preference for the first summary
cnn_correct_label_results = {}
cnn_wrong_label_results = {}
cnn_random_label_results = {}
xsum_correct_label_results = {}
xsum_wrong_label_results = {}
xsum_random_label_results = {}
for model in main_models:
    dataset = 'cnn'
    cnn_correct_label_results[model] = load_from_json(f'label_results/correct_label_results/{dataset}/{model}_results.json')
    cnn_wrong_label_results[model] = load_from_json(f'label_results/wrong_label_results/{dataset}/{model}_results.json')
    # cnn_random_label_results[model] = load_from_json(f'label_results/random_label_results/{dataset}/{model}_results.json')
    dataset = 'xsum'
    xsum_correct_label_results[model] = load_from_json(f'label_results/correct_label_results/{dataset}/{model}_results.json')
    xsum_wrong_label_results[model] = load_from_json(f'label_results/wrong_label_results/{dataset}/{model}_results.json')
    # xsum_random_label_results[model] = load_from_json(f'label_results/random_label_results/{dataset}/{model}_results.json')

# Keep only the llama label results whose key also occurs in the gpt4 results
# of the same setting (set lookup instead of a nested scan per record).
for results in [cnn_correct_label_results, cnn_wrong_label_results, xsum_correct_label_results, xsum_wrong_label_results]:
    gpt4_keys = {record['key'] for record in results['gpt4']}
    results['llama'] = [record for record in results['llama'] if record['key'] in gpt4_keys]

# Individual setting results
xsum_detection_results = {}
xsum_score_results = {}
cnn_detection_results = {}
cnn_score_results = {}
for model in models:
    num_skipped = _num_leading_keys_to_skip(model)
    dataset = 'xsum'
    xsum_detection_results[model] = _restrict_to_later_keys(
        load_from_json(f'individual_setting_results/recognition_results/{dataset}/{model}_results.json'), xsum_keys, num_skipped)
    xsum_score_results[model] = _restrict_to_later_keys(
        load_from_json(f'individual_setting_results/score_results/{dataset}/{model}_results.json'), xsum_keys, num_skipped)
    dataset = 'cnn'
    cnn_detection_results[model] = _restrict_to_later_keys(
        load_from_json(f'individual_setting_results/recognition_results/{dataset}/{model}_results.json'), cnn_keys, num_skipped)
    cnn_score_results[model] = _restrict_to_later_keys(
        load_from_json(f'individual_setting_results/score_results/{dataset}/{model}_results.json'), cnn_keys, num_skipped)

In [5]:
# Cross-Model Evals

def _is_valid_preference(record):
    """True when `record` has a 'new_preference' value that is not NaN."""
    # NaN is the only value that compares unequal to itself.
    return 'new_preference' in record and record['new_preference'] == record['new_preference']


def _cross_model_row(model):
    """Build one row: [model, XSUM mean, CNN mean, XSUM bad count, CNN bad count].

    Each comparisons file is read once. A record is "bad" when it has no
    'new_preference' or the value is NaN; bad records are counted and left out
    of the mean, so a single NaN no longer turns the whole mean into NaN. The
    mean is NaN when a file has no valid record at all.
    """
    means = []
    bad_counts = []
    for dataset_name in ('xsum', 'cnn'):
        comparisons = load_from_json(f"comparisons/{dataset_name}/{model}_comparisons.json")
        valid_preferences = []
        num_bad = 0
        for record in comparisons:
            if _is_valid_preference(record):
                valid_preferences.append(record['new_preference'])
            else:
                num_bad += 1
        means.append(avg(valid_preferences) if valid_preferences else float('nan'))
        bad_counts.append(num_bad)
    return [model] + means + bad_counts


cross_model_headers = ["Model", "XSUM", "CNN", "# Bad Outputs (e.g. NaN) - XSUM", "# Bad Outputs (e.g. NaN) - CNN"]
cross_model_rows = [
    _cross_model_row(model)
    for model in xsum_models_gpt35 + cnn_models_gpt35 + xsum_models_llama + cnn_models_llama
]

print(tabulate(
    cross_model_rows,
    headers=cross_model_headers,
    tablefmt="github",
))

# Models left out of the second table ("Without degraded generations").
ones_twos_models = ['cnn_always_1_ft_llama', 'cnn_readability_ft_llama', 'cnn_vowelcount_ft_llama', 'cnn_random_ft_llama']
generation_error_models = ['cnn_500_ft_llama', 'cnn_length_ft_llama', 'xsum_always_1_ft_llama']
degraded_models = set(ones_twos_models + generation_error_models)

print('\n\nWithout degraded generations\n\n')
print(tabulate(
    # Reuse the rows computed above instead of re-reading every file.
    [row for row in cross_model_rows if row[0] not in degraded_models],
    headers=cross_model_headers,
    tablefmt="github",
))


| Model                     |       XSUM |        CNN |   # Bad Outputs (e.g. NaN) - XSUM |   # Bad Outputs (e.g. NaN) - CNN |
|---------------------------|------------|------------|-----------------------------------|----------------------------------|
| xsum_2_ft_gpt35           |   0.239259 |   0.409474 |                                 0 |                                0 |
| xsum_10_ft_gpt35          |   0.351976 |   0.440032 |                                 0 |                                0 |
| xsum_500_ft_gpt35         |   0.181138 |   0.830678 |                                 0 |                                1 |
| xsum_always_1_ft_gpt35    |   0.5      |   0.5      |                                 0 |                                0 |
| xsum_random_ft_gpt35      |   0.5      |   0.5      |                                 0 |                                0 |
| xsum_readability_ft_gpt35 |   0.569504 |   0.418801 |                                 0 |                    

In [88]:
# Label Results
# Mean 'self_preference' of each base model on XSUM and CNN, first for the
# correct-label results and then for the wrong-label results.

label_sections = [
    ('Correct Labels', xsum_correct_label_results, cnn_correct_label_results),
    ('Incorrect Labels', xsum_wrong_label_results, cnn_wrong_label_results),
]
label_headers = ['Model', 'Self-Preference (XSUM)', 'Self-Preference (CNN)']

for section_number, (section_title, xsum_labelled, cnn_labelled) in enumerate(label_sections):
    table = [
        [
            model_name,
            avg([record['self_preference'] for record in xsum_labelled[model_name]]),
            avg([record['self_preference'] for record in cnn_labelled[model_name]]),
        ]
        for model_name in main_models
    ]

    if section_number > 0:
        # Blank lines separating the second table from the first.
        print('\n\n')
    print(section_title)
    print(tabulate(table, headers=label_headers))

Correct Labels
Model      Self-Preference (XSUM)    Self-Preference (CNN)
-------  ------------------------  -----------------------
gpt4                     0.73245                  0.936168
gpt35                    0.605016                 0.46125
llama                    0.819782                 0.973418



Incorrect Labels
Model      Self-Preference (XSUM)    Self-Preference (CNN)
-------  ------------------------  -----------------------
gpt4                     0.315728                 0.89418
gpt35                    0.455228                 0.400281
llama                    0.834755                 0.973516


In [None]:
# Individual Setting
# One row per model: the model name followed by the mean non-NaN 'ratio' for
# each target model as (recognition, score) pairs -- the ten XSUM columns
# first, then the ten CNN columns.

individual_targets = ['gpt4', 'gpt35', 'llama', 'human', 'claude']
individual_sources = [
    (xsum_detection_results, xsum_score_results),
    (cnn_detection_results, cnn_score_results),
]


def _mean_ratio_for_target(records, target):
    """Average the non-NaN 'ratio' of the records whose 'target_model' is `target`."""
    ratios = []
    for record in records:
        # `x == x` is False only for NaN, so this drops NaN ratios.
        if record['target_model'] == target and record['ratio'] == record['ratio']:
            ratios.append(record['ratio'])
    return avg(ratios)


table = []
for model_name in models:
    if model_name == 'cnn_length_ft_llama':
        # This model is left out of the table.
        continue
    individual_row = [model_name]
    for detection_by_model, score_by_model in individual_sources:
        for target in individual_targets:
            individual_row.append(_mean_ratio_for_target(detection_by_model[model_name], target))
            individual_row.append(_mean_ratio_for_target(score_by_model[model_name], target))
    table.append(individual_row)

In [36]:
# Render every numeric cell of `table` with three decimals, then print only
# the model name plus columns 11 onwards (the CNN half of the individual-setting
# table: one Self-Rec / Self-Pref pair per target model).
rounded_table = [
    [row[0], *(f'{value:.3f}' for value in row[1:])]
    for row in table
]

cnn_only_rows = [row[:1] + row[11:] for row in rounded_table]
print(tabulate(cnn_only_rows, headers=['Model'] + ['Self-Rec', 'Self-Pref'] * 5))

Model                        Self-Rec    Self-Pref    Self-Rec    Self-Pref    Self-Rec    Self-Pref    Self-Rec    Self-Pref    Self-Rec    Self-Pref
-------------------------  ----------  -----------  ----------  -----------  ----------  -----------  ----------  -----------  ----------  -----------
gpt4                            0.5          0.5         0.602        0.516       0.619        0.52        0.715        0.536       0.634        0.518
gpt35                           0.493        0.492       0.5          0.5         0.502        0.502       0.518        0.516       0.498        0.499
llama                           0.501        0.5         0.495        0.501       0.5          0.5         0.495        0.502       0.503        0.501
xsum_2_ft_gpt35                 0.491        0.492       0.5          0.5         0.501        0.503       0.53         0.52        0.503        0.502
xsum_10_ft_gpt35                0.492        0.494       0.5          0.5         0.503       

In [49]:
def print_ambig_table(results):
    """Print, per model, how often forward and backward answers agree or flip.

    For the 'detection' task and then the 'comparison' task, three rates
    (rounded to 3 decimals) are shown for every model in `models`:
      * Ambiguous: the forward and backward answers are equal,
      * Correct:   forward answer is '1' and backward answer is '2',
      * Incorrect: forward answer is '2' and backward answer is '1'.

    Args:
        results: mapping of model name -> list of result dicts that contain
            'forward_<task>' and 'backward_<task>' for both tasks.
    """
    def outcome_rates(records, task):
        forward_key = f'forward_{task}'
        backward_key = f'backward_{task}'
        same_answer = avg([record[forward_key] == record[backward_key] for record in records])
        one_then_two = avg([record[forward_key] == '1' and record[backward_key] == '2' for record in records])
        two_then_one = avg([record[forward_key] == '2' and record[backward_key] == '1' for record in records])
        return [same_answer, one_then_two, two_then_one]

    table = []
    for model in models:
        records = results[model]
        rates = outcome_rates(records, 'detection') + outcome_rates(records, 'comparison')
        table.append([model] + [round(rate, 3) for rate in rates])

    column_names = ['Model'] + ['Ambiguous', 'Correct', 'Incorrect'] * 2
    print(tabulate(table, headers = column_names))

print_ambig_table(cnn_results)

Model                        Ambiguous    Correct    Incorrect    Ambiguous    Correct    Incorrect
-------------------------  -----------  ---------  -----------  -----------  ---------  -----------
gpt4                             0.383      0.595        0.022        0.088      0.877        0.034
gpt35                            0.62       0.149        0.23         0.517      0.151        0.332
llama                            1          0            0            1          0            0.001
xsum_2_ft_gpt35                  0.815      0.046        0.139        0.442      0.15         0.409
xsum_10_ft_gpt35                 0.805      0.086        0.109        0.479      0.181        0.34
xsum_500_ft_gpt35                0.194      0.651        0.155        0.193      0.654        0.153
xsum_always_1_ft_gpt35           1          0            0            1          0            0
xsum_random_ft_gpt35             1          0            0            1          0            0
xsum_read

In [7]:
def print_zapped_table():
    """Print mean scores next to their "zapped" means for every model.

    For XSUM and then CNN, and for 'detection_score' then 'self_preference',
    each row holds the plain mean followed by the mean of the zapped values
    (a score of exactly 0.5 stays 0.5, a score above 0.5 becomes 1, anything
    else becomes 0). Rows are ordered by the last column, the zapped CNN
    self-preference.
    """
    def zap(score):
        if score == 0.5:
            return 0.5
        return 1 if score > 0.5 else 0

    rows = []
    for model in models:
        row = [model]
        for dataset_results in (xsum_results, cnn_results):
            records = dataset_results[model]
            for metric in ('detection_score', 'self_preference'):
                row.append(avg([record[metric] for record in records]))
                row.append(avg([zap(record[metric]) for record in records]))
        rows.append(row)

    rows.sort(key = lambda row: row[-1])
    column_names = ['Model', 'Self-Rec (XSUM)', 'Zapped', 'Self-Pref (XSUM)', 'Zapped', 'Self-Rec (CNN)', 'Zapped', 'Self-Pref (CNN)', 'Zapped']
    print(tabulate(rows, headers = column_names))

print_zapped_table()

Model                        Self-Rec (XSUM)    Zapped    Self-Pref (XSUM)    Zapped    Self-Rec (CNN)    Zapped     Self-Pref (CNN)    Zapped
-------------------------  -----------------  --------  ------------------  --------  ----------------  --------  ------------------  --------
cnn_length_ft_gpt35                 0.574447  0.588               0.571714  0.571             0.169116  0.1355              0.187859  0.1565
cnn_vowelcount_ft_gpt35             0.607817  0.615               0.585983  0.5945            0.175772  0.1645              0.171075  0.1675
xsum_vowelcount_ft_gpt35            0.60012   0.5945              0.597612  0.5945            0.416476  0.3065              0.326374  0.3135
xsum_2_ft_gpt35                     0.630637  0.649242            0.618081  0.626263          0.452511  0.351954            0.376425  0.338677
gpt35                               0.534677  0.57725             0.581617  0.604             0.480843  0.43125             0.431062  0.358
xsum_10_

In [55]:
# Main pairwise results
# Mean 'detection_score' (Self-Rec) and 'self_preference' (Self-Pref) per model,
# rounded to 3 decimals: the XSUM pair first, then the CNN pair.
pairwise_metrics = ('detection_score', 'self_preference')

table = [
    [model] + [
        round(avg([record[metric] for record in dataset_results[model]]), 3)
        for dataset_results in (xsum_results, cnn_results)
        for metric in pairwise_metrics
    ]
    for model in models
]
print(tabulate(table, headers = ['Model'] + ['Self-Rec', 'Self-Pref'] * 2))

Model                        Self-Rec    Self-Pref    Self-Rec    Self-Pref
-------------------------  ----------  -----------  ----------  -----------
gpt4                            0.672        0.705       0.747        0.912
gpt35                           0.535        0.582       0.481        0.431
llama                           0.514        0.511       0.505        0.505
xsum_2_ft_gpt35                 0.631        0.618       0.453        0.376
xsum_10_ft_gpt35                0.674        0.657       0.489        0.421
xsum_500_ft_gpt35               0.896        0.898       0.738        0.75
xsum_always_1_ft_gpt35          0.5          0.5         0.5          0.5
xsum_random_ft_gpt35            0.5          0.5         0.5          0.5
xsum_readability_ft_gpt35       0.405        0.399       0.505        0.531
xsum_length_ft_gpt35            0.572        0.567       0.474        0.427
xsum_vowelcount_ft_gpt35        0.6          0.598       0.416        0.326
cnn_2_ft_gpt35   

In [29]:
task = 'detection'
task2 = 'comparison'


def _pairwise_columns(records):
    """Return the nine per-dataset columns for one model's records.

    For `task` and then `task2`: the rate of equal forward/backward answers,
    the rate of forward '1' with backward '2', the rate of forward '2' with
    backward '1', and the mean score ('detection_score' for `task`,
    'self_preference' for `task2`). The last column is the Kendall tau
    between the two scores.
    """
    columns = []
    for task_name, score_key in ((task, 'detection_score'), (task2, 'self_preference')):
        forward_key = f'forward_{task_name}'
        backward_key = f'backward_{task_name}'
        columns.append(avg([record[forward_key] == record[backward_key] for record in records]))
        columns.append(avg([record[forward_key] == '1' and record[backward_key] == '2' for record in records]))
        columns.append(avg([record[forward_key] == '2' and record[backward_key] == '1' for record in records]))
        columns.append(avg([record[score_key] for record in records]))
    columns.append(kendall_tau_for_results(records))
    return columns


def _share_of_first(first, second):
    """Return first / (first + second), or 0 when both rates are zero."""
    if first != 0 or second != 0:
        return first / (first + second)
    return 0


table = []
for model_name in models:
    xsum_columns = _pairwise_columns(xsum_results[model_name])
    cnn_columns = _pairwise_columns(cnn_results[model_name])
    table.append(
        [model_name]
        + xsum_columns
        + cnn_columns
        + [
            _share_of_first(xsum_columns[1], xsum_columns[2]),  # Recognition Score (Ambig Removed) XSUM
            _share_of_first(xsum_columns[5], xsum_columns[6]),  # Preference Score (Ambig Removed) XSUM
            _share_of_first(cnn_columns[1], cnn_columns[2]),  # Recognition Score (Ambig Removed) CNN
            _share_of_first(cnn_columns[5], cnn_columns[6]),  # Preference Score (Ambig Removed) CNN
        ]
    )

print(tabulate(table))

-------------------------  --------  --------  ---------  --------  --------  --------  --------  --------  ----------  --------  ---------  ---------  --------  --------  ----------  ----------  --------  ----------  --------  --------  --------  --------
gpt4                       0.31075   0.5385    0.15075    0.671656  0.22775   0.5925    0.17975   0.705006    0.707988  0.383     0.59475    0.02225    0.74652   0.0885    0.8775      0.034       0.912338   0.543711   0.781284  0.767239  0.963938  0.962699
gpt35                      0.582     0.2685    0.1495     0.534677  0.57775   0.302     0.12025   0.581617    0.405312  0.62025   0.1495     0.23025    0.480843  0.517     0.1515      0.3315      0.431062   0.36235    0.642344  0.715216  0.39368   0.313665
llama                      0.83225   0.08675   0.081      0.513524  0.75475   0.13      0.11525   0.510752    0.726415  0.99975   0.00025    0          0.505046  0.9995    0           0.0005      0.50492    0.498769   0.517139  0

In [331]:
results = cnn_results

# One row per model -- mean detection score, mean self-preference and the
# Kendall tau between the two -- ordered by the mean detection score.
recog_pref_tau_rows = [
    [
        model,
        avg([record['detection_score'] for record in results[model]]),
        avg([record['self_preference'] for record in results[model]]),
        kendall_tau_for_results(results[model]),
    ]
    for model in models
]
recog_pref_tau_rows.sort(key = lambda row: row[1])
print(tabulate(recog_pref_tau_rows, headers=['Model', 'Recog', 'Pref', 'Tau']))

Model                         Recog      Pref         Tau
-------------------------  --------  --------  ----------
cnn_length_ft_gpt35        0.169116  0.187859   0.630281
cnn_vowelcount_ft_gpt35    0.175772  0.171075   0.686878
xsum_vowelcount_ft_gpt35   0.416476  0.326374   0.664171
xsum_2_ft_gpt35            0.452511  0.376425   0.642794
xsum_length_ft_gpt35       0.474401  0.427378   0.321082
gpt35                      0.480843  0.431062   0.36235
xsum_10_ft_gpt35           0.488748  0.421113   0.587592
cnn_readability_ft_llama   0.495163  0.48851    0.701186
cnn_2_ft_gpt35             0.496871  0.42312    0.353979
xsum_always_1_ft_gpt35     0.5       0.5        0.0959225
xsum_random_ft_gpt35       0.5       0.5        0.0977254
xsum_always_1_ft_llama     0.5       0.5        0.214696
cnn_always_1_ft_gpt35      0.5       0.5        0.232014
cnn_random_ft_gpt35        0.500416  0.500584   0.280479
llama                      0.505046  0.50492    0.498769
xsum_readability_ft_gpt35  0

In [309]:
# For every model whose name contains 'llama': mean self-recognition and
# self-preference on XSUM and CNN, plus the Kendall tau between the two scores
# on each dataset. Rows are ordered by model name.
llama_rows = [
    [
        model,
        avg([record['detection_score'] for record in xsum_results[model]]),
        avg([record['self_preference'] for record in xsum_results[model]]),
        avg([record['detection_score'] for record in cnn_results[model]]),
        avg([record['self_preference'] for record in cnn_results[model]]),
        kendall_tau_for_results(xsum_results[model]),
        kendall_tau_for_results(cnn_results[model]),
    ]
    for model in models
    if 'llama' in model
]
llama_rows.sort(key = lambda row: row[0])
print(tabulate(llama_rows, headers=['Model', 'Self-Rec [X]', 'Self-Pref [X]', 'Self-Rec [C]', 'Self-Pref [C]', 'XSUM T', 'CNN T']))

Model                        Self-Rec [X]    Self-Pref [X]    Self-Rec [C]    Self-Pref [C]    XSUM T       CNN T
-------------------------  --------------  ---------------  --------------  ---------------  --------  ----------
cnn_10_ft_llama                  0.519166         0.655903        0.66481          0.825356  0.678791   0.609962
cnn_2_ft_llama                   0.357445         0.502014        0.566808         0.702605  0.671583   0.544096
cnn_500_ft_llama                 0.55599          0.434032        0.59203          0.499934  0.301602  -0.0340433
cnn_always_1_ft_llama            0.967            0.961           0.949            0.933     0.57601    0.565979
cnn_length_ft_llama              0.489451         0.486789        0.548378         0.54111   0.29198    0.562209
cnn_random_ft_llama              0.673236         0.675831        0.637675         0.653747  0.699919   0.760943
cnn_readability_ft_llama         0.501337         0.463783        0.495163         0.48851   

In [None]:
def show_scatterplots(results, include_ambiguous=True):
    """Draw one detection-score vs. self-preference scatter plot per model, side by side.

    Args:
        results: Mapping from model key to a list of per-item dicts. Each dict is
            read for 'detection_score' and 'self_preference' and, when filtering,
            for 'forward_comparison' and 'backward_comparison'.
        include_ambiguous: When False, only items whose 'forward_comparison'
            differs from 'backward_comparison' are plotted.

    Raises:
        KeyError: If a model key is missing from MODEL_TO_STRING or an item lacks
            one of the fields read above.
    """
    num_models = len(results)

    plt.figure(figsize=(num_models * 6, num_models * 2))
    colors = ['blue', 'green', 'red']

    for idx, (model, items) in enumerate(results.items()):
        if not include_ambiguous:
            items = [item for item in items if item['forward_comparison'] != item['backward_comparison']]

        detection_scores = [item['detection_score'] for item in items]
        self_preferences = [item['self_preference'] for item in items]

        # One column per model (the grid used to be fixed at three columns, which
        # raised as soon as a fourth model was passed in).
        plt.subplot(1, num_models, idx + 1)
        # Cycle through the palette so any number of models gets a colour.
        plt.scatter(detection_scores, self_preferences, color=colors[idx % len(colors)])
        plt.xlabel('Detection Score')
        plt.ylabel('Self-Preference')
        plt.title(MODEL_TO_STRING[model])

    plt.suptitle('Detection Score vs Self-Preference (Token Probability)', fontsize=16, y=1)
    plt.tight_layout()
    plt.show()

In [None]:
# NOTE(review): this cell redefines show_scatterplots, which the preceding cell
# already defines; on a top-to-bottom run this later definition is the one in
# effect. Consider deleting one of the two cells.
def show_scatterplots(results, include_ambiguous=True):
    """Draw one detection-score vs. self-preference scatter plot per model, side by side.

    Args:
        results: Mapping from model key to a list of per-item dicts. Each dict is
            read for 'detection_score' and 'self_preference' and, when filtering,
            for 'forward_comparison' and 'backward_comparison'.
        include_ambiguous: When False, only items whose 'forward_comparison'
            differs from 'backward_comparison' are plotted.

    Raises:
        KeyError: If a model key is missing from MODEL_TO_STRING or an item lacks
            one of the fields read above.
    """
    num_models = len(results)

    plt.figure(figsize=(num_models * 6, num_models * 2))
    colors = ['blue', 'green', 'red']

    for idx, (model, items) in enumerate(results.items()):
        if not include_ambiguous:
            items = [item for item in items if item['forward_comparison'] != item['backward_comparison']]

        detection_scores = [item['detection_score'] for item in items]
        self_preferences = [item['self_preference'] for item in items]

        # One column per model (the grid used to be fixed at three columns, which
        # raised as soon as a fourth model was passed in).
        plt.subplot(1, num_models, idx + 1)
        # Cycle through the palette so any number of models gets a colour.
        plt.scatter(detection_scores, self_preferences, color=colors[idx % len(colors)])
        plt.xlabel('Detection Score')
        plt.ylabel('Self-Preference')
        plt.title(MODEL_TO_STRING[model])

    plt.suptitle('Detection Score vs Self-Preference (Token Probability)', fontsize=16, y=1)
    plt.tight_layout()
    plt.show()

In [None]:
def show_scatterplots_with_marginals(results, include_ambiguous=True, output_dir='plots/scatterplots/xsum'):
    """Draw and save a seaborn joint plot (scatter plus marginal histograms) per model.

    Each figure is written to ``<output_dir>/<model>.png`` and then shown.

    Args:
        results: Mapping from model key to a list of per-item dicts. Items that
            lack 'detection_score' or 'self_preference' are skipped.
        include_ambiguous: When False, an item is kept only if its
            'forward_comparison' differs from 'backward_comparison' and its
            'forward_detection' differs from 'backward_detection'.
        output_dir: Directory the PNG files are written to; created if missing.
            Defaults to the previously hard-coded XSUM location.

    Raises:
        KeyError: If a model key is missing from MODEL_TO_STRING, or if filtering
            is requested and a valid item lacks one of the comparison/detection fields.
    """
    import os  # function-scope import so this cell does not depend on a new top-level import

    def is_valid(item):
        return 'detection_score' in item and 'self_preference' in item

    def is_unambiguous(item):
        return (item['forward_comparison'] != item['backward_comparison']
                and item['forward_detection'] != item['backward_detection'])

    # savefig does not create missing directories, so make sure the target exists.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for idx, (model, items) in enumerate(results.items()):
        # Validity is checked first so the ambiguity test only touches usable items.
        kept = [item for item in items if is_valid(item) and (include_ambiguous or is_unambiguous(item))]
        detection_scores = [item['detection_score'] for item in kept]
        self_preferences = [item['self_preference'] for item in kept]

        # Create a jointplot for each model, cycling through the shared palette
        joint_plot = sns.jointplot(
            x=detection_scores,
            y=self_preferences,
            kind="scatter",
            color=COLORS[idx % len(COLORS)],
            marginal_kws=dict(bins=15, fill=True),
        )

        joint_plot.ax_joint.set_xlim(0, 1.0)
        joint_plot.ax_joint.set_ylim(0, 1.0)

        # Title sits slightly above the figure so it clears the top marginal plot
        joint_plot.fig.suptitle(f'{MODEL_TO_STRING[model]}', fontsize=14, y=1.05)

        joint_plot.set_axis_labels('Detection Score', 'Self-Preference', fontsize=12)

        # Save before showing: the current figure is the one jointplot just created
        plt.savefig(os.path.join(output_dir, f'{model}.png'), bbox_inches='tight')
        plt.show()


In [None]:
# Draw the per-model scatter plots twice: first over every item, then with
# include_ambiguous=False, which drops items whose forward and backward
# comparisons agree.
# NOTE(review): `results` is not assigned anywhere in the nearby cells —
# presumably it is set by an earlier cell; confirm it survives Restart & Run All.
show_scatterplots(results)
show_scatterplots(results, include_ambiguous=False)

In [None]:
def show_heatmaps(results, include_ambiguous=True):
    """Draw one hexbin heatmap of detection score vs. self-preference per model, side by side.

    Args:
        results: Mapping from model key to a list of per-item dicts. Each dict is
            read for 'detection_score' and 'self_preference' and, when filtering,
            for 'forward_comparison' and 'backward_comparison'.
        include_ambiguous: When False, only items whose 'forward_comparison'
            differs from 'backward_comparison' are binned.

    Raises:
        KeyError: If a model key is missing from MODEL_TO_STRING or an item lacks
            one of the fields read above.
    """
    num_models = len(results)

    plt.figure(figsize=(num_models * 6, num_models * 2))

    for idx, (model, items) in enumerate(results.items()):
        if not include_ambiguous:
            items = [item for item in items if item['forward_comparison'] != item['backward_comparison']]

        detection_scores = [item['detection_score'] for item in items]
        self_preferences = [item['self_preference'] for item in items]

        # One column per model; every panel gets its own colour bar.
        plt.subplot(1, num_models, idx + 1)
        plt.hexbin(detection_scores, self_preferences, gridsize=30, cmap='Blues')
        plt.colorbar(label='Density')
        plt.xlabel('Detection Score')
        plt.ylabel('Self-Preference')
        plt.title(MODEL_TO_STRING[model])

    plt.suptitle('Detection Score vs Self-Preference (Token Probability)', fontsize=16, y=1)
    plt.tight_layout()
    plt.show()

In [None]:
# Draw the per-model hexbin heatmaps twice: first over every item, then with
# include_ambiguous=False, which drops items whose forward and backward
# comparisons agree.
# NOTE(review): `results` is not assigned anywhere in the nearby cells —
# presumably it is set by an earlier cell; confirm it survives Restart & Run All.
show_heatmaps(results)
show_heatmaps(results, include_ambiguous=False)

In [71]:
def plot_detection_score_vs_correlation(*results_dicts):
    """Scatter mean self-recognition against mean self-preference for the 2/10/500-example fine-tunes.

    For every results dict passed in, and for every key in the notebook-level
    ``models`` list containing '_2_ft_', '_10_ft_' or '_500_ft_', one point is
    plotted at (mean 'detection_score', mean 'self_preference'). The raw
    (detection_score, self_preference) pairs are written to
    'xsum_plot_data.json' and the figure to 'plots/xsum_scaling_law.png'.

    Args:
        *results_dicts: One or more mappings from model key to a list of
            per-item dicts. Items missing a score are left out of that score's mean.

    Returns:
        A zip iterator of (model key, mean detection score, mean self-preference),
        one triple per plotted point.

    Raises:
        KeyError: If a selected model key is missing from a results dict or
            from MODEL_TO_STRING.
        ZeroDivisionError: If a selected model has no item carrying a score
            (raised by ``avg`` on an empty list).
    """
    x_values = []
    y_values = []
    plot_models = []
    plot_data = {}

    # The key selection depends only on the notebook-level `models` list, so it
    # is computed once instead of once per results dict.
    keys = [model for model in models if any(s in model for s in ['_2_ft_', '_10_ft_', '_500_ft_'])]

    for results in results_dicts:
        plot_models += keys
        x_values += [avg([item['detection_score'] for item in results[key] if 'detection_score' in item]) for key in keys]
        # Exactly one y value per x value. The Kendall tau used to be appended here
        # as well, which made y_values twice as long as x_values and misaligned the
        # points; the axis is labelled and limited (0..1) for mean self-preference.
        y_values += [avg([item['self_preference'] for item in results[key] if 'self_preference' in item]) for key in keys]
        for key in keys:
            # Keyed by model only: a later results dict overwrites an earlier
            # one's entry for the same model in the saved JSON.
            plot_data[key] = [
                (item['detection_score'], item['self_preference'])
                for item in results[key]
                if 'detection_score' in item and 'self_preference' in item
            ]

    save_to_json(plot_data, 'xsum_plot_data.json')

    plt.figure(figsize=(8, 8))

    # Style tables: markers alternate in runs of three, colours in runs of twelve.
    markers = (['o'] * 3 + ['^'] * 3) * 4
    colors = ['red'] * 12 + ['blue'] * 12

    print(plot_models)
    # Iterate over the points themselves and look styles up by index (wrapping
    # around), so the loop no longer runs past the data when there are fewer
    # points than style entries, nor runs out of styles when there are more.
    for idx, (model, a, b) in enumerate(zip(plot_models, x_values, y_values)):
        plt.scatter(
            a,
            b,
            color=colors[idx % len(colors)],
            marker=markers[idx % len(markers)],
            label=MODEL_TO_STRING[model],
        )

    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.gca().set_aspect('equal', adjustable='box')

    # Create a legend below the plot in a vertical column
    plt.legend(title="Key", loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=1)

    # Add grid and axis labels
    plt.grid(True)
    plt.xlabel('Self-Recognition Score')
    plt.ylabel("Self-Preference")

    # Save, then show the plot
    plt.savefig('plots/xsum_scaling_law.png', bbox_inches='tight')
    plt.show()
    return zip(plot_models, x_values, y_values)