### MT-Bench BT Scores

In [1]:
import os

# Configure the process environment before the Hugging Face stack is imported:
# huggingface_hub reads HF_ENDPOINT when it is first imported, so assigning it
# after `import datasets` would not redirect downloads to the mirror.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['VLLM_LOGGING_LEVEL'] = 'ERROR'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset

# `trust_remote_code` is no longer supported by `datasets` (it only produced a
# warning here) and this dataset loads without a loading script, so it is omitted.
# Alternative location of the same data: load_dataset('/nas/data/mt-bench')
mt_bench = load_dataset('mt-bench')

# Collect the models from both sides of every comparison so that the per-model
# tables built later contain a key for each model that can show up in a row.
train_split = mt_bench['train']
models = np.unique(list(train_split['model_a']) + list(train_split['model_b']))
print(mt_bench)

  from .autonotebook import tqdm as notebook_tqdm
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'mt-bench' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
Generating train split: 5755 examples [00:00, 297261.39 examples/s]

DatasetDict({
    train: Dataset({
        features: ['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn'],
        num_rows: 5755
    })
})





In [None]:
# df = pd.DataFrame(mt_bench['train'])
# df.to_csv('mt-bench.csv', index=False)

In [None]:
# ref: https://datascience.oneoffcoder.com/btl-model.html
def get_estimate(i, p, df):
    """Return the un-normalised updated strength for the item at position ``i``.

    The value is the sum of row ``i`` of ``df`` divided by the sum, over every
    other position ``j``, of ``(df[i, j] + df[j, i]) / (p[i] + p[j])``.

    Args:
        i: Integer position of the item in ``df`` and ``p``.
        p: Series holding the current strength of every item.
        df: Square count table; row ``i`` is summed as the numerator.

    Returns:
        The ratio described above.
    """
    row_total = df.iloc[i].sum()

    # Row i plus column i; the two Series are aligned on their labels.
    pair_counts = df.iloc[i] + df.iloc[:, i]

    # p[i] + p[j] for every j; NaN on the diagonal so that the self-pairing
    # is skipped by the NaN-ignoring sum below.
    strength_sums = pd.Series(
        [np.nan if j == i else p.iloc[i] + p.iloc[j] for j in range(len(p))],
        index=p.index,
    )
    denominator = (pair_counts / strength_sums).sum()

    return row_total / denominator

def estimate_p(p, df):
    """Run ``get_estimate`` for every row position of ``df``.

    Returns a Series of the per-position estimates, indexed like ``p``.
    """
    updated = []
    for position in range(df.shape[0]):
        updated.append(get_estimate(position, p, df))
    return pd.Series(updated, index=p.index)


def iterate(df, p=None, n=20, sorted=True):
    """Repeat the ``estimate_p`` update ``n`` times, normalising after each step.

    Args:
        df: Square count table passed through to ``estimate_p``.
        p: Starting strengths; defaults to 1 for every column of ``df``.
        n: Number of update steps.
        sorted: When true, the final strengths are returned in descending order.

    Returns:
        ``(p, history)``: the final strengths and a DataFrame with one row per
        step, starting with the initial vector.
    """
    if p is None:
        p = pd.Series([1] * df.shape[0], index=list(df.columns))

    history = [p]

    for _step in range(n):
        raw = estimate_p(p, df)
        p = raw / raw.sum()  # rescale so the strengths sum to 1
        history.append(p)

    if sorted:
        p = p.sort_values(ascending=False)
    return p, pd.DataFrame(history)

# Human-judgement win table, keyed as win_count[loser][winner]. Once turned
# into a DataFrame the outer keys are columns, so each row holds the wins of
# the model labelling that row.
win_count = {loser: dict.fromkeys(models, 0) for loser in models}
for line in mt_bench['train']:
    verdict = line['winner']
    if verdict == 'model_a':
        winner, loser = line['model_a'], line['model_b']
    elif verdict == 'model_b':
        winner, loser = line['model_b'], line['model_a']
    else:
        # Any other verdict is not counted for either side.
        continue
    win_count[loser][winner] += 1
df = pd.DataFrame(win_count)
p, estimates = iterate(df, n=100)
print(p)

### MT-Bench FACE Score BT

In [None]:
from face import spectrum_pipeline, evaluate_pipeline
from inference import inference

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from tqdm import tqdm
import pyarrow as pa
import torch

# Estimator model used to score the texts; its directory name doubles as a label.
est_path = '/data1/model/pythia-1_4b-base'
inference_model_name = est_path.rsplit('/', 1)[-1]
# NOTE(review): gpu_mem is assigned but the LLM below hard-codes 0.8 — confirm which value is intended.
gpu_mem = 0.9

metrics = ['so', 'corr', 'spear', 'emd', 'kl', 'js']

# Two string columns shared by every table built in the cells below.
schema = pa.schema([
    pa.field('input', pa.string()),
    pa.field('output', pa.string()),
])

llm = LLM(
    est_path,
    tensor_parallel_size=torch.cuda.device_count(),
    max_model_len=2048,
    gpu_memory_utilization=0.8,
)
sampling_params = SamplingParams(
    max_tokens=1,
    prompt_logprobs=0,
    temperature=0,
)
tokenizer = AutoTokenizer.from_pretrained(est_path)

In [None]:
# One zero-initialised model-by-model table per metric.
win_counts = {
    metric_name: {model_a: {model_b: 0 for model_b in models} for model_a in models}
    for metric_name in ('so', 'corr', 'spear', 'emd', 'kl', 'js')
}

# Map every user message to the list of gpt-4 replies that follow it, taken
# from each row in which gpt-4 is model_a (checked first) or model_b.
# Replies are appended once per row, so the lists may contain repeats.
gpt4_dict = {}
for line in tqdm(mt_bench['train']):
    if line['model_a'] == 'gpt-4':
        gpt4_conversation = line['conversation_a']
    elif line['model_b'] == 'gpt-4':
        gpt4_conversation = line['conversation_b']
    else:
        continue
    for text in gpt4_conversation:
        if text['role'] == 'user':
            line_input = text['content']
        else:
            gpt4_dict.setdefault(line_input, []).append(text['content'])
# for idx, (key, value) in enumerate(gpt4_dict.items()):
#     print(key)
#     print('=' * 10)
#     print(value)
#     if idx == 3:
#         break
# print(len(gpt4_dict))

# Metrics for which a larger value is treated as better; for all remaining
# metrics a smaller value is treated as better.
HIGHER_IS_BETTER = ('so', 'corr', 'spear')


def _face_spectrum(prompts, completions):
    """Run `inference` on the prompt/completion pairs and return the result of
    `spectrum_pipeline` applied to the resulting 'logprobs' column."""
    table = pa.Table.from_pydict(
        dict(zip(schema.names, (prompts, completions))),
        schema=schema
    )
    inferenced_data = inference(llm, sampling_params, Dataset(table), tokenizer, False)
    return spectrum_pipeline(inferenced_data['logprobs'])


def _mean_metric(results, metric):
    """Arithmetic mean of one metric over a non-empty list of per-turn results."""
    total = 0
    for result in results:
        total += result[metric]
    return total / len(results)


def eval_line(conversation_name):
    """Score one conversation of the current row against the gpt-4 references.

    Reads the loop variable ``line`` of the cell (looked up at call time).

    Every assistant turn is compared with each gpt-4 reply stored in
    ``gpt4_dict`` for the same user message; per turn, the best value of each
    metric over those references is kept (max for HIGHER_IS_BETTER metrics,
    min otherwise).

    Returns:
        A list with one ``{metric: best value}`` dict per scored turn, or an
        empty list when the conversation has no scorable turn.
    """
    prompts = []
    outputs = []
    gpt4_outputs = []
    for text in line[conversation_name]:
        if text['role'] == 'user':
            text_input = text['content']
        elif text_input != "" and text['content'] != "":
            references = gpt4_dict[text_input]
            prompts.extend([text_input] * len(references))
            outputs.extend([text['content']] * len(references))
            gpt4_outputs.extend(references)
    if not prompts:
        return []

    model_spectrum = _face_spectrum(prompts, outputs)
    human_spectrum = _face_spectrum(prompts, gpt4_outputs)
    raw_results = evaluate_pipeline(human_spectrum, model_spectrum, metrics)

    # Group consecutive results that share a prompt. Every result, including
    # the first of each group, is collected, and the final group is kept too.
    groups = []
    previous_prompt = None
    for prompt, row_scores in zip(prompts, raw_results):
        if not groups or prompt != previous_prompt:
            groups.append({metric: [] for metric in metrics})
            previous_prompt = prompt
        for metric in metrics:
            groups[-1][metric].append(row_scores[metric])

    return [
        {
            metric: np.max(group[metric]) if metric in HIGHER_IS_BETTER else np.min(group[metric])
            for metric in metrics
        }
        for group in groups
    ]


for line in tqdm(mt_bench['train']):
    result_a = eval_line('conversation_a')
    result_b = eval_line('conversation_b')
    if not result_a or not result_b:
        # Nothing to compare for this row.
        continue
    for metric in metrics:
        mean_result_a = _mean_metric(result_a, metric)
        mean_result_b = _mean_metric(result_b, metric)
        if metric in HIGHER_IS_BETTER:
            a_wins, b_wins = mean_result_a > mean_result_b, mean_result_a < mean_result_b
        else:
            a_wins, b_wins = mean_result_a < mean_result_b, mean_result_a > mean_result_b
        # Tables are keyed [loser][winner], matching the other win tables in
        # this notebook, so that DataFrame rows hold the wins of each model.
        if a_wins:
            win_counts[metric][line['model_b']][line['model_a']] += 1
        elif b_wins:
            win_counts[metric][line['model_a']][line['model_b']] += 1

In [None]:
# Fit and print one ranking per metric from its win table.
for metric_name, metric_wins in win_counts.items():
    print(metric_name, '-' * 10, sep='\n')
    df = pd.DataFrame(metric_wins)
    p, estimates = iterate(df, n=100)
    print(p, '=' * 10, sep='\n')

### MT-Bench MAUVE Score BT

In [None]:
import mauve
# Zero-initialised model-by-model table for the MAUVE comparison.
mauve_win_count = {outer: dict.fromkeys(models, 0) for outer in models}

def mauve_pipeline(human_data, model_data):
    """Call ``mauve.compute_mauve`` on the concatenated input+output texts.

    ``model_data`` supplies ``p_text`` and ``human_data`` supplies ``q_text``;
    ``est_path`` is passed as the featurizer model name.
    """
    def _joined(rows):
        return [row['input'] + row['output'] for row in rows]

    predictions = _joined(model_data)
    references = _joined(human_data)
    return mauve.compute_mauve(
        p_text=predictions,
        q_text=references,
        featurize_model_name=est_path,
        verbose=False,
    )

# Map every user message to the list of gpt-4 replies that follow it, taken
# from each row in which gpt-4 is model_a (checked first) or model_b.
gpt4_dict = {}
for line in tqdm(mt_bench['train']):
    if line['model_a'] == 'gpt-4':
        gpt4_conversation = line['conversation_a']
    elif line['model_b'] == 'gpt-4':
        gpt4_conversation = line['conversation_b']
    else:
        continue
    for text in gpt4_conversation:
        if text['role'] == 'user':
            line_input = text['content']
        else:
            gpt4_dict.setdefault(line_input, []).append(text['content'])

def eval_line(conversation_name):
    """Return the ``.mauve`` value for one conversation of the current row.

    Reads the loop variable ``line`` of the cell (looked up at call time).
    Each non-empty assistant turn is paired with the first gpt-4 reply stored
    in ``gpt4_dict`` for the same user message.
    """
    prompts, answers, gpt4_answers = [], [], []
    for message in line[conversation_name]:
        if message['role'] == 'user':
            text_input = message['content']
            continue
        if text_input == "" or message['content'] == "":
            continue
        prompts.append(text_input)
        answers.append(message['content'])
        gpt4_answers.append(gpt4_dict[text_input][0])

    model_data = Dataset(
        pa.Table.from_pydict(dict(zip(schema.names, (prompts, answers))), schema=schema)
    )
    human_data = Dataset(
        pa.Table.from_pydict(dict(zip(schema.names, (prompts, gpt4_answers))), schema=schema)
    )
    return mauve_pipeline(human_data, model_data).mauve


for line in tqdm(mt_bench['train']):
    result_a = eval_line('conversation_a')
    result_b = eval_line('conversation_b')

    # greater is better; the table is keyed [loser][winner]
    if result_a > result_b:
        mauve_win_count[line['model_b']][line['model_a']] += 1
    elif result_a < result_b:
        mauve_win_count[line['model_a']][line['model_b']] += 1

In [None]:
# Fit and print the ranking derived from the MAUVE win table.
print("mauve", '-' * 10, sep='\n')
df = pd.DataFrame(mauve_win_count)
p, estimates = iterate(df, n=100)
print(p, '=' * 10, sep='\n')

### MT-Bench FACE Scores

In [None]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from inference import inference
from tqdm import tqdm
import pyarrow as pa
import torch

# Estimator model used to score the texts; its directory name doubles as a label.
est_path = '/data1/model/pythia-1_4b-base'
inference_model_name = est_path.rsplit('/', 1)[-1]
gpu_mem = 0.9

# Two string columns shared by every table built in this cell.
schema = pa.schema([
    pa.field('input', pa.string()),
    pa.field('output', pa.string()),
])
# model_generations = {model: set() for model in models}
# for line in mt_bench['train']:
#     for content in line['conversation_a']:
#         model_generations[line['model_a']].add(content['content'])
#     for content in line['conversation_b']:
#         model_generations[line['model_b']].add(content['content'])

# For every row that involves gpt-4, store (gpt-4 turn, other model's turn)
# tuples under the other model's name.
generation_pairs = {model: [] for model in models}
for line in mt_bench['train']:
    if line['model_a'] == 'gpt-4':
        gpt4_turns, other_turns, other_model = line['conversation_a'], line['conversation_b'], line['model_b']
    elif line['model_b'] == 'gpt-4':
        gpt4_turns, other_turns, other_model = line['conversation_b'], line['conversation_a'], line['model_a']
    else:
        continue
    generation_pairs[other_model].extend(zip(gpt4_turns, other_turns))

# Estimator engine, sampling configuration and tokenizer used by `inference` below.
llm = LLM(
    est_path,
    tensor_parallel_size=torch.cuda.device_count(),
    max_model_len=2048,
    gpu_memory_utilization=0.8,
)
sampling_params = SamplingParams(
    max_tokens=1,
    prompt_logprobs=0,
    temperature=0,
)
tokenizer = AutoTokenizer.from_pretrained(est_path)

def _infer_and_save(turns, save_path):
    """Pair user and non-user turns by position, run `inference` on the
    non-empty pairs and save the result to ``save_path``."""
    prompts = [turn['content'] for turn in turns if turn['role'] == 'user']
    completions = [turn['content'] for turn in turns if turn['role'] != 'user']
    table = pa.Table.from_pydict(
        dict(zip(schema.names, (prompts, completions))),
        schema=schema
    )
    data = Dataset(table).filter(lambda x: x['input'] != '' and x['output'] != '')
    inferenced_data = inference(llm, sampling_params, data, tokenizer)
    inferenced_data.save_to_disk(save_path)


for model_name in tqdm(models):
    pairs = generation_pairs[model_name]
    # The paths are built without reusing the f-string's own quote character
    # inside a replacement field, which is a SyntaxError before Python 3.12.
    base_path = f'./inference/{inference_model_name}-inferenced.{model_name}'

    # pair[1] holds the evaluated model's turns, pair[0] the gpt-4 turns.
    _infer_and_save([pair[1] for pair in pairs], base_path + '.mt-bench')
    _infer_and_save([pair[0] for pair in pairs], base_path + '-gpt4.mt-bench')

In [None]:
import json
# run face.py first
eval_path = './eval/real-en/raw'
scores = {}
# NOTE(review): this rebinds the notebook-wide `models` to the names parsed
# from the file names below; later cells read `models` again — confirm intended.
models = []
for file in os.listdir(eval_path):
    # The model name is the second dot-separated field of the file name.
    model = file.split('.')[1]
    models.append(model)
    with open(os.path.join(eval_path, file)) as f:
        parsed_lines = [json.loads(raw_line) for raw_line in f]
    # Only the first line of the file is used.
    records = parsed_lines[0]
    scores[model] = {
        name: np.mean([records[idx][name] for idx in range(len(records))])
        for name in ('so', 'corr', 'spear', 'emd', 'kl', 'js')
    }

# One line per model with every mean metric.
for model in models:
    cells = ''.join(f' {key}: {value:.4f} |' for key, value in scores[model].items())
    print(f'{model:13s} ||' + cells)

metrics = ['so', 'corr', 'spear', 'emd', 'kl', 'js']
for metric in metrics:
    # so/corr/spear are listed from largest to smallest, the rest ascending.
    descending = metric in ['so', 'corr', 'spear']
    ranked = sorted(
        ((model, scores[model][metric]) for model in models),
        key=lambda pair: pair[1],
        reverse=descending,
    )
    print(metric)
    print('-' * 10)
    for name, value in ranked:
        print(f'{name:13s} | {value:.4f}')
    print('=' * 10)

### MT-Bench MAUVE Scores

In [None]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from inference import inference
from tqdm import tqdm
import pyarrow as pa
import torch

# Model path; `mauve_pipeline` below passes it as the featurizer model name.
est_path = '/data1/model/pythia-1_4b-base'
inference_model_name = est_path.rsplit('/', 1)[-1]
gpu_mem = 0.9

# Two string columns shared by every table built in this cell.
schema = pa.schema([
    pa.field('input', pa.string()),
    pa.field('output', pa.string()),
])
# model_generations = {model: set() for model in models}
# for line in mt_bench['train']:
#     for content in line['conversation_a']:
#         model_generations[line['model_a']].add(content['content'])
#     for content in line['conversation_b']:
#         model_generations[line['model_b']].add(content['content'])

# For every row that involves gpt-4, store (gpt-4 turn, other model's turn)
# tuples under the other model's name.
generation_pairs = {model: [] for model in models}
for line in mt_bench['train']:
    if line['model_a'] == 'gpt-4':
        gpt4_turns, other_turns, other_model = line['conversation_a'], line['conversation_b'], line['model_b']
    elif line['model_b'] == 'gpt-4':
        gpt4_turns, other_turns, other_model = line['conversation_b'], line['conversation_a'], line['model_a']
    else:
        continue
    generation_pairs[other_model].extend(zip(gpt4_turns, other_turns))

# NOTE(review): llm, sampling_params and tokenizer are not referenced by the
# remaining code of this cell — confirm whether this setup is still needed here.
llm = LLM(
    est_path,
    tensor_parallel_size=torch.cuda.device_count(),
    max_model_len=2048,
    gpu_memory_utilization=0.8,
)
sampling_params = SamplingParams(
    max_tokens=1,
    prompt_logprobs=0,
    temperature=0,
)
tokenizer = AutoTokenizer.from_pretrained(est_path)

def mauve_pipeline(human_data, model_data):
    """Call ``mauve.compute_mauve`` on the concatenated input+output texts.

    ``model_data`` supplies ``p_text`` and ``human_data`` supplies ``q_text``;
    ``est_path`` is passed as the featurizer model name.
    """
    def _joined(rows):
        return [row['input'] + row['output'] for row in rows]

    predictions = _joined(model_data)
    references = _joined(human_data)
    return mauve.compute_mauve(
        p_text=predictions,
        q_text=references,
        featurize_model_name=est_path,
        verbose=False,
    )

def _paired_turns(turns):
    """Pair user and non-user turns by position and return a Dataset of the
    pairs whose input and output are both non-empty."""
    prompts = [turn['content'] for turn in turns if turn['role'] == 'user']
    replies = [turn['content'] for turn in turns if turn['role'] != 'user']
    table = pa.Table.from_pydict(
        dict(zip(schema.names, (prompts, replies))),
        schema=schema
    )
    return Dataset(table).filter(lambda x: x['input'] != '' and x['output'] != '')


mauve_scores = {}

for model_name in tqdm(models):
    pairs = generation_pairs[model_name]
    # pair[1] holds the evaluated model's turns, pair[0] the gpt-4 turns.
    model_data = _paired_turns([pair[1] for pair in pairs])
    human_data = _paired_turns([pair[0] for pair in pairs])
    mauve_scores[model_name] = mauve_pipeline(human_data, model_data)

print(mauve_scores)

### MT-Bench Zipf Scores

In [None]:
from tqdm import tqdm
import pyarrow as pa

# NOTE(review): none of the names assigned here are referenced by the rest of
# this cell — confirm whether they are still needed.
est_path = '/data1/model/pythia-1_4b-base'
inference_model_name = est_path.rsplit('/', 1)[-1]
gpu_mem = 0.9

schema = pa.schema([
    pa.field('input', pa.string()),
    pa.field('output', pa.string()),
])
# model_generations = {model: set() for model in models}
# for line in mt_bench['train']:
#     for content in line['conversation_a']:
#         model_generations[line['model_a']].add(content['content'])
#     for content in line['conversation_b']:
#         model_generations[line['model_b']].add(content['content'])

# For every row that involves gpt-4, store (gpt-4 turn, other model's turn)
# tuples under the other model's name.
generation_pairs = {model: [] for model in models}
for line in mt_bench['train']:
    if line['model_a'] == 'gpt-4':
        gpt4_turns, other_turns, other_model = line['conversation_a'], line['conversation_b'], line['model_b']
    elif line['model_b'] == 'gpt-4':
        gpt4_turns, other_turns, other_model = line['conversation_b'], line['conversation_a'], line['model_a']
    else:
        continue
    generation_pairs[other_model].extend(zip(gpt4_turns, other_turns))

from matplotlib import pyplot as plt


def _zipf_tokens(text):
    """Split on whitespace, lower-case, and keep only alphabetic characters.

    Tokens that contain no alphabetic character (numbers, punctuation) would
    collapse to the empty string; they are dropped instead of being counted
    as a word.
    """
    cleaned = (''.join(ch for ch in token.lower() if ch.isalpha()) for token in text.split())
    return [token for token in cleaned if token]


def _model_tokens(model_name):
    """Yield the tokens of every non-user turn stored for ``model_name``
    (the second element of each generation pair)."""
    for text_pair in generation_pairs[model_name]:
        if text_pair[1]['role'] != 'user':
            yield from _zipf_tokens(text_pair[1]['content'])


# Zipf dict: word -> number of occurrences over all models' outputs
zipf_dict = {}
for model_name in models:
    for word in _model_tokens(model_name):
        zipf_dict[word] = zipf_dict.get(word, 0) + 1

# zipf_dict = dict(sorted(zipf_dict.items(), key=lambda x: x[1], reverse=True))
# x = list(zipf_dict.keys())  [:50]
# y = list(zipf_dict.values())[:50]
# plt.xticks(rotation=90)
# plt.plot(x, y)
# plt.show()

# Accumulate explicitly instead of calling `sum(...)`: this keeps the cell
# working even if the name `sum` has been rebound in the kernel by another cell.
total_freq = 0
for count in zipf_dict.values():
    total_freq += count

# Per model: mean relative corpus frequency of the words it produced.
zipf_scores = {}
for model_name in models:
    total_prob = 0
    cnt = 0
    for word in _model_tokens(model_name):
        total_prob += zipf_dict[word] / total_freq
        cnt += 1
    if cnt == 0:
        # No words recorded for this model, so it gets no score.
        continue
    zipf_scores[model_name] = total_prob / cnt

zipf_scores = dict(sorted(zipf_scores.items(), key=lambda x: x[1], reverse=True))
x = list(zipf_scores.keys())  [:50]
y = list(zipf_scores.values())[:50]
plt.bar(x, y)
# Rotate after drawing so the rotation is applied to the bar labels.
plt.xticks(rotation=90)
plt.show()
print(zipf_scores)

### BERT Scores BT

In [None]:
import logging
import transformers
# transformers.tokenization_utils.logger.setLevel(logging.ERROR)
# transformers.configuration_utils.logger.setLevel(logging.ERROR)
# transformers.modeling_utils.logger.setLevel(logging.ERROR)
import bert_score
# Last expression of the cell: evaluates to the installed bert_score version string.
bert_score.__version__

In [None]:
from tqdm import tqdm
import pyarrow as pa
from bert_score import score
# Zero-initialised model-by-model table for the BERTScore comparison.
bert_win_count = {outer: dict.fromkeys(models, 0) for outer in models}

# Two string columns shared by every table built in this cell.
schema = pa.schema([
    pa.field('input', pa.string()),
    pa.field('output', pa.string()),
])

def bert_pipeline(human_data, model_data):
    """Score the model outputs against the reference outputs with ``score``.

    Returns the third value returned by ``score`` (named F1 here), converted
    to a plain list.
    """
    candidates = [row['output'] for row in model_data]
    gold = [row['output'] for row in human_data]
    _precision, _recall, f1 = score(cands=candidates, refs=gold, lang='en', verbose=False)
    return f1.tolist()

# Map every user message to the list of gpt-4 replies that follow it, taken
# from each row in which gpt-4 is model_a (checked first) or model_b.
gpt4_dict = {}
for line in tqdm(mt_bench['train']):
    if line['model_a'] == 'gpt-4':
        gpt4_conversation = line['conversation_a']
    elif line['model_b'] == 'gpt-4':
        gpt4_conversation = line['conversation_b']
    else:
        continue
    for text in gpt4_conversation:
        if text['role'] == 'user':
            line_input = text['content']
        else:
            gpt4_dict.setdefault(line_input, []).append(text['content'])

def eval_line(conversation_name):
    """Return the mean ``bert_pipeline`` score for one conversation of the current row.

    Reads the loop variable ``line`` of the cell (looked up at call time).
    Each non-empty assistant turn is paired with the first gpt-4 reply stored
    in ``gpt4_dict`` for the same user message.
    """
    prompts, answers, gpt4_answers = [], [], []
    for message in line[conversation_name]:
        if message['role'] == 'user':
            text_input = message['content']
            continue
        if text_input == "" or message['content'] == "":
            continue
        prompts.append(text_input)
        answers.append(message['content'])
        gpt4_answers.append(gpt4_dict[text_input][0])

    model_data = Dataset(
        pa.Table.from_pydict(dict(zip(schema.names, (prompts, answers))), schema=schema)
    )
    human_data = Dataset(
        pa.Table.from_pydict(dict(zip(schema.names, (prompts, gpt4_answers))), schema=schema)
    )
    return np.mean(bert_pipeline(human_data, model_data))


for line in tqdm(mt_bench['train']):
    result_a = eval_line('conversation_a')
    result_b = eval_line('conversation_b')

    # The larger score wins; the table is keyed [loser][winner].
    if result_a > result_b:
        bert_win_count[line['model_b']][line['model_a']] += 1
    elif result_a < result_b:
        bert_win_count[line['model_a']][line['model_b']] += 1

In [None]:
# Fit and print the ranking derived from the BERTScore win table.
print("bert score", '-' * 10, sep='\n')
df = pd.DataFrame(bert_win_count)
p, estimates = iterate(df, n=100)
print(p, '=' * 10, sep='\n')

### BART Scores BT

In [None]:
import logging
import transformers
# Raise the level of three transformers module loggers to ERROR.
for _hf_module in (
    transformers.tokenization_utils,
    transformers.configuration_utils,
    transformers.modeling_utils,
):
    _hf_module.logger.setLevel(logging.ERROR)
from bart_score import BARTScorer
bart_scorer = BARTScorer(checkpoint='facebook/bart-large-cnn', device='cuda:0')

In [None]:
from tqdm import tqdm
import pyarrow as pa
# Zero-initialised model-by-model table for the BARTScore comparison.
bart_win_count = {outer: dict.fromkeys(models, 0) for outer in models}

# Two string columns shared by every table built in this cell.
schema = pa.schema([
    pa.field('input', pa.string()),
    pa.field('output', pa.string()),
])

def bart_pipeline(human_data, model_data):
    """Return ``bart_scorer.score`` of the model outputs (first argument)
    against the reference outputs (second argument)."""
    candidates = [row['output'] for row in model_data]
    gold = [row['output'] for row in human_data]
    return bart_scorer.score(candidates, gold)

# Map every user message to the list of gpt-4 replies that follow it, taken
# from each row in which gpt-4 is model_a (checked first) or model_b.
gpt4_dict = {}
for line in tqdm(mt_bench['train']):
    if line['model_a'] == 'gpt-4':
        gpt4_conversation = line['conversation_a']
    elif line['model_b'] == 'gpt-4':
        gpt4_conversation = line['conversation_b']
    else:
        continue
    for text in gpt4_conversation:
        if text['role'] == 'user':
            line_input = text['content']
        else:
            gpt4_dict.setdefault(line_input, []).append(text['content'])

def eval_line(conversation_name):
    """Return the mean ``bart_pipeline`` score for one conversation of the current row.

    Reads the loop variable ``line`` of the cell (looked up at call time).
    Each non-empty assistant turn is paired with the first gpt-4 reply stored
    in ``gpt4_dict`` for the same user message.
    """
    prompts, answers, gpt4_answers = [], [], []
    for message in line[conversation_name]:
        if message['role'] == 'user':
            text_input = message['content']
            continue
        if text_input == "" or message['content'] == "":
            continue
        prompts.append(text_input)
        answers.append(message['content'])
        gpt4_answers.append(gpt4_dict[text_input][0])

    model_data = Dataset(
        pa.Table.from_pydict(dict(zip(schema.names, (prompts, answers))), schema=schema)
    )
    human_data = Dataset(
        pa.Table.from_pydict(dict(zip(schema.names, (prompts, gpt4_answers))), schema=schema)
    )
    return np.mean(bart_pipeline(human_data, model_data))


for line in tqdm(mt_bench['train']):
    result_a = eval_line('conversation_a')
    result_b = eval_line('conversation_b')

    # The larger score wins; the table is keyed [loser][winner].
    if result_a > result_b:
        bart_win_count[line['model_b']][line['model_a']] += 1
    elif result_a < result_b:
        bart_win_count[line['model_a']][line['model_b']] += 1

In [None]:
# Fit and print the ranking derived from the BARTScore win table.
print("bart score", '-' * 10, sep='\n')
df = pd.DataFrame(bart_win_count)
p, estimates = iterate(df, n=100)
print(p, '=' * 10, sep='\n')