In [11]:
import json
import os
import numpy as np

In [12]:
# Map each answer letter ('a'..'e') to its zero-based choice index.
sym_to_num = {letter: position for position, letter in enumerate('abcde')}

In [13]:
# Load the full multiple-choice test set. The context manager closes the file
# handle as soon as parsing is done instead of leaving it to the garbage collector.
with open('../data/analogy/MC_test.json', 'r') as fin:
    gt = json.load(fin)
# Show the first sample so the record layout is visible in the notebook output.
gt[0]

In [14]:
# Load the filtered multiple-choice test set; the context manager guarantees
# the file handle is closed once parsing is done.
with open('../data/analogy_filtered/MC_test.json', 'r') as fin:
    gt_filtered = json.load(fin)
# uids of the samples present in the filtered set.
gt_filtered_uids = [sample['uid'] for sample in gt_filtered]

In [15]:
# Score the perplexity-based models on the full test set (`gt`).
#
# For every model, results/<model_name>.json is read as a list of records that
# each carry a 'uid', a 'ppl' and a 'tail_ppl' value. The record belonging to
# choice i of a sample has uid sample['uid'] + '_<i>'. Choices are ranked by
# ascending score, so rank 0 means the gold answer got the lowest score.
# Per-sample results go to results/<model_name>_pred.json and the averaged
# metrics to results/<model_name>_summary.json.
model_names = ['gpt-neo-125m', 'gpt-neo-1.3B', 'gpt-neo-2.7B', 'gpt-j-6b',
               'bert-base-uncased', 'bert-large-uncased',
               'roberta-base', 'roberta-large',
               'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
               'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2',
               'Meta-Llama-3-8B', 'Meta-Llama-3-8B-Instruct',
               ]

for model_name in model_names:
    pred_path = os.path.join('results', f'{model_name}.json')
    try:
        with open(pred_path, 'r') as fin:
            predictions = json.load(fin)
    except (OSError, json.JSONDecodeError) as err:
        # Best-effort: a model without a readable predictions file is skipped,
        # but the skip is reported instead of silently swallowing every
        # exception (a bare `except:` would also hide KeyboardInterrupt).
        print(f'Skipping {model_name}: {err}')
        continue

    # Index the prediction records by uid for direct lookup.
    pred_dict = {pred['uid']: pred for pred in predictions}

    results = []
    ranks, accs, tail_ranks, tail_accs = [], [], [], []

    for sample in gt:
        query = sample['query']
        choice = sample['choice']
        answer_idx = sym_to_num[sample['output']]

        # Fetch each choice's record once and read both scores from it.
        candidates = [pred_dict[sample['uid'] + f'_{i}'] for i in range(len(choice))]
        ppls = [cand['ppl'] for cand in candidates]
        tail_ppls = [cand['tail_ppl'] for cand in candidates]

        # argsort is ascending: position 0 holds the choice with the lowest score.
        order = np.argsort(ppls).tolist()
        rank = order.index(answer_idx)
        accuracy = int(rank == 0)
        tail_order = np.argsort(tail_ppls).tolist()
        tail_rank = tail_order.index(answer_idx)
        tail_accuracy = int(tail_rank == 0)

        results.append({
            'uid': sample['uid'], 'query': query, 'choice': choice,
            'answer': answer_idx, 'order': order, 'rank': rank,
            'accuracy': accuracy, 'ppl': ppls[answer_idx],
            'tail_order': tail_order, 'tail_rank': tail_rank,
            'tail_accuracy': tail_accuracy, 'tail_ppl': tail_ppls[answer_idx],
        })

        ranks.append(rank)
        accs.append(accuracy)
        tail_ranks.append(tail_rank)
        tail_accs.append(tail_accuracy)

    # np.mean returns a NumPy scalar; store built-in floats in the summary.
    summary = {'rank': float(np.mean(ranks)), 'accuracy': float(np.mean(accs)),
               'tail_rank': float(np.mean(tail_ranks)),
               'tail_accuracy': float(np.mean(tail_accs))}

    with open(f'results/{model_name}_pred.json', 'w') as fout:
        json.dump(results, fout)
    with open(f'results/{model_name}_summary.json', 'w') as fout:
        json.dump(summary, fout)

In [16]:
# Score the perplexity-based models on the filtered test set (`gt_filtered`).
#
# For every model, results/<model_name>.json is read as a list of records that
# each carry a 'uid', a 'ppl' and a 'tail_ppl' value. The record belonging to
# choice i of a sample has uid sample['uid'] + '_<i>'. Choices are ranked by
# ascending score, so rank 0 means the gold answer got the lowest score.
# Per-sample results go to results/<model_name>_pred_filtered.json and the
# averaged metrics to results/<model_name>_summary_filtered.json.
model_names = ['gpt-neo-125m', 'gpt-neo-1.3B', 'gpt-neo-2.7B', 'gpt-j-6b',
               'bert-base-uncased', 'bert-large-uncased',
               'roberta-base', 'roberta-large',
               'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
               'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2',
               'Meta-Llama-3-8B', 'Meta-Llama-3-8B-Instruct',
               ]

for model_name in model_names:
    pred_path = os.path.join('results', f'{model_name}.json')
    try:
        with open(pred_path, 'r') as fin:
            predictions = json.load(fin)
    except (OSError, json.JSONDecodeError) as err:
        # Best-effort: a model without a readable predictions file is skipped,
        # but the skip is reported instead of silently swallowing every
        # exception (a bare `except:` would also hide KeyboardInterrupt).
        print(f'Skipping {model_name}: {err}')
        continue

    # Index the prediction records by uid for direct lookup.
    pred_dict = {pred['uid']: pred for pred in predictions}

    results = []
    ranks, accs, tail_ranks, tail_accs = [], [], [], []

    for sample in gt_filtered:
        query = sample['query']
        choice = sample['choice']
        answer_idx = sym_to_num[sample['output']]

        # Fetch each choice's record once and read both scores from it.
        candidates = [pred_dict[sample['uid'] + f'_{i}'] for i in range(len(choice))]
        ppls = [cand['ppl'] for cand in candidates]
        tail_ppls = [cand['tail_ppl'] for cand in candidates]

        # argsort is ascending: position 0 holds the choice with the lowest score.
        order = np.argsort(ppls).tolist()
        rank = order.index(answer_idx)
        accuracy = int(rank == 0)
        tail_order = np.argsort(tail_ppls).tolist()
        tail_rank = tail_order.index(answer_idx)
        tail_accuracy = int(tail_rank == 0)

        results.append({
            'uid': sample['uid'], 'query': query, 'choice': choice,
            'answer': answer_idx, 'order': order, 'rank': rank,
            'accuracy': accuracy, 'ppl': ppls[answer_idx],
            'tail_order': tail_order, 'tail_rank': tail_rank,
            'tail_accuracy': tail_accuracy, 'tail_ppl': tail_ppls[answer_idx],
        })

        ranks.append(rank)
        accs.append(accuracy)
        tail_ranks.append(tail_rank)
        tail_accs.append(tail_accuracy)

    # np.mean returns a NumPy scalar; store built-in floats in the summary.
    summary = {'rank': float(np.mean(ranks)), 'accuracy': float(np.mean(accs)),
               'tail_rank': float(np.mean(tail_ranks)),
               'tail_accuracy': float(np.mean(tail_accs))}

    with open(f'results/{model_name}_pred_filtered.json', 'w') as fout:
        json.dump(results, fout)
    with open(f'results/{model_name}_summary_filtered.json', 'w') as fout:
        json.dump(summary, fout)

In [19]:
# Score the chat models on the full test set (`gt`).
#
# Predictions are read from
# results/<model_name>/raw_pred_analogy_remove_stopwords.json as a list of
# records keyed by 'uid'; each record carries a 'top_k_tokens_remove_stopwords'
# list whose first entry is treated as the model's answer.
# Per-sample results go to results/<model_name>_pred.json and the mean accuracy
# to results/<model_name>_summary.json.
model_names = ['gpt-3.5-turbo-0125', 'gpt-4o-2024-08-06']

for model_name in model_names:
    pred_path = os.path.join('results', model_name, 'raw_pred_analogy_remove_stopwords.json')
    with open(pred_path, 'r') as fin:
        predictions = json.load(fin)

    # Index the prediction records by uid for direct lookup.
    pred_dict = {pred['uid']: pred for pred in predictions}

    results = []
    accs = []

    for sample in gt:
        answer_idx = sym_to_num[sample['output']]
        # Read the query from the current sample. Without this assignment the
        # name `query` would be whatever an earlier cell left behind (or
        # undefined on a fresh kernel), tagging every record with a stale query.
        query = sample['query']
        choice = sample['choice']
        preds = pred_dict[sample['uid']]['top_k_tokens_remove_stopwords']
        top_1_pred = preds[0].lower().strip()

        # NOTE(review): this is a substring test, so the gold letter also
        # matches any top-1 token that merely contains it - confirm that the
        # top-1 tokens are bare option letters.
        accuracy = int(sample['output'].lower().strip() in top_1_pred)

        results.append({'uid': sample['uid'], 'query': query, 'choice': choice,
                        'answer': answer_idx, 'accuracy': accuracy})
        accs.append(accuracy)

    # np.mean returns a NumPy scalar; store a built-in float in the summary.
    summary = {'accuracy': float(np.mean(accs))}

    with open(f'results/{model_name}_pred.json', 'w') as fout:
        json.dump(results, fout)
    with open(f'results/{model_name}_summary.json', 'w') as fout:
        json.dump(summary, fout)

In [20]:
# Score the chat models on the filtered test set (`gt_filtered`).
#
# Predictions are read from
# results/<model_name>/raw_pred_analogy_remove_stopwords.json as a list of
# records keyed by 'uid'; each record carries a 'top_k_tokens_remove_stopwords'
# list whose first entry is treated as the model's answer.
# Per-sample results go to results/<model_name>_pred_filtered.json and the mean
# accuracy to results/<model_name>_summary_filtered.json.
model_names = ['gpt-3.5-turbo-0125', 'gpt-4o-2024-08-06']

for model_name in model_names:
    pred_path = os.path.join('results', model_name, 'raw_pred_analogy_remove_stopwords.json')
    with open(pred_path, 'r') as fin:
        predictions = json.load(fin)

    # Index the prediction records by uid for direct lookup.
    pred_dict = {pred['uid']: pred for pred in predictions}

    results = []
    accs = []

    for sample in gt_filtered:
        answer_idx = sym_to_num[sample['output']]
        # Read the query from the current sample. Without this assignment the
        # name `query` would be whatever an earlier cell left behind (or
        # undefined on a fresh kernel), tagging every record with a stale query.
        query = sample['query']
        choice = sample['choice']
        preds = pred_dict[sample['uid']]['top_k_tokens_remove_stopwords']
        top_1_pred = preds[0].lower().strip()

        # NOTE(review): this is a substring test, so the gold letter also
        # matches any top-1 token that merely contains it - confirm that the
        # top-1 tokens are bare option letters.
        accuracy = int(sample['output'].lower().strip() in top_1_pred)

        results.append({'uid': sample['uid'], 'query': query, 'choice': choice,
                        'answer': answer_idx, 'accuracy': accuracy})
        accs.append(accuracy)

    # np.mean returns a NumPy scalar; store a built-in float in the summary.
    summary = {'accuracy': float(np.mean(accs))}

    with open(f'results/{model_name}_pred_filtered.json', 'w') as fout:
        json.dump(results, fout)
    with open(f'results/{model_name}_summary_filtered.json', 'w') as fout:
        json.dump(summary, fout)