In [1]:
import json
import jsonlines
from collections import defaultdict

import numpy as np

In [2]:
synonym_answers = json.load(open('synonym_answers.json', 'r'))
antonym_answers = json.load(open('antonym_answers.json', 'r'))

In [3]:
model_name_dict = {
    'bert-base-uncased': 'BERT$_{base}$',
    'bert-large-uncased': 'BERT$_{large}$',
    'albert-base-v1': 'ALBERT1$_{base}$',
    'albert-large-v1': 'ALBERT1$_{large}$',
    'albert-xlarge-v1': 'ALBERT1$_{xlarge}$',
    'albert-base-v2': 'ALBERT2$_{base}$',
    'albert-large-v2': 'ALBERT2$_{large}$',
    'albert-xlarge-v2': 'ALBERT2$_{xlarge}$',
    'roberta-base': 'RoBERTa$_{base}$',
    'roberta-large': 'RoBERTa$_{large}$',
    'gpt-neo-125m': 'GPT-Neo 125M',
    'gpt-neo-1.3B': 'GPT-Neo 1.3B',
    'gpt-neo-2.7B': 'GPT-Neo 2.7B',
    'gpt-j-6b': 'GPT-J 6B',
    # 'gpt-3.5-turbo-0125': 'ChatGPT-3.5',
    # 'gpt-4-0125-preview': 'ChatGPT-4'
}

In [4]:
def strip_lower(words):
    new_words = []
    for word in words:
        new_words.append(word.lower().strip())
    return new_words

In [34]:
for model_name in model_name_dict.keys():
    try:
        data = jsonlines.open(f'results/{model_name}_opposite_relation_predictions.jsonl')
    except:
        continue

    synonym_examples, antonym_examples = {}, {}

    for example in data.iter():
        if example['rel_id'] == 'Synonym':
            synonym_examples[example['subj']] = example
        elif example['rel_id'] == 'Antonym':
            antonym_examples[example['subj']] = example

    overlap_k_list = defaultdict(list)
    miss_k_synonym_list = defaultdict(list)
    miss_k_antonym_list = defaultdict(list)

    for example in synonym_examples.values():
        top_100_words = strip_lower(example['top_100_text'])
        if example['subj'] in antonym_examples:
            top_100_words_opposite = strip_lower(antonym_examples[example['subj']]['top_100_text'])

            for k in [1, 10, 100]:
                top_k_words = top_100_words[:k]
                top_k_words_opposite = top_100_words_opposite[:k]

                overlap_k = len(set(top_k_words) & set(top_k_words_opposite)) / k
                overlap_k_list[k].append(overlap_k)

        for k in [1, 10, 100]:
            top_k_words = top_100_words[:k]
            gold_objects_opposite = antonym_answers[example['subj']]

            miss_k = len(set(top_k_words) & set(gold_objects_opposite)) / len(set(gold_objects_opposite))
            miss_k_synonym_list[k].append(miss_k)

    for example in antonym_examples.values():
        top_100_words = strip_lower(example['top_100_text'])
        if example['subj'] in synonym_examples:
            top_100_words_opposite = strip_lower(synonym_examples[example['subj']]['top_100_text'])

            for k in [1, 10, 100]:
                top_k_words = top_100_words[:k]
                top_k_words_opposite = top_100_words_opposite[:k]

                overlap_k = len(set(top_k_words) & set(top_k_words_opposite)) / k
                overlap_k_list[k].append(overlap_k)

        for k in [1, 10, 100]:
            top_k_words = top_100_words[:k]
            gold_objects_opposite = synonym_answers[example['subj']]

            miss_k = len(set(top_k_words) & set(gold_objects_opposite)) / len(set(gold_objects_opposite))
            miss_k_antonym_list[k].append(miss_k)

    # print(f'{model_name} - len_overlap_k: {len(overlap_k_list[1])}, len_miss_k_synonym: {len(miss_k_synonym_list[1])}, len_miss_k_antonym: {len(miss_k_antonym_list[1])}')

    overlap_1, overlap_10, overlap_100 = overlap_k_list[1], overlap_k_list[10], overlap_k_list[100]
    average_overlap_1 = f'{np.mean(overlap_1):.2f} +- {np.std(overlap_1):.2f}'.split('+-')
    average_overlap_10 = f'{np.mean(overlap_10):.2f} +- {np.std(overlap_10):.2f}'.split('+-')
    average_overlap_100 = f'{np.mean(overlap_100):.2f} +- {np.std(overlap_100):.2f}'.split('+-')

    line = f'& {model_name_dict[model_name]} ' + \
    f'& {average_overlap_1[0].strip()}$\pm${average_overlap_1[1].strip()} ' + \
    f'& {average_overlap_10[0].strip()}$\pm${average_overlap_10[1].strip()} ' + \
    f'& {average_overlap_100[0].strip()}$\pm${average_overlap_100[1].strip()}\\\\'

    print(line)

& BERT$_{base}$ & 0.47$\pm$0.50 & 0.52$\pm$0.20 & 0.55$\pm$0.15\\
& BERT$_{large}$ & 0.39$\pm$0.49 & 0.45$\pm$0.25 & 0.52$\pm$0.18\\
& ALBERT1$_{base}$ & 0.80$\pm$0.40 & 0.56$\pm$0.17 & 0.60$\pm$0.13\\
& ALBERT1$_{large}$ & 0.49$\pm$0.50 & 0.43$\pm$0.17 & 0.48$\pm$0.13\\
& ALBERT1$_{xlarge}$ & 0.53$\pm$0.50 & 0.48$\pm$0.23 & 0.57$\pm$0.17\\
& ALBERT2$_{base}$ & 0.69$\pm$0.46 & 0.62$\pm$0.18 & 0.63$\pm$0.14\\
& ALBERT2$_{large}$ & 0.77$\pm$0.42 & 0.60$\pm$0.20 & 0.62$\pm$0.16\\
& ALBERT2$_{xlarge}$ & 0.42$\pm$0.49 & 0.45$\pm$0.27 & 0.48$\pm$0.23\\
& RoBERTa$_{base}$ & 0.88$\pm$0.33 & 0.37$\pm$0.13 & 0.27$\pm$0.10\\
& RoBERTa$_{large}$ & 0.12$\pm$0.32 & 0.48$\pm$0.15 & 0.47$\pm$0.09\\
& GPT-Neo 125M & 0.27$\pm$0.45 & 0.52$\pm$0.14 & 0.48$\pm$0.09\\
& GPT-Neo 1.3B & 0.47$\pm$0.50 & 0.42$\pm$0.15 & 0.44$\pm$0.10\\
& GPT-Neo 2.7B & 0.20$\pm$0.40 & 0.40$\pm$0.14 & 0.40$\pm$0.08\\
& GPT-J 6B & 0.12$\pm$0.32 & 0.34$\pm$0.14 & 0.33$\pm$0.09\\


In [35]:
for model_name in model_name_dict.keys():
    try:
        data = jsonlines.open(f'results/{model_name}_prompt_tuning_opposite_relation_predictions.jsonl')
    except:
        continue

    synonym_examples, antonym_examples = {}, {}

    for example in data.iter():
        if example['rel_id'] == 'Synonym':
            synonym_examples[example['subj']] = example
        elif example['rel_id'] == 'Antonym':
            antonym_examples[example['subj']] = example

    overlap_k_list = defaultdict(list)
    miss_k_synonym_list = defaultdict(list)
    miss_k_antonym_list = defaultdict(list)

    for example in synonym_examples.values():
        top_100_words = strip_lower(example['top_100_text'])
        if example['subj'] in antonym_examples:
            top_100_words_opposite = strip_lower(antonym_examples[example['subj']]['top_100_text'])

            for k in [1, 10, 100]:
                top_k_words = top_100_words[:k]
                top_k_words_opposite = top_100_words_opposite[:k]

                overlap_k = len(set(top_k_words) & set(top_k_words_opposite)) / k
                overlap_k_list[k].append(overlap_k)

        for k in [1, 10, 100]:
            top_k_words = top_100_words[:k]
            gold_objects_opposite = antonym_answers[example['subj']]

            miss_k = len(set(top_k_words) & set(gold_objects_opposite)) / len(set(gold_objects_opposite))
            miss_k_synonym_list[k].append(miss_k)

    for example in antonym_examples.values():
        top_100_words = strip_lower(example['top_100_text'])
        if example['subj'] in synonym_examples:
            top_100_words_opposite = strip_lower(synonym_examples[example['subj']]['top_100_text'])

            for k in [1, 10, 100]:
                top_k_words = top_100_words[:k]
                top_k_words_opposite = top_100_words_opposite[:k]

                overlap_k = len(set(top_k_words) & set(top_k_words_opposite)) / k
                overlap_k_list[k].append(overlap_k)

        for k in [1, 10, 100]:
            top_k_words = top_100_words[:k]
            gold_objects_opposite = synonym_answers[example['subj']]

            miss_k = len(set(top_k_words) & set(gold_objects_opposite)) / len(set(gold_objects_opposite))
            miss_k_antonym_list[k].append(miss_k)

    # print(f'{model_name} - len_overlap_k: {len(overlap_k_list[1])}, len_miss_k_synonym: {len(miss_k_synonym_list[1])}, len_miss_k_antonym: {len(miss_k_antonym_list[1])}')

    overlap_1, overlap_10, overlap_100 = overlap_k_list[1], overlap_k_list[10], overlap_k_list[100]
    average_overlap_1 = f'{np.mean(overlap_1):.2f} +- {np.std(overlap_1):.2f}'.split('+-')
    average_overlap_10 = f'{np.mean(overlap_10):.2f} +- {np.std(overlap_10):.2f}'.split('+-')
    average_overlap_100 = f'{np.mean(overlap_100):.2f} +- {np.std(overlap_100):.2f}'.split('+-')

    line = f'& {model_name_dict[model_name]} ' + \
    f'& {average_overlap_1[0].strip()}$\pm${average_overlap_1[1].strip()} ' + \
    f'& {average_overlap_10[0].strip()}$\pm${average_overlap_10[1].strip()} ' + \
    f'& {average_overlap_100[0].strip()}$\pm${average_overlap_100[1].strip()}\\\\'

    print(line)

& BERT$_{base}$ & 0.46$\pm$0.50 & 0.43$\pm$0.19 & 0.48$\pm$0.13\\
& BERT$_{large}$ & 0.36$\pm$0.48 & 0.37$\pm$0.19 & 0.43$\pm$0.13\\
& GPT-Neo 125M & 0.02$\pm$0.14 & 0.09$\pm$0.03 & 0.09$\pm$0.02\\
& GPT-J 6B & 0.04$\pm$0.19 & 0.13$\pm$0.16 & 0.19$\pm$0.12\\


In [5]:
model_name_dict = {
    'gpt-3.5-turbo-0125': 'ChatGPT-3.5',
    'gpt-4-0125-preview': 'ChatGPT-4'
}

In [6]:
for model_name in model_name_dict.keys():
    try:
        data = jsonlines.open(f'results/{model_name}_opposite_relation_predictions.jsonl')
    except:
        continue

    synonym_examples, antonym_examples = {}, {}

    for example in data.iter():
        if example['rel_id'] == 'Synonym':
            synonym_examples[example['subj']] = example
        elif example['rel_id'] == 'Antonym':
            antonym_examples[example['subj']] = example

    overlap_k_list = defaultdict(list)
    miss_k_synonym_list = defaultdict(list)
    miss_k_antonym_list = defaultdict(list)

    for example in synonym_examples.values():
        top_100_words = strip_lower(example['top_5_text'])
        if example['subj'] in antonym_examples:
            top_100_words_opposite = strip_lower(antonym_examples[example['subj']]['top_5_text'])

            for k in [1]:
                top_k_words = top_100_words[:k]
                top_k_words_opposite = top_100_words_opposite[:k]

                overlap_k = len(set(top_k_words) & set(top_k_words_opposite)) / k
                overlap_k_list[k].append(overlap_k)

        for k in [1]:
            top_k_words = top_100_words[:k]
            gold_objects_opposite = antonym_answers[example['subj']]

            miss_k = len(set(top_k_words) & set(gold_objects_opposite)) / len(set(gold_objects_opposite))
            miss_k_synonym_list[k].append(miss_k)

    for example in antonym_examples.values():
        top_100_words = strip_lower(example['top_5_text'])
        if example['subj'] in synonym_examples:
            top_100_words_opposite = strip_lower(synonym_examples[example['subj']]['top_5_text'])

            for k in [1]:
                top_k_words = top_100_words[:k]
                top_k_words_opposite = top_100_words_opposite[:k]

                overlap_k = len(set(top_k_words) & set(top_k_words_opposite)) / k
                overlap_k_list[k].append(overlap_k)

        for k in [1]:
            top_k_words = top_100_words[:k]
            gold_objects_opposite = synonym_answers[example['subj']]

            miss_k = len(set(top_k_words) & set(gold_objects_opposite)) / len(set(gold_objects_opposite))
            miss_k_antonym_list[k].append(miss_k)

    # print(f'{model_name} - len_overlap_k: {len(overlap_k_list[1])}, len_miss_k_synonym: {len(miss_k_synonym_list[1])}, len_miss_k_antonym: {len(miss_k_antonym_list[1])}')

    overlap_1, overlap_10, overlap_100 = overlap_k_list[1], overlap_k_list[10], overlap_k_list[100]
    average_overlap_1 = f'{np.mean(overlap_1):.2f} +- {np.std(overlap_1):.2f}'.split('+-')
    average_overlap_10 = f'{np.mean(overlap_10):.2f} +- {np.std(overlap_10):.2f}'.split('+-')
    average_overlap_100 = f'{np.mean(overlap_100):.2f} +- {np.std(overlap_100):.2f}'.split('+-')

    line = f'& {model_name_dict[model_name]} ' + \
    f'& {average_overlap_1[0].strip()}$\pm${average_overlap_1[1].strip()} ' + \
    f'& {average_overlap_10[0].strip()}$\pm${average_overlap_10[1].strip()} ' + \
    f'& {average_overlap_100[0].strip()}$\pm${average_overlap_100[1].strip()}\\\\'

    print(line)

& ChatGPT-3.5 & 0.03$\pm$0.18 & nan$\pm$nan & nan$\pm$nan\\
& ChatGPT-4 & 0.33$\pm$0.47 & nan$\pm$nan & nan$\pm$nan\\


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [7]:
for model_name in model_name_dict.keys():
    try:
        data = jsonlines.open(f'results/{model_name}_opposite_relation_predictions_4_shot.jsonl')
    except:
        continue

    synonym_examples, antonym_examples = {}, {}

    for example in data.iter():
        if example['rel_id'] == 'Synonym':
            synonym_examples[example['subj']] = example
        elif example['rel_id'] == 'Antonym':
            antonym_examples[example['subj']] = example

    overlap_k_list = defaultdict(list)
    miss_k_synonym_list = defaultdict(list)
    miss_k_antonym_list = defaultdict(list)

    for example in synonym_examples.values():
        top_100_words = strip_lower(example['top_5_text'])
        if example['subj'] in antonym_examples:
            top_100_words_opposite = strip_lower(antonym_examples[example['subj']]['top_5_text'])

            for k in [1]:
                top_k_words = top_100_words[:k]
                top_k_words_opposite = top_100_words_opposite[:k]

                overlap_k = len(set(top_k_words) & set(top_k_words_opposite)) / k
                overlap_k_list[k].append(overlap_k)

        for k in [1]:
            top_k_words = top_100_words[:k]
            gold_objects_opposite = antonym_answers[example['subj']]

            miss_k = len(set(top_k_words) & set(gold_objects_opposite)) / len(set(gold_objects_opposite))
            miss_k_synonym_list[k].append(miss_k)

    for example in antonym_examples.values():
        top_100_words = strip_lower(example['top_5_text'])
        if example['subj'] in synonym_examples:
            top_100_words_opposite = strip_lower(synonym_examples[example['subj']]['top_5_text'])

            for k in [1]:
                top_k_words = top_100_words[:k]
                top_k_words_opposite = top_100_words_opposite[:k]

                overlap_k = len(set(top_k_words) & set(top_k_words_opposite)) / k
                overlap_k_list[k].append(overlap_k)

        for k in [1]:
            top_k_words = top_100_words[:k]
            gold_objects_opposite = synonym_answers[example['subj']]

            miss_k = len(set(top_k_words) & set(gold_objects_opposite)) / len(set(gold_objects_opposite))
            miss_k_antonym_list[k].append(miss_k)

    # print(f'{model_name} - len_overlap_k: {len(overlap_k_list[1])}, len_miss_k_synonym: {len(miss_k_synonym_list[1])}, len_miss_k_antonym: {len(miss_k_antonym_list[1])}')

    overlap_1, overlap_10, overlap_100 = overlap_k_list[1], overlap_k_list[10], overlap_k_list[100]
    average_overlap_1 = f'{np.mean(overlap_1):.2f} +- {np.std(overlap_1):.2f}'.split('+-')
    average_overlap_10 = f'{np.mean(overlap_10):.2f} +- {np.std(overlap_10):.2f}'.split('+-')
    average_overlap_100 = f'{np.mean(overlap_100):.2f} +- {np.std(overlap_100):.2f}'.split('+-')

    line = f'& {model_name_dict[model_name]} ' + \
    f'& {average_overlap_1[0].strip()}$\pm${average_overlap_1[1].strip()} ' + \
    f'& {average_overlap_10[0].strip()}$\pm${average_overlap_10[1].strip()} ' + \
    f'& {average_overlap_100[0].strip()}$\pm${average_overlap_100[1].strip()}\\\\'

    print(line)

& ChatGPT-3.5 & 0.03$\pm$0.18 & nan$\pm$nan & nan$\pm$nan\\
& ChatGPT-4 & 0.02$\pm$0.12 & nan$\pm$nan & nan$\pm$nan\\
