In [None]:
import json
import jsonlines
from collections import defaultdict

import numpy as np

In [None]:
# Gold answers for each relation. Downstream cells index these by an example's
# 'subj' value and wrap the result in set(), so each file is presumably a
# {subject: [gold objects]} mapping -- TODO confirm against the data files.
# Context managers close the handles deterministically; JSON is read as UTF-8
# rather than whatever the platform default encoding happens to be.
with open('synonym_answers.json', 'r', encoding='utf-8') as answers_file:
    synonym_answers = json.load(answers_file)
with open('antonym_answers.json', 'r', encoding='utf-8') as answers_file:
    antonym_answers = json.load(answers_file)

In [None]:
# Results-file model identifier -> LaTeX display name used in the printed table rows.
# Insertion order is the order in which rows are printed.
# The ChatGPT models are not listed here; they get their own mapping in a later cell.
model_name_dict = dict([
    # BERT
    ('bert-base-uncased', 'BERT$_{base}$'),
    ('bert-large-uncased', 'BERT$_{large}$'),
    # ALBERT v1
    ('albert-base-v1', 'ALBERT1$_{base}$'),
    ('albert-large-v1', 'ALBERT1$_{large}$'),
    ('albert-xlarge-v1', 'ALBERT1$_{xlarge}$'),
    # ALBERT v2
    ('albert-base-v2', 'ALBERT2$_{base}$'),
    ('albert-large-v2', 'ALBERT2$_{large}$'),
    ('albert-xlarge-v2', 'ALBERT2$_{xlarge}$'),
    # RoBERTa
    ('roberta-base', 'RoBERTa$_{base}$'),
    ('roberta-large', 'RoBERTa$_{large}$'),
    # GPT-Neo / GPT-J
    ('gpt-neo-125m', 'GPT-Neo 125M'),
    ('gpt-neo-1.3B', 'GPT-Neo 1.3B'),
    ('gpt-neo-2.7B', 'GPT-Neo 2.7B'),
    ('gpt-j-6b', 'GPT-J 6B'),
])

In [None]:
def strip_lower(words):
    """Return a new list with every string in `words` lower-cased and stripped.

    Args:
        words: iterable of strings.

    Returns:
        A list of the normalized strings, in the same order as the input.
    """
    return [token.lower().strip() for token in words]

In [None]:
# Zero-shot predictions: for every model, measure how often the top-k predictions
# for one relation contain gold objects of the OPPOSITE relation, then print one
# LaTeX table row: model | synonym @1 @10 @100 | antonym @1 @10 @100.
top_ks = [1, 10, 100]

for model_name in model_name_dict.keys():
    # Models without a results file are skipped on purpose. Any other failure
    # (permissions, malformed JSON, ...) is allowed to surface instead of being
    # hidden by a bare `except`.
    try:
        data = jsonlines.open(f'results/{model_name}_opposite_relation_predictions.jsonl')
    except FileNotFoundError:
        continue

    synonym_examples, antonym_examples = {}, {}

    # Index predictions by subject; the context manager closes the results file.
    with data:
        for example in data.iter():
            if example['rel_id'] == 'Synonym':
                synonym_examples[example['subj']] = example
            elif example['rel_id'] == 'Antonym':
                antonym_examples[example['subj']] = example

    overlap_k_list = defaultdict(list)
    miss_k_synonym_list = defaultdict(list)
    miss_k_antonym_list = defaultdict(list)

    # One pass per direction:
    # (examples, same-subject examples of the other relation,
    #  gold answers of the other relation, accumulator for the miss rates).
    directions = [
        (synonym_examples, antonym_examples, antonym_answers, miss_k_synonym_list),
        (antonym_examples, synonym_examples, synonym_answers, miss_k_antonym_list),
    ]
    for examples, opposite_examples, opposite_answers, miss_k_list in directions:
        for example in examples.values():
            top_words = strip_lower(example['top_100_text'])

            # Overlap between the two relations' top-k predictions for the same subject.
            # Not part of the printed row; kept for inspection after the cell runs.
            if example['subj'] in opposite_examples:
                top_words_opposite = strip_lower(opposite_examples[example['subj']]['top_100_text'])
                for k in top_ks:
                    shared = set(top_words[:k]) & set(top_words_opposite[:k])
                    overlap_k_list[k].append(len(shared) / k)

            # Fraction of the opposite relation's gold objects found in the top-k predictions.
            # NOTE(review): predictions are lower-cased/stripped but gold objects are used
            # as stored -- confirm the answer files are already normalized.
            gold_objects_opposite = set(opposite_answers[example['subj']])
            for k in top_ks:
                hits = set(top_words[:k]) & gold_objects_opposite
                miss_k_list[k].append(len(hits) / len(gold_objects_opposite))

    # print(f'{model_name} - len_overlap_k: {len(overlap_k_list[1])}, len_miss_k_synonym: {len(miss_k_synonym_list[1])}, len_miss_k_antonym: {len(miss_k_antonym_list[1])}')

    # "mean$\pm$std" per column; a column without samples is printed as '-'
    # instead of calling np.mean on an empty list (nan plus a RuntimeWarning).
    cells = []
    for miss_k_list in (miss_k_synonym_list, miss_k_antonym_list):
        for k in top_ks:
            rates = miss_k_list[k]
            if rates:
                cells.append(f'{np.mean(rates):.2f}$\\pm${np.std(rates):.2f}')
            else:
                cells.append('-')

    line = '& ' + ' & '.join([model_name_dict[model_name], *cells]) + '\\\\'

    print(line)

In [None]:
# Prompt-tuning predictions: for every model, measure how often the top-k predictions
# for one relation contain gold objects of the OPPOSITE relation, then print one
# LaTeX table row: model | synonym @1 @10 @100 | antonym @1 @10 @100.
top_ks = [1, 10, 100]

for model_name in model_name_dict.keys():
    # Models without a results file are skipped on purpose. Any other failure
    # (permissions, malformed JSON, ...) is allowed to surface instead of being
    # hidden by a bare `except`.
    try:
        data = jsonlines.open(f'results/{model_name}_prompt_tuning_opposite_relation_predictions.jsonl')
    except FileNotFoundError:
        continue

    synonym_examples, antonym_examples = {}, {}

    # Index predictions by subject; the context manager closes the results file.
    with data:
        for example in data.iter():
            if example['rel_id'] == 'Synonym':
                synonym_examples[example['subj']] = example
            elif example['rel_id'] == 'Antonym':
                antonym_examples[example['subj']] = example

    overlap_k_list = defaultdict(list)
    miss_k_synonym_list = defaultdict(list)
    miss_k_antonym_list = defaultdict(list)

    # One pass per direction:
    # (examples, same-subject examples of the other relation,
    #  gold answers of the other relation, accumulator for the miss rates).
    directions = [
        (synonym_examples, antonym_examples, antonym_answers, miss_k_synonym_list),
        (antonym_examples, synonym_examples, synonym_answers, miss_k_antonym_list),
    ]
    for examples, opposite_examples, opposite_answers, miss_k_list in directions:
        for example in examples.values():
            top_words = strip_lower(example['top_100_text'])

            # Overlap between the two relations' top-k predictions for the same subject.
            # Not part of the printed row; kept for inspection after the cell runs.
            if example['subj'] in opposite_examples:
                top_words_opposite = strip_lower(opposite_examples[example['subj']]['top_100_text'])
                for k in top_ks:
                    shared = set(top_words[:k]) & set(top_words_opposite[:k])
                    overlap_k_list[k].append(len(shared) / k)

            # Fraction of the opposite relation's gold objects found in the top-k predictions.
            # NOTE(review): predictions are lower-cased/stripped but gold objects are used
            # as stored -- confirm the answer files are already normalized.
            gold_objects_opposite = set(opposite_answers[example['subj']])
            for k in top_ks:
                hits = set(top_words[:k]) & gold_objects_opposite
                miss_k_list[k].append(len(hits) / len(gold_objects_opposite))

    # print(f'{model_name} - len_overlap_k: {len(overlap_k_list[1])}, len_miss_k_synonym: {len(miss_k_synonym_list[1])}, len_miss_k_antonym: {len(miss_k_antonym_list[1])}')

    # "mean$\pm$std" per column; a column without samples is printed as '-'
    # instead of calling np.mean on an empty list (nan plus a RuntimeWarning).
    cells = []
    for miss_k_list in (miss_k_synonym_list, miss_k_antonym_list):
        for k in top_ks:
            rates = miss_k_list[k]
            if rates:
                cells.append(f'{np.mean(rates):.2f}$\\pm${np.std(rates):.2f}')
            else:
                cells.append('-')

    line = '& ' + ' & '.join([model_name_dict[model_name], *cells]) + '\\\\'

    print(line)

In [None]:
# ChatGPT models: results-file identifier -> display name used in the table rows.
# NOTE: this rebinds `model_name_dict`, so the cells below iterate over these two
# models only, and re-running an earlier cell afterwards will also see this mapping.
model_name_dict = dict([
    ('gpt-3.5-turbo-0125', 'ChatGPT-3.5'),
    ('gpt-4-0125-preview', 'ChatGPT-4'),
])

In [None]:
# ChatGPT zero-shot predictions: measure how often the top-1 prediction for one
# relation is a gold object of the OPPOSITE relation, then print one LaTeX table
# row: model | synonym @1 @10 @100 | antonym @1 @10 @100.
# These result files are read through 'top_5_text' and only k=1 is evaluated, so
# the @10/@100 columns are emitted as '-' to keep the row layout of the other tables.
computed_ks = [1]
reported_ks = [1, 10, 100]

for model_name in model_name_dict.keys():
    # Models without a results file are skipped on purpose. Any other failure
    # (permissions, malformed JSON, ...) is allowed to surface instead of being
    # hidden by a bare `except`.
    try:
        data = jsonlines.open(f'results/{model_name}_opposite_relation_predictions.jsonl')
    except FileNotFoundError:
        continue

    synonym_examples, antonym_examples = {}, {}

    # Index predictions by subject; the context manager closes the results file.
    with data:
        for example in data.iter():
            if example['rel_id'] == 'Synonym':
                synonym_examples[example['subj']] = example
            elif example['rel_id'] == 'Antonym':
                antonym_examples[example['subj']] = example

    overlap_k_list = defaultdict(list)
    miss_k_synonym_list = defaultdict(list)
    miss_k_antonym_list = defaultdict(list)

    # One pass per direction:
    # (examples, same-subject examples of the other relation,
    #  gold answers of the other relation, accumulator for the miss rates).
    directions = [
        (synonym_examples, antonym_examples, antonym_answers, miss_k_synonym_list),
        (antonym_examples, synonym_examples, synonym_answers, miss_k_antonym_list),
    ]
    for examples, opposite_examples, opposite_answers, miss_k_list in directions:
        for example in examples.values():
            top_words = strip_lower(example['top_5_text'])

            # Overlap between the two relations' top-k predictions for the same subject.
            # Not part of the printed row; kept for inspection after the cell runs.
            if example['subj'] in opposite_examples:
                top_words_opposite = strip_lower(opposite_examples[example['subj']]['top_5_text'])
                for k in computed_ks:
                    shared = set(top_words[:k]) & set(top_words_opposite[:k])
                    overlap_k_list[k].append(len(shared) / k)

            # Fraction of the opposite relation's gold objects found in the top-k predictions.
            # NOTE(review): predictions are lower-cased/stripped but gold objects are used
            # as stored -- confirm the answer files are already normalized.
            gold_objects_opposite = set(opposite_answers[example['subj']])
            for k in computed_ks:
                hits = set(top_words[:k]) & gold_objects_opposite
                miss_k_list[k].append(len(hits) / len(gold_objects_opposite))

    # print(f'{model_name} - len_overlap_k: {len(overlap_k_list[1])}, len_miss_k_synonym: {len(miss_k_synonym_list[1])}, len_miss_k_antonym: {len(miss_k_antonym_list[1])}')

    # "mean$\pm$std" per column; a column without samples (always the case for
    # k=10 and k=100 here) is printed as '-' instead of calling np.mean on an
    # empty list, which yields nan plus a RuntimeWarning.
    cells = []
    for miss_k_list in (miss_k_synonym_list, miss_k_antonym_list):
        for k in reported_ks:
            rates = miss_k_list[k]
            if rates:
                cells.append(f'{np.mean(rates):.2f}$\\pm${np.std(rates):.2f}')
            else:
                cells.append('-')

    line = '& ' + ' & '.join([model_name_dict[model_name], *cells]) + '\\\\'

    print(line)

In [None]:
# ChatGPT 4-shot predictions: measure how often the top-1 prediction for one
# relation is a gold object of the OPPOSITE relation, then print one LaTeX table
# row: model | synonym @1 @10 @100 | antonym @1 @10 @100.
# These result files are read through 'top_5_text' and only k=1 is evaluated, so
# the @10/@100 columns are emitted as '-' to keep the row layout of the other tables.
computed_ks = [1]
reported_ks = [1, 10, 100]

for model_name in model_name_dict.keys():
    # Models without a results file are skipped on purpose. Any other failure
    # (permissions, malformed JSON, ...) is allowed to surface instead of being
    # hidden by a bare `except`.
    try:
        data = jsonlines.open(f'results/{model_name}_opposite_relation_predictions_4_shot.jsonl')
    except FileNotFoundError:
        continue

    synonym_examples, antonym_examples = {}, {}

    # Index predictions by subject; the context manager closes the results file.
    with data:
        for example in data.iter():
            if example['rel_id'] == 'Synonym':
                synonym_examples[example['subj']] = example
            elif example['rel_id'] == 'Antonym':
                antonym_examples[example['subj']] = example

    overlap_k_list = defaultdict(list)
    miss_k_synonym_list = defaultdict(list)
    miss_k_antonym_list = defaultdict(list)

    # One pass per direction:
    # (examples, same-subject examples of the other relation,
    #  gold answers of the other relation, accumulator for the miss rates).
    directions = [
        (synonym_examples, antonym_examples, antonym_answers, miss_k_synonym_list),
        (antonym_examples, synonym_examples, synonym_answers, miss_k_antonym_list),
    ]
    for examples, opposite_examples, opposite_answers, miss_k_list in directions:
        for example in examples.values():
            top_words = strip_lower(example['top_5_text'])

            # Overlap between the two relations' top-k predictions for the same subject.
            # Not part of the printed row; kept for inspection after the cell runs.
            if example['subj'] in opposite_examples:
                top_words_opposite = strip_lower(opposite_examples[example['subj']]['top_5_text'])
                for k in computed_ks:
                    shared = set(top_words[:k]) & set(top_words_opposite[:k])
                    overlap_k_list[k].append(len(shared) / k)

            # Fraction of the opposite relation's gold objects found in the top-k predictions.
            # NOTE(review): predictions are lower-cased/stripped but gold objects are used
            # as stored -- confirm the answer files are already normalized.
            gold_objects_opposite = set(opposite_answers[example['subj']])
            for k in computed_ks:
                hits = set(top_words[:k]) & gold_objects_opposite
                miss_k_list[k].append(len(hits) / len(gold_objects_opposite))

    # print(f'{model_name} - len_overlap_k: {len(overlap_k_list[1])}, len_miss_k_synonym: {len(miss_k_synonym_list[1])}, len_miss_k_antonym: {len(miss_k_antonym_list[1])}')

    # "mean$\pm$std" per column; a column without samples (always the case for
    # k=10 and k=100 here) is printed as '-' instead of calling np.mean on an
    # empty list, which yields nan plus a RuntimeWarning.
    cells = []
    for miss_k_list in (miss_k_synonym_list, miss_k_antonym_list):
        for k in reported_ks:
            rates = miss_k_list[k]
            if rates:
                cells.append(f'{np.mean(rates):.2f}$\\pm${np.std(rates):.2f}')
            else:
                cells.append('-')

    line = '& ' + ' & '.join([model_name_dict[model_name], *cells]) + '\\\\'

    print(line)