In [1]:
import os
# Restrict this process to physical GPU 7. This is set before torch is imported
# in the next cell, where torch.cuda.device_count() then reports a single device.
os.environ.update({"CUDA_VISIBLE_DEVICES": "7"})

In [2]:
import torch

# Device string handed to the encoder and to every `encode` call below.
# 'cuda:0' is the first *visible* GPU (visibility is restricted via
# CUDA_VISIBLE_DEVICES in the first cell). Fall back to the CPU when no usable
# GPU is present so the notebook still runs end-to-end, just more slowly.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Cell output: number of GPUs visible to this process.
torch.cuda.device_count()

1

In [3]:
# Load the pretrained sentence encoder that serves as the base similarity model
from sentence_transformers import SentenceTransformer, util

sbert = SentenceTransformer(
    'sentence-transformers/all-mpnet-base-v2',
    device=DEVICE,
    # NOTE(review): machine-specific absolute path for the downloaded weights;
    # needs adjusting when the notebook is run on another host.
    cache_folder="/mount/arbeitsdaten/asr-2/vaethdk/resources/weights",
)
# Explicitly move the model to DEVICE as well (kept from the original cell).
sbert = sbert.to(DEVICE)

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Load the human-authored train/test data and the three generated training variants
from data.dataset import DataAugmentationLevel, ReimburseGraphDataset, NodeType

_TRAIN_FILES = ('en/reimburse/train_graph.json', 'en/reimburse/train_answers.json')
_TEST_FILES = ('en/reimburse/test_graph.json', 'en/reimburse/test_answers.json')


def _load_reimburse(graph_file, answer_file, **options):
    """Construct a ReimburseGraphDataset whose files live under ./resources/.

    `options` is forwarded unchanged (augmentation level and, for generated
    data, the augmentation question file). The third positional argument is
    True for every dataset in this notebook; it is presumably the synonym
    switch, given the "synonyms: True" line in the printed statistics
    (TODO confirm against data/dataset.py).
    """
    return ReimburseGraphDataset(graph_file, answer_file, True, resource_dir="./resources/", **options)


# Human-written data only
human_data_train = _load_reimburse(*_TRAIN_FILES, augmentation=DataAugmentationLevel.NONE)
human_data_test = _load_reimburse(*_TEST_FILES, augmentation=DataAugmentationLevel.NONE)

# Generated data only: same training graph, one dataset per generated question file
generated_data_train_v1 = _load_reimburse(
    *_TRAIN_FILES,
    augmentation=DataAugmentationLevel.ARTIFICIAL_ONLY,
    augmentation_path="en/reimburse/generated/train_questions_v1.json",
)
generated_data_train_v2 = _load_reimburse(
    *_TRAIN_FILES,
    augmentation=DataAugmentationLevel.ARTIFICIAL_ONLY,
    augmentation_path="en/reimburse/generated/train_questions_v2.json",
)
generated_data_train_v3 = _load_reimburse(
    *_TRAIN_FILES,
    augmentation=DataAugmentationLevel.ARTIFICIAL_ONLY,
    augmentation_path="en/reimburse/generated/train_questions_v3.json",
)

===== Dataset Statistics =====
- files:  en/reimburse/train_graph.json en/reimburse/train_answers.json
- synonyms: True
- depth: 20  - degree: 13
- answers: 248
- questions: 279
- loaded original data: True
- loaded generated data: False
===== Dataset Statistics =====
- files:  en/reimburse/test_graph.json en/reimburse/test_answers.json
- synonyms: True
- depth: 20  - degree: 13
- answers: 162
- questions: 173
- loaded original data: True
- loaded generated data: False
- Loading questions from  ./resources/en/reimburse/generated/train_questions_v1.json
- only artificial answers
Loading augmentation answers from ./resources/en/reimburse/generated/train_answers.json
===== Dataset Statistics =====
- files:  en/reimburse/train_graph.json en/reimburse/train_answers.json
- synonyms: True
- depth: 20  - degree: 13
- answers: 803
- questions: 800
- loaded original data: False
- loaded generated data: True
- Loading questions from  ./resources/en/reimburse/generated/train_questions_v2.json
- on

In [44]:
# Display name -> dataset. Insertion order is the order in which the datasets
# are evaluated and reported in the next cell.
datasets = dict([
    ("Human Train", human_data_train),
    ("Human Test", human_data_test),
    ("Generated V1", generated_data_train_v1),
    ("Generated V2", generated_data_train_v2),
    ("Generated V3", generated_data_train_v3),
])

In [45]:
###
### Test per-node similarity top-1 accuracy of base similarity model
###
# For every question node with at least two answer candidates, each synonym of
# each candidate is embedded and compared (cosine similarity) against all
# candidates of that node. A synonym counts as a hit when its most similar
# candidate is the one it belongs to. The per-dataset score is the mean hit rate.
#
# NOTE(review): this metric only reads `node.answers` and `ds.answer_synonyms`,
# never the questions themselves. Datasets that share the same answers/synonyms
# will therefore score identically -- presumably why Generated V1-V3 report the
# same number; confirm that this is intended.
from tqdm.auto import tqdm
from statistics import mean

ds_accuracies = {}  # dataset display name -> top-1 accuracy (NaN if nothing could be scored)
for ds_name, ds in tqdm(datasets.items()):
    hits = []  # one 1.0 / 0.0 entry per evaluated synonym of this dataset
    for node in ds.nodes_by_type[NodeType.QUESTION]:
        # Comparison is only interesting if there is an actual choice to be made (>= 2 answers)
        if len(node.answers) < 2:
            continue

        # Encode all answer candidates of this node once; reused for every synonym below
        answer_candidate_enc = sbert.encode(
            [answer_candidate.text for answer_candidate in node.answers],
            device=DEVICE,
            convert_to_tensor=True,
        )
        for answer_candidate_idx, answer_candidate in enumerate(node.answers):
            synonyms = ds.answer_synonyms[answer_candidate.text.lower()]
            if not synonyms:
                # Nothing to score for this candidate; also avoids encoding an empty batch
                continue
            answer_synonym_enc = sbert.encode(synonyms, device=DEVICE, convert_to_tensor=True)
            # Similarity between synonyms and candidates (synonyms are rows, candidates are columns)
            cosine_scores = util.cos_sim(answer_synonym_enc, answer_candidate_enc)
            # Per synonym: index of the best-matching candidate
            most_similar_candidates = cosine_scores.argmax(dim=-1)
            hits.extend((most_similar_candidates == answer_candidate_idx).float().tolist())

    # statistics.mean raises StatisticsError on empty input, so report NaN when a
    # dataset contains no scorable node instead of aborting the whole evaluation.
    ds_accuracies[ds_name] = mean(hits) if hits else float("nan")
print(ds_accuracies)


100%|██████████| 5/5 [00:05<00:00,  1.17s/it]

{'Human Train': 0.9429657794676806, 'Human Test': 0.9378531073446328, 'Generated V1': 0.9420454545454545, 'Generated V2': 0.9420454545454545, 'Generated V3': 0.9420454545454545}



