# Evaluation of Synthetic-MKQA dataset

In [37]:
import json
import numpy as np
import random

In [38]:
# Seed applied to both Python's and NumPy's global random generators.
DEFAULT_SEED = 42

# Source dataset to sample from, and destination of the exported sub-dataset.
INPUT_FILE = '../data/synthetic/train-synthetic-v2.json'
OUTPUT_FILE = '../data/synthetic/dataset-qa-evaluation.json'

# Fix the random state so the drawn sample is the same on every run.
random.seed(DEFAULT_SEED)
np.random.seed(DEFAULT_SEED)

In [39]:
# Read the whole input file and parse it as JSON into `dataset`.
with open(INPUT_FILE, mode='r', encoding='utf-8') as source_file:
    dataset = json.loads(source_file.read())

## Pick random sample

Picks random items from the original dataset. Each exported record contains placeholder fields (`valid_computer_answer` and `valid_google_answer`, initialized to 0) that a person has to fill in manually.

In [40]:
# Number of top-level dataset entries to draw at random for manual review.
N_SAMPLES = 100

In [41]:
# Number of top-level entries stored under the dataset's 'data' key.
top_level_entries = dataset['data']
n_items = len(top_level_entries)
print('Dataset size: %d' % n_items)

Dataset size: 2219


In [42]:
# Draw the indices of the dataset entries to review, without replacement.
# The sample size is capped at the dataset size because random.sample raises
# ValueError when asked for more items than the population holds.
random_items = random.sample(range(n_items), min(N_SAMPLES, n_items))

In [43]:
# Build one evaluation record per question of every sampled dataset entry.
# A set makes the per-entry membership test O(1) instead of scanning a list.
selected_indices = set(random_items)

subdataset = []
for index, entry in enumerate(dataset['data']):
    if index not in selected_indices:
        continue
    # Only the first paragraph of each sampled entry is exported.
    paragraph = entry['paragraphs'][0]
    for qa_item in paragraph['qas']:
        # Drop duplicate answer texts while keeping first-seen order.
        # dict.fromkeys is used instead of set() because the iteration order
        # of a set of strings can change between interpreter runs, which made
        # the exported file differ from run to run despite the fixed seeds.
        unique_answers = list(dict.fromkeys(x['text'] for x in qa_item['answers']))
        subdataset.append({
            'context': paragraph['context'],
            'question': qa_item['question'],
            'computer_answers': unique_answers,
            # Review flags, initialized to 0; the validation step further
            # down counts every truthy value as a valid answer.
            'valid_computer_answer': 0,
            'valid_google_answer': 0,
        })

In [44]:
# Export the sub-dataset for manual review.
# Mode 'x' (exclusive creation) refuses to overwrite an existing file, so
# re-running the notebook does not reset flags a reviewer already filled in.
# Delete the file by hand to export a fresh sample.
try:
    with open(OUTPUT_FILE, 'x', encoding='utf-8') as fp:
        json.dump(subdataset, fp, ensure_ascii=False)
except FileExistsError:
    print('%s already exists; keeping it so manual annotations are not overwritten.' % OUTPUT_FILE)

## Validate results

Once a human has filled in the validity flags, reload the sub-dataset and compute the share of valid answers.

In [45]:
# Load the exported sub-dataset back from disk into `qadataset`.
with open(OUTPUT_FILE, encoding='utf-8') as annotated_file:
    qadataset = json.load(annotated_file)

In [46]:
# Total number of reviewed question records.
n_items = len(qadataset)

# Count, per answer source, the records whose validity flag is truthy.
qa_result = {
    'computer_answer': sum(1 for entry in qadataset if entry['valid_computer_answer']),
    'google_answer': sum(1 for entry in qadataset if entry['valid_google_answer']),
}

In [47]:
# Report the sample size, then each source's share of valid answers in percent.
print('QA size: %d' % n_items)
computer_pct = qa_result['computer_answer'] / n_items * 100
print('Valid computed answers: %.2f%%' % computer_pct)
google_pct = qa_result['google_answer'] / n_items * 100
print('Valid Google answers: %.2f%%' % google_pct)

QA size: 31
Valid computed answers: 12.90%
Valid Google answers: 67.74%


Scores:
- 23.91%
- 29.79%