# Group Synthetic Files

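Merges the individual JSON files found in the synthetic artifacts directory into a single dataset, then splits that dataset into a train file and a test file.

Output format (shared by both output files):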

```
{
    "version": "mkqa_par",
    "data": [
        {
            "title": <string>,
            "paragraphs": [
                {
                    "context": <string>,
                    "qas": [
                        {
                            "id": <string>,
                            "question": <string>,
                            "answers": [
                                {
                                    "answer_start": <int>,
                                    "text": <string>
                                }
                            ]
                        }
                    ]
                }
            ]
        },
        ...
    ]
}
```

In [19]:
import json
import os

In [20]:
# Directory that is scanned for the input JSON files (a directory path,
# despite the plural name).
INPUT_FILES = '../artifacts/synthetic/'
# Destination paths for the train and test portions of the grouped dataset.
OUTPUT_TRAIN_FILE = '../data/synthetic/train-synthetic-v2.json'
OUTPUT_TEST_FILE = '../data/synthetic/test-synthetic-v2.json'
# Fraction of the grouped items that goes to the train file; the remainder
# goes to the test file.
SPLIT_RATIO = 0.8

In [21]:
# Collect every regular file in INPUT_FILES except config.json.
# os.listdir returns names in arbitrary order and the train/test split built
# from this list is purely positional, so sort the paths to make the split
# reproducible from one run (and one machine) to the next.
filelist = sorted(
    os.path.join(INPUT_FILES, name)
    for name in os.listdir(INPUT_FILES)
    if name != 'config.json' and os.path.isfile(os.path.join(INPUT_FILES, name))
)

In [22]:
# Sanity check: display the first five collected input paths.
filelist[:5]

['../artifacts/synthetic/mkqa_-1002689427646740593.json',
 '../artifacts/synthetic/mkqa_-1005672146492412109.json',
 '../artifacts/synthetic/mkqa_-1021449407380886303.json',
 '../artifacts/synthetic/mkqa_-1033978770973439629.json',
 '../artifacts/synthetic/mkqa_-1040708440497808999.json']

In [23]:
# Accumulator for the merged dataset; every item of every input file is
# appended to its 'data' list in file order.
output_dataset = {'version': 'mkqa_par', 'data': []}
n_items = 0
total_files = len(filelist)

print('Grouping...')
for file_no, filepath in enumerate(filelist, start=1):
    # '\r' keeps the progress counter on a single, continuously rewritten line.
    print('- Item %d / %d...' % (file_no, total_files), end='\r')
    with open(filepath, 'r', encoding='utf-8') as fp:
        entries = json.load(fp)['data']
    for k, item in enumerate(entries):
        # Append the item's position within its file to the id of the first
        # question of the first paragraph (presumably to keep ids unique when
        # one file holds several items -- verify against the generator).
        first_qa = item['paragraphs'][0]['qas'][0]
        first_qa['id'] += '_%d' % k
        output_dataset['data'].append(item)
    n_items += len(entries)

# Terminate the '\r' progress line before printing the summary.
print()
print('Done! Found %d items.' % n_items)

Grouping...
- Item 1273 / 1273...
Done! Found 2774 items.


In [24]:
# Index of the first test item: the leading SPLIT_RATIO share of the items
# (fraction truncated by int()) forms the train portion.
split_idx = int(SPLIT_RATIO * n_items)

In [25]:
# Report how many items land on each side of the split.
n_train = split_idx
n_test = n_items - split_idx
print('Train dataset: %d' % n_train)
print('Test dataset: %d' % n_test)

Train dataset: 2219
Test dataset: 555


In [26]:
# Cut the grouped items at split_idx: everything before it is train data,
# everything from it onwards is test data.
all_items = output_dataset['data']
train_items = all_items[:split_idx]
test_items = all_items[split_idx:]

# Both outputs use the same top-level layout as the grouped dataset.
train_dataset = {'version': 'mkqa_par', 'data': train_items}
test_dataset = {'version': 'mkqa_par', 'data': test_items}

In [27]:
# Serialize each split to its destination file as JSON.
# The destination directory is created first so that the cell does not fail
# with FileNotFoundError when the output folder does not exist yet.
for out_path, dataset in ((OUTPUT_TRAIN_FILE, train_dataset),
                          (OUTPUT_TEST_FILE, test_dataset)):
    out_dir = os.path.dirname(out_path)
    if out_dir:  # a bare filename has no directory component to create
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as fp:
        json.dump(dataset, fp)