# Group Synthetic Files

Output format:

```
{
    "version": "mkqa_par",
    "data": [
        {
            "title": <string>,
            "paragraphs": [
                {
                    "context": <string>,
                    "qas": [
                        {
                            "id": <string>,
                            "question": <string>,
                            "answers": [
                                {
                                    "answer_start": <int>,
                                    "text": <string>
                                }
                            ]
                        }
                    ]
                }
            ]
        },
        ...
    ]
}
```

In [1]:
import json
import os

In [4]:
def group_synthetic_files(input_path, output_path, output_file):
    """Merge the per-file synthetic datasets of a folder into one JSON file.

    Every regular file located directly inside ``input_path`` (``config.json``
    excluded) is parsed as JSON, and the entries of its ``'data'`` list are
    appended to a single dataset that is written to
    ``output_path/output_file``.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, so sorting keeps the merged file identical across runs
    and platforms.

    Args:
        input_path: Directory containing the JSON files to merge.
        output_path: Directory for the merged file; created if it is missing.
        output_file: Name of the merged file inside ``output_path``.

    Raises:
        Exception: Any error raised while reading or processing an input file
            is re-raised after the offending path has been printed.
    """
    train_dataset = {
        'version': 'mkqa_par',
        'data': []
    }

    n_items = 0
    # Sorted for a reproducible item order; sub-directories and the
    # 'config.json' file are skipped.
    filelist = [os.path.join(input_path, f) for f in sorted(os.listdir(input_path))
                if os.path.isfile(os.path.join(input_path, f)) and f != 'config.json']

    output_train_file = os.path.join(output_path, output_file)

    print('Grouping into: %s' % output_train_file)
    for i, filepath in enumerate(filelist):
        try:
            # '\r' keeps the progress counter on a single console line.
            print('- Item %d / %d...' % (i + 1, len(filelist)), end='\r')
            with open(filepath, 'r', encoding='utf-8') as fp:
                file_data = json.load(fp)
            for k, item in enumerate(file_data['data']):
                n_items += 1
                # Suffix the id of the item's first question (first paragraph,
                # first QA) with the item's position inside its source file.
                item['paragraphs'][0]['qas'][0]['id'] += '_%d' % k
                train_dataset['data'].append(item)
        except Exception:
            # Report which file broke the run, then propagate the original
            # error with its traceback untouched.
            print('Fail path: %s' % filepath)
            raise

    print()
    print('- Found %d items.' % n_items)

    os.makedirs(output_path, exist_ok=True)
    with open(output_train_file, 'w', encoding='utf-8') as fp:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        json.dump(train_dataset, fp, ensure_ascii=False)

In [11]:
# Run settings: which language / synthetic folder to group, and the name of
# the merged file written for every run.
language = 'ja'
folder_name = 'synthetic_wikigoogle_top_n'
output_file = 'train-synthetic.json'

# One merged file per `top_<k>` sub-folder; inputs are read from ../artifacts
# and the result is written under ../data with the same relative layout.
for k_top in (1, 2, 3, 5):
    top_dir = 'top_%d' % k_top
    input_path = os.path.join('../artifacts', folder_name, language, top_dir)
    output_path = os.path.join('../data', folder_name, language, top_dir)
    group_synthetic_files(input_path, output_path, output_file)

Grouping into: ../data\synthetic_wikigoogle_top_n\ja\top_1\train-synthetic.json
- Item 1714 / 1714...
- Found 1714 items.
Grouping into: ../data\synthetic_wikigoogle_top_n\ja\top_2\train-synthetic.json
- Item 1714 / 1714...
- Found 2839 items.
Grouping into: ../data\synthetic_wikigoogle_top_n\ja\top_3\train-synthetic.json
- Item 1714 / 1714...
- Found 3651 items.
Grouping into: ../data\synthetic_wikigoogle_top_n\ja\top_5\train-synthetic.json
- Item 1714 / 1714...
- Found 4736 items.
