# Combine SQuAD datasets

Notebook to combine two or more SQuAD-like datasets into a single file.

In [None]:
import json
import os

In [None]:
def count_qas(dataset):
    """Return the number of question-answer items in a SQuAD-like dataset.

    The dataset is walked as ``dataset['data']`` -> ``['paragraphs']`` ->
    ``['qas']`` and the lengths of all ``qas`` lists are summed. Items are
    counted directly instead of being copied into a temporary list first.

    Args:
        dataset: Mapping with a ``'data'`` list. Every entry of that list
            must have a ``'paragraphs'`` list, and every paragraph must have
            a ``'qas'`` list.

    Returns:
        The total number of entries across all ``qas`` lists (``0`` when
        ``dataset['data']`` is empty).

    Raises:
        KeyError: If ``'data'``, ``'paragraphs'`` or ``'qas'`` is missing.
    """
    return sum(
        len(paragraph['qas'])
        for article in dataset['data']
        for paragraph in article['paragraphs']
    )

In [None]:
def combine_datasets(filepaths, output_filepath, version='1.0'):
    """Merge several SQuAD-like JSON files into a single JSON file.

    The ``'data'`` lists of the input files are concatenated in the order
    the files are given and written out under a new top-level ``'version'``.

    Args:
        filepaths: Iterable of paths to UTF-8 encoded JSON files, each
            holding an object with a ``'data'`` list.
        output_filepath: Path of the file to write. Missing parent
            directories are created; a bare filename is written to the
            current working directory.
        version: Value stored under ``'version'`` in the output file.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
        json.JSONDecodeError: If an input file is not valid JSON.
        KeyError: If an input file has no ``'data'`` key.
    """
    new_dataset = {
        'version': version,
        'data': [],
    }

    for filepath in filepaths:
        with open(filepath, 'r', encoding='utf8') as fp:
            sub_dataset = json.load(fp)
        # Merge after the file is closed; only the parsed object is needed.
        new_dataset['data'] += sub_dataset['data']

    # dirname() is '' for a bare filename and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    output_path = os.path.dirname(output_filepath)
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    with open(output_filepath, 'w', encoding='utf8') as fp:
        json.dump(new_dataset, fp, ensure_ascii=False)

In [None]:
# Destination of the merged dataset
output_filepath = '../data/squad/new-file.json'

# SQuAD-like source files, merged in the order listed
input_filepaths = [
    '../data/squad/file-1.json',
    '../data/squad/file-2.json',
]

combine_datasets(filepaths=input_filepaths, output_filepath=output_filepath)