# Small SQuAD Set

This notebook draws a random subset of the questions in a SQuAD-like dataset and saves it as a new, smaller dataset file. Sampling is done per question, with a fixed random seed.

In [None]:
import json
import random

In [None]:
# Seed the module-level RNG up front so the sampling done later in the
# notebook starts from a fixed state.
DEFAULT_SEED = 42
random.seed(a=DEFAULT_SEED)

# Source dataset to sample from and destination for the reduced dataset.
INPUT_PATH = '../data/squad/dev-v1.1.json'
OUTPUT_PATH = '../data/squad/dev-v1.1-small-5k.json'

# Number of entries kept in the reduced dataset.
N_SAMPLES = 5000

In [None]:
# Read the whole input JSON file into memory.
# The encoding is passed explicitly: without it, open() falls back to the
# platform default (e.g. cp1252 on Windows), which can fail on or garble any
# non-ASCII characters the dataset may contain.
with open(INPUT_PATH, 'r', encoding='utf-8') as fp:
    dataset = json.load(fp)

In [None]:
def _single_question_article(article, paragraph, question):
    """Wrap one question as a stand-alone article-shaped entry.

    The returned dict mirrors the nesting of the input (title -> paragraphs ->
    qas) but holds exactly one paragraph containing exactly one question.
    Only the 'id', 'question' and 'answers' fields of the question are copied.
    """
    return {
        'title': article['title'],
        'paragraphs': [{
            'context': paragraph['context'],
            'qas': [{
                'id': question['id'],
                'question': question['question'],
                'answers': question['answers'],
            }],
        }],
    }


# Flatten article -> paragraph -> question into a list with one entry per
# question, so that sampling can later be done at question granularity.
squad_items = []
for article in dataset['data']:
    for paragraph in article['paragraphs']:
        squad_items.extend(
            _single_question_article(article, paragraph, question)
            for question in paragraph['qas']
        )

In [None]:
# Verify that the flattened list holds at least N_SAMPLES entries before
# sampling from it.
n_items = len(squad_items)

if n_items < N_SAMPLES:
    # Report n_items (the flattened entry count that was actually compared),
    # not the number of top-level articles in the input.
    raise ValueError('Input file is smaller (%d items) than expected output file.' % n_items)

print('Found %d items' % n_items)

In [None]:
# Draw N_SAMPLES entries without replacement using the module-level RNG.
sub_squad_items = random.sample(population=squad_items, k=N_SAMPLES)

In [None]:
# Write the reduced dataset. The loaded top-level object is reused (and
# modified in place) so its other keys are carried over unchanged; only
# 'data' is replaced by the sampled entries.
with open(OUTPUT_PATH, 'w') as out_file:
    dataset.update(data=sub_squad_items)
    json.dump(dataset, out_file)