# Tydiqa subset

This notebook extracts the subset of the TyDi QA (GoldP) dataset that belongs to a single language. It relies on the assumption that every question ID in this dataset starts with the name of its language.

In [None]:
import json
import re

In [None]:
# Language to extract; question ids are matched against this value as a prefix.
LANG_NAME = 'russian'

# Full dataset to read.
INPUT_PATH = '../data/tydiqa/tydiqa-goldp-v1.1-dev.json'
# Destination for the subset: same file name with the language appended.
OUTPUT_PATH = '../data/tydiqa/tydiqa-goldp-v1.1-dev-%s.json' % (LANG_NAME,)

In [None]:
# Read the complete dataset file and parse it into `dataset`.
with open(INPUT_PATH, mode='r', encoding='utf8') as fp:
    dataset = json.loads(fp.read())

In [None]:
# Collect every question whose id starts with LANG_NAME. Each match is
# re-wrapped as its own top-level entry (title -> one paragraph -> one QA),
# so a title or context is repeated once per matching question. Only the
# 'id', 'question' and 'answers' fields of a question are carried over.
squad_items = []
for qa_context in dataset['data']:
    for qa_paragraph in qa_context['paragraphs']:
        for qa_item in qa_paragraph['qas']:
            # Literal prefix test. Interpolating LANG_NAME into a regex would
            # treat characters such as '.', '|' or '(' in it as pattern syntax.
            if not qa_item['id'].startswith(LANG_NAME):
                continue
            squad_items.append({
                'title': qa_context['title'],
                'paragraphs': [{
                    'context': qa_paragraph['context'],
                    'qas': [{
                        'id': qa_item['id'],
                        'question': qa_item['question'],
                        'answers': qa_item['answers'],
                    }],
                }],
            })

In [None]:
# Report the size of the extracted subset (one item per kept question).
n_items = len(squad_items)
print('Found %d items' % (n_items,))

In [None]:
# Write the subset using the same top-level layout as the input file.
# A shallow copy with 'data' replaced is serialised instead of overwriting
# dataset['data'] in place: the loaded dataset stays intact, so the filtering
# cell can be re-run (e.g. with another LANG_NAME) without reloading the file.
# Replacing an existing key keeps its position, so the key order of the
# written JSON matches the input.
subset = dict(dataset, data=squad_items)
with open(OUTPUT_PATH, 'w', encoding='utf8') as fp:
    # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
    json.dump(subset, fp, ensure_ascii=False)