# NQ to SQuAD-like format

The SQuAD format looks like this:

```
{
    "version": "v2.0",
    "data": [
        {
            "title": <string>,
            "paragraphs": [
                {
                    "qas": [
                        {
                            "question": <string>,
                            "id": <string>,
                            "answers": [
                                {
                                    "text": <string>,
                                    "answer_start": <integer>
                                },
                                ...
                            ],
                            "is_impossible": <boolean>
                        },
                        ...
                    ],
                    "context": <string>
                },
                ...
            ]
        },
        ...
    ]
}
```

The parsed version will be slightly different:
- `answer_start` will indicate the token position instead of the character offset, and each answer also carries an `answer_end` token position.
- Question, context and answer texts will already be tokenized.
- There are no impossible questions.

In [1]:
import os
import json

In [2]:
# Folders holding the raw NQ .jsonl files, keyed by dataset split.
input_paths = dict(
    train='../data/nq/train/',
    dev='../data/nq/dev/',
)
# Destination folder of the parsed examples for each split
# (both splits currently point at the same folder).
output_paths = dict(
    train='../data/parsed/nq/',
    dev='../data/parsed/nq/',
)

In [3]:
def get_filelist(path, extension):
    """Return the names of the regular files in `path` ending with `extension`.

    Args:
        path: Directory to scan (sub-directories are not visited).
        extension: Suffix the file names must end with, e.g. ``'jsonl'``.
            An empty string matches every file.

    Returns:
        Sorted list of matching file names (names only, `path` is not
        prepended).
    """
    # os.listdir returns entries in arbitrary order; sorting makes runs
    # reproducible and keeps the progress output in a stable order.
    return sorted(
        name for name in os.listdir(path)
        if name.endswith(extension) and os.path.isfile(os.path.join(path, name))
    )

In [4]:
def get_start_end_text(tokens, start_token, end_token, html_tokens):
    """Translate a span over the full document tokens to the HTML-free tokens.

    Args:
        tokens: Document tokens with the HTML tokens already removed.
        start_token: Span start, as an index into the full document tokens.
        end_token: Span end (exclusive, it is used as a slice bound), as an
            index into the full document tokens.
        html_tokens: One flag per full-document token: 1 if that token is an
            HTML token, 0 otherwise.

    Returns:
        Tuple ``(span_tokens, start, end)`` where ``start``/``end`` are the
        converted indices into `tokens` and ``span_tokens`` is
        ``tokens[start:end]``.
    """
    # A boundary moves left by the number of HTML tokens strictly before it.
    # The token sitting at the boundary itself must not be counted, otherwise
    # a span that starts or ends on an HTML token is shifted one token left.
    start_token -= sum(html_tokens[:start_token])
    end_token -= sum(html_tokens[:end_token])
    return tokens[start_token:end_token], start_token, end_token

In [5]:
def build_answer(text_tokens, start_token, end_token):
    """Pack an answer span into the dictionary stored in the output files.

    Args:
        text_tokens: Tokens of the answer text.
        start_token: Token position where the answer starts.
        end_token: Token position where the answer ends.

    Returns:
        Dict with the keys ``'text'``, ``'answer_start'`` and ``'answer_end'``.
    """
    answer = dict(
        text=text_tokens,
        answer_start=start_token,
        answer_end=end_token,
    )
    return answer

In [6]:
def get_file_length(file_path):
    """Return the number of lines in the UTF-8 text file at `file_path`.

    The file is streamed line by line, so it is never loaded in memory as a
    whole. An empty file yields 0.

    Args:
        file_path: Path of the file to count.

    Returns:
        Number of lines as an int.
    """
    with open(file_path, 'r', encoding='utf-8') as fp:
        # Counting through a generator also covers the empty file, where a
        # loop variable would never get bound.
        return sum(1 for _ in fp)

In [7]:
def parse_jsonl_file(file_path, output_path):
    """Convert every example of an NQ ``.jsonl`` file to a SQuAD-like JSON file.

    Each line of `file_path` holds one JSON example. For every example a file
    named ``<example_id>.json`` is written into `output_path`. Examples whose
    output file already exists are skipped, so an interrupted run can simply
    be started again.

    Args:
        file_path: Path of the ``.jsonl`` file to read.
        output_path: Directory that receives the output files; it must
            already exist.

    Raises:
        KeyError: If an example lacks one of the fields that are read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Use the `file_path` argument, not a variable from the calling scope.
    file_length = get_file_length(file_path)
    with open(file_path, 'r', encoding='utf-8') as fp:
        for i_line, line in enumerate(fp):
            # Tolerate blank lines (e.g. a trailing newline at the end).
            if not line.strip():
                continue

            json_data = json.loads(line)

            example_id = str(json_data['example_id'])
            output_file = os.path.join(output_path, example_id + '.json')

            # Skip if already parsed
            if os.path.exists(output_file):
                continue

            print('- Line %d of %d' % (i_line + 1, file_length), end='\r')

            title = json_data['document_title']
            annotations = json_data['annotations']
            document_tokens = json_data['document_tokens']

            # Per-token HTML flags (to convert indices) and the HTML-free text.
            html_tokens = [1 if t['html_token'] else 0 for t in document_tokens]
            context_tokens = [t['token'] for t in document_tokens if not t['html_token']]

            # Build list of answers: for each annotation the long answer
            # (when present) followed by its short answers.
            qas_answers = []
            for annotation in annotations:
                spans = []
                long_answer = annotation['long_answer']
                # A start token of -1 marks "no long answer".
                if long_answer['start_token'] != -1:
                    spans.append(long_answer)
                spans.extend(annotation['short_answers'])

                for span in spans:
                    text_tokens, start_token, end_token = get_start_end_text(
                        context_tokens, span['start_token'], span['end_token'], html_tokens)
                    qas_answers.append(build_answer(text_tokens, start_token, end_token))

            # Build SQuAD-like dictionary
            # NOTE(review): 'qas' holds the answers directly and the question
            # tokens of the example are not written anywhere; the layout is
            # kept as is because readers of these files may depend on it.
            squad = {
                'version': 'v1.0',
                'source': 'nq',
                'data': [{
                    'title': title,
                    'paragraphs': [{
                        'qas': qas_answers,
                        'context': context_tokens
                    }]
                }]
            }

            # Write to a temporary name and rename afterwards: the skip check
            # above relies on file existence, so a file left half-written by
            # an interrupted run must never appear under its final name.
            tmp_file = output_file + '.tmp'
            with open(tmp_file, 'w', encoding='utf-8') as fp_out:
                json.dump(squad, fp_out)
            os.replace(tmp_file, output_file)
        # Move past the progress line, which is rewritten in place with '\r'.
        print()

In [9]:
# Parse every .jsonl file of every dataset split into per-example JSON files.
for input_name, input_path in input_paths.items():
    print('-' * 20)
    print('Dataset: {}'.format(input_name))
    output_path = output_paths[input_name]
    filelist = get_filelist(input_path, 'jsonl')

    # Create the destination folder the first time it is needed.
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for i, filename in enumerate(filelist):
        print('File {} of {}: {}'.format(i + 1, len(filelist), filename))
        # `input_file` is deliberately kept as a module-level name; code
        # elsewhere in the notebook may read it.
        input_file = input_path + filename
        parse_jsonl_file(input_file, output_path)

--------------------
Dataset: train
File 1 of 49: nq-train-00.jsonl

File 2 of 49: nq-train-01.jsonl

File 3 of 49: nq-train-02.jsonl

File 4 of 49: nq-train-03.jsonl

File 5 of 49: nq-train-04.jsonl

File 6 of 49: nq-train-05.jsonl

File 7 of 49: nq-train-06.jsonl

File 8 of 49: nq-train-07.jsonl

File 9 of 49: nq-train-08.jsonl

File 10 of 49: nq-train-09.jsonl

File 11 of 49: nq-train-10.jsonl

File 12 of 49: nq-train-11.jsonl

File 13 of 49: nq-train-12.jsonl

File 14 of 49: nq-train-13.jsonl

File 15 of 49: nq-train-14.jsonl

File 16 of 49: nq-train-15.jsonl

File 17 of 49: nq-train-16.jsonl

File 18 of 49: nq-train-17.jsonl

File 19 of 49: nq-train-18.jsonl

File 20 of 49: nq-train-19.jsonl

File 21 of 49: nq-train-20.jsonl

File 22 of 49: nq-train-21.jsonl

File 23 of 49: nq-train-22.jsonl

File 24 of 49: nq-train-23.jsonl

File 25 of 49: nq-train-24.jsonl

File 26 of 49: nq-train-25.jsonl

File 27 of 49: nq-train-26.jsonl

File 28 of 49: nq-train-27.jsonl
- Line 5913 of 5914
Fi