In [5]:
import pandas as pd
import os
import json

data_dir = '../data/google-natural-questions/'
file_names = ['simplified-nq-dev.jsonl', 'simplified-nq-train.jsonl']
data_list = []

# Function to count words in a list of sentences
def count_words(sentences):
    return sum(len(sentence.split()) for sentence in sentences)

def read_jsonl_file(file_path):
    with open(file_path, 'r') as f:
        for line in f:
            yield json.loads(line)

for file_name in file_names:
    file_path = os.path.join(data_dir, file_name)
    for item in read_jsonl_file(file_path):
        annotations = item.get("annotations", [])
        answer = ""
        if annotations:
            short_answers = annotations[0].get("short_answers", [])
            if short_answers:
                answer = short_answers[0].get("text", "")
        question = item["question_text"]
        question_id = item["example_id"]
        
        # Extracting document text (context)
        document_text = item["document_text"]
        
        data_list.append({
            "Answer": answer,
            "Question": question,
            "QuestionId": question_id,
            "Context": document_text
        })

df = pd.DataFrame(data_list)
df

Unnamed: 0,Answer,Question,QuestionId,Context
0,,what are the main crops grown in the united st...,-6570496346595660652,Agriculture in the United States - wikipedia <...
1,,who is the owner of the mandalay bay in vegas,7811140318762480311,Mandalay Bay - wikipedia <H1> Mandalay Bay </H...
2,,which of the following was not one of the func...,-472523896331012000,Freedmen 's Bureau - wikipedia <H1> Freedmen '...
3,,get back get back you don't know me like that ...,5159516459824335717,Get Back ( Ludacris song ) - Wikipedia <H1> Ge...
4,,who was the first nominated lady for rajya sabha,4111902318448915849,List of nominated members of Rajya Sabha - wik...
...,...,...,...,...
315198,,who have been the hosts of the price is right,-1413521544318030897,The Price Is Right - wikipedia <H1> The Price ...
315199,,who sang the song mama told me not to come,3779316254369130993,Mama Told Me Not to Come - wikipedia <H1> Mama...
315200,,who plays grey worm on game of thrones,6455931563852330492,Jacob Anderson - wikipedia <H1> Jacob Anderson...
315201,,working principle of high pressure sodium vapo...,-7982911662792302578,Sodium - vapor lamp - wikipedia <H1> Sodium - ...


In [7]:
# 1. Number of unique questions
num_questions = df['QuestionId'].nunique()

# 2. Length of the questions
df['QuestionLength'] = df['Question'].apply(lambda x: len(x.split()))
avg_question_length = df['QuestionLength'].mean()
std_question_length = df['QuestionLength'].std()

# 3. Number of contexts (facts)
df['ContextList'] = df['Context'].apply(lambda x: x.split('. ')) # Split document text by sentences
df['NumContexts'] = df['ContextList'].apply(lambda x: len(x))
num_contexts = df['NumContexts'].sum()

# 4. Length of the facts with regards to each question
df['ContextLength'] = df['ContextList'].apply(lambda x: count_words(x))
avg_context_length = df['ContextLength'].mean()
std_context_length = df['ContextLength'].std()

# 5. Length of golden answers
df['AnswerLength'] = df['Answer'].apply(lambda x: len(x.split()))
avg_answer_length = df['AnswerLength'].mean()
std_answer_length = df['AnswerLength'].std()

# Print the results
print(f"Number of questions: {num_questions}")
print(f"Average question length: {avg_question_length:.2f} words")
print(f"Standard deviation of question length: {std_question_length:.2f} words")
print(f"Number of contexts (facts): {num_contexts}")
print(f"Average context length: {avg_context_length:.2f} words")
print(f"Standard deviation of context length: {std_context_length:.2f} words")
print(f"Average golden answer length: {avg_answer_length:.2f} words")
print(f"Standard deviation of golden answer length: {std_answer_length:.2f} words")

Number of questions: 315203
Average question length: 9.24 words
Standard deviation of question length: 1.83 words
Number of contexts (facts): 120661716
Average context length: 8665.76 words
Standard deviation of context length: 7902.73 words
Average golden answer length: 0.00 words
Standard deviation of golden answer length: 0.00 words
