In [11]:
import pandas as pd
import os
import json
from bs4 import BeautifulSoup

# Directory containing the input JSONL files; joined with each entry of
# `file_names` when the files are opened further down in this cell.
data_dir = '../data/google-natural-questions/'
# Input files, read in the order listed here.
file_names = ['simplified-nq-dev.jsonl', 'simplified-nq-train.jsonl']
# Accumulates one processed row dict per input line; converted to a DataFrame
# once all files have been read.
data_list = []

def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    word_total = len(tokens)
    return word_total

def html_to_text(html_content):
    """Strip markup from ``html_content`` and return the visible text.

    The markup is parsed with the 'html5lib' parser (chosen for malformed
    html), text nodes are joined with a single space, and leading/trailing
    whitespace is removed from the result.
    """
    parsed = BeautifulSoup(html_content, 'html5lib')
    return parsed.get_text(separator=' ').strip()

def process_entry(entry):
    """Flatten one parsed JSONL record into a row dict.

    Args:
        entry: Dict parsed from one input line. The keys read are
            "question_text", "example_id", "document_text" and
            "annotations"; missing keys fall back to "" (or no annotations).

    Returns:
        Dict with the keys "ExampleId", "QuestionText", "DocumentText",
        "LongAnswers" (non-empty long-answer texts as found),
        "LongAnswersCleaned" (the same texts passed through html_to_text)
        and "ShortAnswers" (unique, non-empty short-answer texts in
        first-seen order).
    """
    question_text = entry.get("question_text", "")
    example_id = entry.get("example_id", "")
    document_text = entry.get("document_text", "")
    # `or []` also covers a key that is present but null in the JSON.
    annotations = entry.get("annotations") or []

    long_answers = []
    long_answers_cleaned = []
    # A dict de-duplicates like a set but keeps insertion order, so the
    # resulting list is identical from run to run (set iteration order for
    # strings varies with hash randomisation).
    short_answers = {}

    for annotation in annotations:
        long_answer = (annotation.get("long_answer") or {}).get("text", "")
        if long_answer:
            long_answers.append(long_answer)
            long_answers_cleaned.append(html_to_text(long_answer))

        for short_answer in annotation.get("short_answers") or []:
            short_text = short_answer.get("text", "")
            # A short answer without text is not an answer; recording ""
            # would inflate any count derived from this list.
            if short_text:
                short_answers[short_text] = None

    return {
        "ExampleId": example_id,
        "QuestionText": question_text,
        "DocumentText": document_text,
        "LongAnswers": long_answers,
        "LongAnswersCleaned": long_answers_cleaned,
        "ShortAnswers": list(short_answers)
    }

# Read each JSONL file line by line and collect one processed row per record.
for file_name in file_names:
    file_path = os.path.join(data_dir, file_name)
    # Explicit UTF-8: without it open() falls back to the platform locale
    # encoding, which can mis-decode or reject non-ASCII text on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file);
            # json.loads('') would raise.
            if not line:
                continue
            entry = json.loads(line)
            data_list.append(process_entry(entry))

# Build the frame once from the accumulated row dicts.
df = pd.DataFrame(data_list)
df

Unnamed: 0,ExampleId,QuestionText,DocumentText,LongAnswers,LongAnswersCleaned,ShortAnswers
0,-6570496346595660652,what are the main crops grown in the united st...,Agriculture in the United States - wikipedia <...,[<Table> <Tr> <Th> Major Crops in the USA </Th...,[Major Crops in the USA 1997 ( in US $ bil...,"[Rice, Cotton, Wheat, Tobacco, Soybeans, Hay, ..."
1,7811140318762480311,who is the owner of the mandalay bay in vegas,Mandalay Bay - wikipedia <H1> Mandalay Bay </H...,[<P> Mandalay Bay is a 43 - story luxury resor...,[Mandalay Bay is a 43 - story luxury resort an...,[MGM Resorts International]
2,-472523896331012000,which of the following was not one of the func...,Freedmen 's Bureau - wikipedia <H1> Freedmen '...,[],[],[]
3,5159516459824335717,get back get back you don't know me like that ...,Get Back ( Ludacris song ) - Wikipedia <H1> Ge...,[],[],[]
4,4111902318448915849,who was the first nominated lady for rajya sabha,List of nominated members of Rajya Sabha - wik...,[<Table> <Tr> <Th> No . </Th> <Th> Name </Th> ...,[No . Name Field Affiliation D...,[]
...,...,...,...,...,...,...
315198,-4860763920664935992,when does sam malone get his bar back,Sam Malone - Wikipedia <H1> Sam Malone </H1> J...,[],[],[]
315199,5733276129126234955,who has been eliminated on big brother 17,Big Brother 19 ( U.S. ) - wikipedia <H1> Big B...,[],[],[]
315200,-2176841346641996646,whoop whoop that's the sound of da police,Sound of da police - wikipedia <H1> Sound of d...,[],[],[]
315201,5821082858248975115,whats the difference between mexican and spani...,Chorizo - wikipedia <H1> Chorizo </H1> Jump to...,[],[],[]


In [13]:
# Corpus-level summary statistics. Every mean/std below is taken over ALL
# rows of `df`, so questions with no long answer contribute a length of 0.
num_questions = len(df.index)

# Question length in whitespace-separated words.
df['QuestionLength'] = df['QuestionText'].map(count_words)
question_lengths = df['QuestionLength']
avg_question_length = question_lengths.mean()
std_question_length = question_lengths.std()

# Contexts are counted as occurrences of the '<P>' marker in each document.
df['NumContexts'] = df['DocumentText'].map(lambda doc: doc.count('<P>'))
num_contexts = df['NumContexts'].sum()

# Document length in words (markup tokens included, as they are in the text).
df['ContextLength'] = df['DocumentText'].map(count_words)
context_lengths = df['ContextLength']
avg_context_length = context_lengths.mean()
std_context_length = context_lengths.std()

# Per-question totals: words summed across all long answers of the question,
# for the raw and the cleaned variants, plus the number of short answers.
df['LongAnswerLength'] = df['LongAnswers'].map(
    lambda answers: sum(map(count_words, answers))
)
df['LongAnswerCleanedLength'] = df['LongAnswersCleaned'].map(
    lambda answers: sum(map(count_words, answers))
)
df['ShortAnswerCount'] = df['ShortAnswers'].map(len)

raw_answer_lengths = df['LongAnswerLength']
avg_answer_length = raw_answer_lengths.mean()
std_answer_length = raw_answer_lengths.std()

cleaned_answer_lengths = df['LongAnswerCleanedLength']
avg_answer_cleaned_length = cleaned_answer_lengths.mean()
std_answer_cleaned_length = cleaned_answer_lengths.std()

short_answer_counts = df['ShortAnswerCount']
avg_short_answer_count = short_answer_counts.mean()
std_short_answer_count = short_answer_counts.std()

# Emit the report in one call; the text is the same as printing line by line.
report_lines = [
    f"Number of questions: {num_questions}",
    f"Average question length: {avg_question_length:.2f} words",
    f"Standard deviation of question length: {std_question_length:.2f} words",
    f"Number of contexts (facts): {num_contexts}",
    f"Average context length: {avg_context_length:.2f} words",
    f"Standard deviation of context length: {std_context_length:.2f} words",
    f"Average long answer length: {avg_answer_length:.2f} words",
    f"Standard deviation of long answer length: {std_answer_length:.2f} words",
    f"Average cleaned long answer length: {avg_answer_cleaned_length:.2f} words",
    f"Standard deviation of cleaned long answer length: {std_answer_cleaned_length:.2f} words",
    f"Average number of short answers: {avg_short_answer_count:.2f}",
    f"Standard deviation of number of short answers: {std_short_answer_count:.2f}",
]
print("\n".join(report_lines))

Number of questions: 315203
Average question length: 9.24 words
Standard deviation of question length: 1.83 words
Number of contexts (facts): 11266272
Average context length: 8992.81 words
Standard deviation of context length: 8245.76 words
Average long answer length: 211.79 words
Standard deviation of long answer length: 1323.04 words
Average cleaned long answer length: 149.02 words
Standard deviation of cleaned long answer length: 886.10 words
Average number of short answers: 0.44
Standard deviation of number of short answers: 0.82


In [14]:
# Write the structured frame next to the input files. The path is derived from
# `data_dir` so the output location follows the configured data directory
# instead of repeating it as a second hard-coded literal.
# NOTE(review): the list-valued columns (LongAnswers, LongAnswersCleaned,
# ShortAnswers) are written to CSV as their string representation; a reader
# would need to parse them back into lists — confirm this is intended.
df.to_csv(os.path.join(data_dir, 'google-natural-questions-structured.csv'), index=False)