In [1]:
import pandas as pd
import os
import json

data_dir = '../data/triviaqa-unfiltered/'
file_names = ['unfiltered-web-dev.json', 'unfiltered-web-train.json']
data_list = []

for file_name in file_names:
    file_path = os.path.join(data_dir, file_name)
    with open(file_path, 'r') as f:
        json_data = json.load(f)
        for item in json_data["Data"]:
            answer = item["Answer"]["Value"]
            question = item["Question"]
            question_id = item["QuestionId"]
            question_source = item["QuestionSource"]
            
            search_results = []
            for result in item["SearchResults"]:
                search_results.append({
                    "Description": result.get("Description", ""),
                    "DisplayUrl": result.get("DisplayUrl", ""),
                    "Rank": result.get("Rank", ""),
                    "Title": result.get("Title", ""),
                    "Url": result.get("Url", "")
                })
            
            data_list.append({
                "Answer": answer,
                "Question": question,
                "QuestionId": question_id,
                "QuestionSource": question_source,
                "SearchResults": search_results
            })

df = pd.DataFrame(data_list)
df

Unnamed: 0,Answer,Question,QuestionId,QuestionSource,SearchResults
0,David Seville,Who was the man behind The Chipmunks?,tc_2,http://www.triviacountry.com/,[{'Description': 'Alice Cooper's The Man Behin...
1,Scorpio,What star sign is Jamie Lee Curtis?,tc_13,http://www.triviacountry.com/,"[{'Description': 'Jamie Lee Curtis, Actress: T..."
2,Sunset Boulevard,Which Lloyd Webber musical premiered in the US...,tc_33,http://www.triviacountry.com/,[{'Description': 'The official website for And...
3,Campbell-Bannerman,Who was the next British Prime Minister after ...,tc_40,http://www.triviacountry.com/,[{'Description': 'The history and complete tex...
4,Exile,Who had a 70s No 1 hit with Kiss You All Over?,tc_49,http://www.triviacountry.com/,[{'Description': '... credits and award inform...
...,...,...,...,...,...
98930,Rock Lobster by the B-52s,Name the artist and the title of this 1978 cla...,qg_4644,https://quizguy.wordpress.com/,[{'Description': 'Under a Rock. Posted 7.11.11...
98931,Bugs Bunny,"July 27, 1940 saw the introduction of what bel...",qg_4646,https://quizguy.wordpress.com/,"[{'Description': 'Bugs Bunny Is 75, And Now Yo..."
98932,All the kings horses and all the kings men,"According to the nursery rhyme, who couldnt pu...",qg_4647,https://quizguy.wordpress.com/,[{'Description': 'Couldn't put Humpty together...
98933,Harpy,With a name that translates as that which snat...,qg_4649,https://quizguy.wordpress.com/,[{'Description': 'Good Reads Books which I hav...


In [5]:
num_questions = df.shape[0]

df['QuestionLength'] = df['Question'].apply(lambda x: len(x.split()))
avg_question_length = df['QuestionLength'].mean()
std_question_length = df['QuestionLength'].std()

df['NumContexts'] = df['SearchResults'].apply(lambda x: len(x))
num_contexts = df['NumContexts'].sum()

df['ContextLength'] = df['SearchResults'].apply(lambda x: sum(len(result['Description'].split()) for result in x))
avg_context_length = df['ContextLength'].mean()
std_context_length = df['ContextLength'].std()

df['AnswerLength'] = df['Answer'].apply(lambda x: len(x.split()))
avg_answer_length = df['AnswerLength'].mean()
std_answer_length = df['AnswerLength'].std()

print(f"Number of questions: {num_questions}")
print(f"Average question length: {avg_question_length:.2f} words")
print(f"Standard deviation of question length: {std_question_length:.2f} words")
print(f"Number of contexts (facts): {num_contexts}")
print(f"Average context length: {avg_context_length:.2f} words")
print(f"Standard deviation of context length: {std_context_length:.2f} words")
print(f"Average golden answer length: {avg_answer_length:.2f} words")
print(f"Standard deviation of golden answer length: {std_answer_length:.2f} words")

Number of questions: 98935
Average question length: 13.99 words
Standard deviation of question length: 7.02 words
Number of contexts (facts): 4509392
Average context length: 1231.78 words
Standard deviation of context length: 232.92 words
Average golden answer length: 1.96 words
Standard deviation of golden answer length: 1.72 words
