In [2]:
import pandas as pd
import os
import json

DATA_DIR = '../data/triviaqa-unfiltered/'
file_names = ['unfiltered-web-dev.json', 'unfiltered-web-train.json']
# Search-result fields copied into every record; a field missing from a result becomes "".
SEARCH_RESULT_FIELDS = ("Description", "DisplayUrl", "Rank", "Title", "Url")
data_list = []

for file_name in file_names:
    file_path = os.path.join(DATA_DIR, file_name)
    # Pass the encoding explicitly: without it open() falls back to the locale's
    # preferred encoding, which can mis-decode (or fail on) non-ASCII JSON text.
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    # The parsed document is fully in memory, so the file is already closed here.
    for item in json_data["Data"]:
        data_list.append({
            "Answer": item["Answer"]["Value"],
            "Question": item["Question"],
            "QuestionId": item["QuestionId"],
            "QuestionSource": item["QuestionSource"],
            # Keep only the selected fields of each search result.
            "SearchResults": [
                {field: result.get(field, "") for field in SEARCH_RESULT_FIELDS}
                for result in item["SearchResults"]
            ],
        })

# One row per question; "SearchResults" holds a list of dicts per row.
df = pd.DataFrame(data_list)
df

Unnamed: 0,Answer,Question,QuestionId,QuestionSource,SearchResults
0,David Seville,Who was the man behind The Chipmunks?,tc_2,http://www.triviacountry.com/,[{'Description': 'Alice Cooper's The Man Behin...
1,Scorpio,What star sign is Jamie Lee Curtis?,tc_13,http://www.triviacountry.com/,"[{'Description': 'Jamie Lee Curtis, Actress: T..."
2,Sunset Boulevard,Which Lloyd Webber musical premiered in the US...,tc_33,http://www.triviacountry.com/,[{'Description': 'The official website for And...
3,Campbell-Bannerman,Who was the next British Prime Minister after ...,tc_40,http://www.triviacountry.com/,[{'Description': 'The history and complete tex...
4,Exile,Who had a 70s No 1 hit with Kiss You All Over?,tc_49,http://www.triviacountry.com/,[{'Description': '... credits and award inform...
...,...,...,...,...,...
98930,Rock Lobster by the B-52s,Name the artist and the title of this 1978 cla...,qg_4644,https://quizguy.wordpress.com/,[{'Description': 'Under a Rock. Posted 7.11.11...
98931,Bugs Bunny,"July 27, 1940 saw the introduction of what bel...",qg_4646,https://quizguy.wordpress.com/,"[{'Description': 'Bugs Bunny Is 75, And Now Yo..."
98932,All the kings horses and all the kings men,"According to the nursery rhyme, who couldnt pu...",qg_4647,https://quizguy.wordpress.com/,[{'Description': 'Couldn't put Humpty together...
98933,Harpy,With a name that translates as that which snat...,qg_4649,https://quizguy.wordpress.com/,[{'Description': 'Good Reads Books which I hav...


In [3]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


num_questions = len(df)

# Words per question.
df['QuestionLength'] = df['Question'].map(_word_count)
avg_question_length, std_question_length = (
    df['QuestionLength'].mean(),
    df['QuestionLength'].std(),
)

# Search results attached to each question, and their grand total.
df['NumContexts'] = df['SearchResults'].map(len)
num_contexts = df['NumContexts'].sum()

# NOTE: this is the *total* number of description words over all search results
# of a question, so the "context length" statistics below are per question,
# not per individual search result.
df['ContextLength'] = df['SearchResults'].map(
    lambda results: sum(_word_count(result['Description']) for result in results)
)
avg_context_length, std_context_length = (
    df['ContextLength'].mean(),
    df['ContextLength'].std(),
)

# Words per answer.
df['AnswerLength'] = df['Answer'].map(_word_count)
avg_answer_length, std_answer_length = (
    df['AnswerLength'].mean(),
    df['AnswerLength'].std(),
)

summary_lines = [
    f"Number of questions: {num_questions}",
    f"Average question length: {avg_question_length:.2f} words",
    f"Standard deviation of question length: {std_question_length:.2f} words",
    f"Number of contexts (facts): {num_contexts}",
    f"Average context length: {avg_context_length:.2f} words",
    f"Standard deviation of context length: {std_context_length:.2f} words",
    f"Average golden answer length: {avg_answer_length:.2f} words",
    f"Standard deviation of golden answer length: {std_answer_length:.2f} words",
]
print("\n".join(summary_lines))

Number of questions: 98935
Average question length: 13.99 words
Standard deviation of question length: 7.02 words
Number of contexts (facts): 4509392
Average context length: 1231.78 words
Standard deviation of context length: 232.92 words
Average golden answer length: 1.96 words
Standard deviation of golden answer length: 1.72 words


In [4]:
import nltk
from collections import Counter
from nltk.util import ngrams
from tqdm.notebook import tqdm
import pandas as pd

# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' package rather than the older 'punkt' one, so request both;
# a package that is already installed is reported as up-to-date and skipped.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

def get_top_ngrams(questions, n, top_k=None):
    """Count the n-grams of the lower-cased, word-tokenized questions.

    Args:
        questions: iterable of question strings.
        n: n-gram order passed to ``nltk.util.ngrams``.
        top_k: how many of the most frequent n-grams to return; any falsy value
            (``None`` or ``0``) returns all of them.

    Returns:
        A ``(ranked, total)`` tuple: ``ranked`` is a list of ``(ngram, count)``
        pairs in descending count order and ``total`` is the number of n-gram
        occurrences counted over all questions (not only the returned ones).
    """
    counts = Counter()
    for text in questions:
        counts.update(ngrams(nltk.word_tokenize(text.lower()), n))

    limit = top_k if top_k else len(counts)
    return counts.most_common(limit), sum(counts.values())

def process_ngrams(questions, n_values, top_k=None):
    """Build one long-format table of ranked n-gram frequencies.

    Args:
        questions: iterable of question strings, forwarded to ``get_top_ngrams``.
        n_values: iterable of n-gram orders to process.
        top_k: forwarded to ``get_top_ngrams``; limits the rows kept per order.

    Returns:
        A DataFrame with columns ``n``, ``rank`` (1-based, by descending
        frequency), ``ngram`` (tokens joined by a single space), ``frequency``
        and ``relative_frequency`` (frequency divided by the total number of
        n-gram occurrences of that order). The columns are present even when
        ``n_values`` is empty.
    """
    columns = ['n', 'rank', 'ngram', 'frequency', 'relative_frequency']
    rows = []
    for n in tqdm(n_values, desc="Processing n-grams"):
        top_ngrams, total = get_top_ngrams(questions, n, top_k)
        # Accumulate plain tuples and build the frame once at the end, rather
        # than re-copying a growing DataFrame with pd.concat on every iteration.
        rows.extend(
            (n, rank, ' '.join(ngram), freq, freq / total)
            for rank, (ngram, freq) in enumerate(top_ngrams, start=1)
        )
    return pd.DataFrame(rows, columns=columns)

questions = df['Question'].tolist()

# The n-gram table is cached as CSV next to the data and reused when the file
# exists. The cache is keyed only by file name: delete the file to force a
# recomputation after changing the data or the n-gram range.
ngram_cache_path = os.path.join(DATA_DIR, 'ngrams-sorted.csv')

if os.path.isfile(ngram_cache_path):
    # keep_default_na=False: otherwise read_csv converts n-grams whose text is a
    # default NA marker (e.g. "null", "nan", "n/a") into NaN, and the cached
    # table would no longer match the freshly computed one.
    df_ngram = pd.read_csv(ngram_cache_path, keep_default_na=False)
else:
    df_ngram = process_ngrams(questions, range(1, 9))
    df_ngram.to_csv(ngram_cache_path, index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/yuehengzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing n-grams:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
import pandas as pd

# Raise pandas' display limits (rows, columns, line width) for everything
# printed after this point.
for option_name, option_value in {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}.items():
    pd.set_option(option_name, option_value)

def calculate_ratios(df_ngram, n_values, k_values):
    """Sum the largest relative frequencies for every n-gram order.

    Args:
        df_ngram: DataFrame with at least the columns ``n`` and
            ``relative_frequency``.
        n_values: iterable of n-gram orders to evaluate.
        k_values: iterable of cut-offs ``k``.

    Returns:
        A tuple ``(ratio_sums, top_1_percent_ratios, total_counts)``:
        ``ratio_sums`` maps each ``k`` to a list (one entry per order in
        ``n_values``) holding the sum of the ``k`` largest relative
        frequencies; ``top_1_percent_ratios`` holds the sum over the largest
        1% of rows (at least one row) per order; ``total_counts`` holds the
        number of rows per order.
    """
    per_k_sums = {k: [] for k in k_values}
    top_percent_sums = []
    row_counts = []

    for order in n_values:
        order_mask = df_ngram['n'] == order
        descending = sorted(
            df_ngram.loc[order_mask, 'relative_frequency'].tolist(),
            reverse=True,
        )

        n_rows = len(descending)
        row_counts.append(n_rows)

        # Integer 1% of the rows, but never fewer than one row.
        head_size = max(1, n_rows // 100)
        top_percent_sums.append(sum(descending[:head_size]))

        for k in k_values:
            per_k_sums[k].append(sum(descending[:k]))

    return per_k_sums, top_percent_sums, row_counts

# N-gram orders to report and the top-k cut-offs (powers of two: 1, 2, 4, ..., 256).
n_values = list(range(1, 9))
k_values = [2 ** exponent for exponent in range(9)]

ratios, top_1_percent_ratios, total_counts = calculate_ratios(df_ngram, n_values, k_values)

# Append the two summary columns after the per-k columns.
ratios.update({'Top 1%': top_1_percent_ratios, 'Total Count': total_counts})

# One row per n-gram order, one column per cut-off.
row_labels = [f'{n}-gram' for n in n_values]
ratio_table = pd.DataFrame(ratios, index=row_labels)

print("TriviaQA n-gram Ratios:")
print(ratio_table)

TriviaQA n-gram Ratios:
               1         2         4         8        16        32        64       128       256    Top 1%  Total Count
1-gram  0.067826  0.125705  0.192147  0.293786  0.395638  0.465680  0.522509  0.578019  0.635140  0.726088        69429
2-gram  0.010103  0.019677  0.034078  0.055118  0.078963  0.106123  0.135691  0.170172  0.206355  0.423180       453053
3-gram  0.005319  0.007813  0.012188  0.019701  0.026681  0.034772  0.044556  0.056785  0.073036  0.227010       859347
4-gram  0.002338  0.004280  0.007343  0.010470  0.013326  0.016722  0.021208  0.027663  0.035978  0.128971      1045561
5-gram  0.001483  0.002695  0.004513  0.005978  0.007440  0.009387  0.012007  0.015187  0.019435  0.076533      1085149
6-gram  0.001292  0.002152  0.002706  0.003278  0.004269  0.005433  0.006853  0.008618  0.010955  0.048734      1054463
7-gram  0.000177  0.000332  0.000579  0.000852  0.001206  0.001719  0.002404  0.003379  0.004780  0.031927       990705
8-gram  0.000038