In [2]:
import pandas as pd
import os
import json

# Directory containing the HotpotQA full-wiki JSON dumps (relative to the notebook).
DATA_DIR = "../data/hotpotqa-fullwiki/"

# Files to load, in order: the dev split first, then the train split.
file_names = [
    "hotpot_dev_fullwiki_v1.json",
    "hotpot_train_v1.1.json",
]

# Accumulates one flat record (dict) per question across all files.
data_list = list()

def count_words(sentences):
    """Return the total number of whitespace-separated words across `sentences`.

    Args:
        sentences: iterable of strings.

    Returns:
        int: sum of `len(s.split())` over every string; 0 for an empty iterable.
    """
    word_total = 0
    for text in sentences:
        word_total += len(text.split())
    return word_total

# Flatten every question of every listed file into one record per question.
for file_name in file_names:
    file_path = os.path.join(DATA_DIR, file_name)
    # JSON text is UTF-8 by specification (RFC 8259); pin the encoding so
    # decoding does not depend on the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # The file is fully parsed at this point, so the handle is already closed
    # while the records are being built.
    for item in json_data:
        answer = item["answer"]
        question = item["question"]
        question_id = item["_id"]

        supporting_facts = item["supporting_facts"]
        # Each context entry is indexed as entry[1] to obtain its sentences;
        # concatenate the sentences of all entries into a single flat list.
        context_list = []
        for context in item["context"]:
            context_list.extend(context[1])

        data_list.append({
            "Answer": answer,
            "Question": question,
            "QuestionId": question_id,
            "SupportingFacts": supporting_facts,
            "Context": context_list
        })

df = pd.DataFrame(data_list)
df

Unnamed: 0,Answer,Question,QuestionId,SupportingFacts,Context
0,yes,Were Scott Derrickson and Ed Wood of the same ...,5a8b57f25542995d1e6f1371,"[[Scott Derrickson, 0], [Ed Wood, 0]]",[Adam Collis is an American filmmaker and acto...
1,Chief of Protocol,What government position was held by the woman...,5a8c7595554299585d9e36b6,"[[Kiss and Tell (1945 film), 0], [Shirley Temp...",[A Kiss for Corliss is a 1949 American comedy ...
2,Animorphs,"What science fantasy young adult series, told ...",5a85ea095542994775f606a8,"[[The Hork-Bajir Chronicles, 0], [The Hork-Baj...",[Animorphs is a science fantasy series of youn...
3,no,Are the Laleli Mosque and Esma Sultan Mansion ...,5adbf0a255429947ff17385a,"[[Laleli Mosque, 0], [Esma Sultan Mansion, 0]]",[Esma Sultan is the name of three daughters of...
4,"Greenwich Village, New York City","The director of the romantic comedy ""Big Stone...",5a8e3ea95542995a26add48d,"[[Big Stone Gap (film), 0], [Adriana Trigiani,...","[Great Eastern Conventions, Inc. was an entert..."
...,...,...,...,...,...
97847,American,Kerry Remsen is the daughter of an actor with ...,5a8f8db25542997ba9cb32b9,"[[Kerry Remsen, 1], [Bert Remsen, 0]]","[Kerry Remsen is an American actress., She is..."
97848,Simon Property Group,"Who manages both Northshore Mall in Peabody, M...",5ae4f3615542993aec5ec0fd,"[[Northshore Mall, 0], [Northshore Mall, 4], [...",[Green Tree Mall is a shopping mall located in...
97849,Amblin Partners,Charlee Johnson was part of a band that signed...,5a903fc95542990a984935bd,"[[Charlee Johnson, 4], [DreamWorks, 0]]",[Simon M. Woods is a British entrepreneur and ...
97850,"MV ""Wilhelm Gustloff",What is the ship that sank in the Baltic sea a...,5ab56e71554299494045efc8,"[[Salt to the Sea, 1], [MV Wilhelm Gustloff, 0]]",[The I.V. Stalin White Sea – Baltic Sea Canal ...


In [3]:
# Corpus-level summary statistics: word counts are whitespace-token counts.
num_questions = len(df)

# Words per question.
df['QuestionLength'] = df['Question'].map(lambda text: len(text.split()))
question_lengths = df['QuestionLength']
avg_question_length = question_lengths.mean()
std_question_length = question_lengths.std()

# Number of context sentences attached to each question, and their grand total.
df['NumContexts'] = df['Context'].map(len)
num_contexts = df['NumContexts'].sum()

# Words across all context sentences of a question.
df['ContextLength'] = df['Context'].map(count_words)
context_lengths = df['ContextLength']
avg_context_length = context_lengths.mean()
std_context_length = context_lengths.std()

# Words per gold answer.
df['AnswerLength'] = df['Answer'].map(lambda text: len(text.split()))
answer_lengths = df['AnswerLength']
avg_answer_length = answer_lengths.mean()
std_answer_length = answer_lengths.std()

summary_lines = [
    f"Number of questions: {num_questions}",
    f"Average question length: {avg_question_length:.2f} words",
    f"Standard deviation of question length: {std_question_length:.2f} words",
    f"Number of contexts (facts): {num_contexts}",
    f"Average context length: {avg_context_length:.2f} words",
    f"Standard deviation of context length: {std_context_length:.2f} words",
    f"Average golden answer length: {avg_answer_length:.2f} words",
    f"Standard deviation of golden answer length: {std_answer_length:.2f} words",
]
print("\n".join(summary_lines))

Number of questions: 97852
Average question length: 17.66 words
Standard deviation of question length: 9.29 words
Number of contexts (facts): 4017967
Average context length: 888.86 words
Standard deviation of context length: 252.84 words
Average golden answer length: 2.24 words
Standard deviation of golden answer length: 1.81 words


In [4]:
import nltk
from collections import Counter
from nltk.util import ngrams
from tqdm.notebook import tqdm
import pandas as pd

# Fetch the Punkt tokenizer data needed by nltk.word_tokenize, which
# get_top_ngrams calls on every question.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

def get_top_ngrams(questions, n, top_k=None):
    """Count word-level n-grams over `questions` and return the most frequent.

    Each question is lower-cased and tokenized with `nltk.word_tokenize`;
    the n-grams produced by `nltk.util.ngrams` are tallied across all
    questions.

    Args:
        questions: iterable of question strings.
        n: order of the n-grams to count.
        top_k: number of most frequent n-grams to return. `None` (default)
            returns every distinct n-gram; `0` returns an empty list.

    Returns:
        tuple: `(top, total)` where `top` is a list of `(ngram, count)` pairs
        in descending count order and `total` is the number of n-gram
        occurrences counted over ALL n-grams (not only the returned ones).
    """
    ngram_counter = Counter()
    for question in questions:
        tokens = nltk.word_tokenize(question.lower())
        ngram_counter.update(ngrams(tokens, n))
    total = sum(ngram_counter.values())
    # Counter.most_common(None) already returns every entry, so top_k is
    # passed through unchanged. Only None means "no limit": an explicit 0 is
    # no longer silently promoted to "everything".
    return ngram_counter.most_common(top_k), total

def process_ngrams(questions, n_values, top_k=None):
    """Build one long table of ranked n-gram frequencies for several orders.

    Args:
        questions: iterable of question strings, forwarded to get_top_ngrams.
        n_values: iterable of n-gram orders to process.
        top_k: forwarded to get_top_ngrams to limit the n-grams kept per order.

    Returns:
        pandas.DataFrame with columns `n`, `rank` (1-based, most frequent
        first), `ngram` (tokens joined by a single space), `frequency` and
        `relative_frequency` (frequency divided by the total number of
        n-gram occurrences for that order). The frame has these columns even
        when no rows were produced.
    """
    columns = ['n', 'rank', 'ngram', 'frequency', 'relative_frequency']
    frames = []
    for n in tqdm(n_values, desc="Processing n-grams"):
        top_ngrams, total = get_top_ngrams(questions, n, top_k)
        ngram_data = [
            (n, rank, ' '.join(ngram), freq, freq / total)
            for rank, (ngram, freq) in enumerate(top_ngrams, start=1)
        ]
        # Skip orders that produced nothing: empty frames add no rows and
        # only confuse dtype resolution when concatenated.
        if ngram_data:
            frames.append(pd.DataFrame(ngram_data, columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once at the end instead of re-copying the accumulated
    # frame on every iteration.
    return pd.concat(frames, ignore_index=True)

questions = df['Question'].tolist()

# The n-gram table is expensive to compute, so it is cached as a CSV next to
# the data and reused when the file already exists.
ngram_cache_path = os.path.join(DATA_DIR, 'ngrams-sorted.csv')

if os.path.isfile(ngram_cache_path):
    # keep_default_na=False: without it, n-grams whose text is e.g. "nan",
    # "null" or "n/a" would be read back as missing values, so the cached
    # table would differ from a freshly computed one.
    df_ngram = pd.read_csv(ngram_cache_path, keep_default_na=False)
else:
    df_ngram = process_ngrams(questions, range(1, 9))
    df_ngram.to_csv(ngram_cache_path, index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/yuehengzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing n-grams:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
import pandas as pd

# Raise pandas' display limits so the ratio table printed in this cell is not
# truncated or wrapped.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

def calculate_ratios(df_ngram, n_values, k_values):
    """Summarize how much n-gram mass the most frequent n-grams account for.

    Args:
        df_ngram: DataFrame with at least the columns `n` and
            `relative_frequency`.
        n_values: n-gram orders to summarize, one result entry per order.
        k_values: cut-offs; for each `k` the relative frequencies of the `k`
            most frequent n-grams are summed.

    Returns:
        tuple of three items, each aligned with `n_values`:
        - dict mapping every `k` to a list of top-k relative-frequency sums,
        - list of sums over the top 1% of rows (at least one row),
        - list with the number of rows found for each order.
    """
    coverage_by_k = {k: [] for k in k_values}
    top_percent_coverage = []
    row_counts = []

    for n in n_values:
        is_order_n = df_ngram['n'] == n
        # Largest relative frequencies first, as plain Python floats.
        descending = sorted(
            df_ngram.loc[is_order_n, 'relative_frequency'].tolist(),
            reverse=True,
        )

        n_rows = len(descending)
        row_counts.append(n_rows)

        # One percent of the rows, but never fewer than a single row.
        head_size = max(1, n_rows // 100)
        top_percent_coverage.append(sum(descending[:head_size]))

        for k in k_values:
            coverage_by_k[k].append(sum(descending[:k]))

    return coverage_by_k, top_percent_coverage, row_counts

# n-gram orders 1..8 and power-of-two cut-offs 1..256.
n_values = [*range(1, 9)]
k_values = [2 ** exponent for exponent in range(9)]

ratios, top_1_percent_ratios, total_counts = calculate_ratios(df_ngram, n_values, k_values)

# Append the two summary columns after the per-k columns.
ratios.update({
    'Top 1%': top_1_percent_ratios,
    'Total Count': total_counts,
})

row_labels = [f'{n}-gram' for n in n_values]
ratio_table = pd.DataFrame(ratios, index=row_labels)

print("HotpotQA n-gram Ratios:")
print(ratio_table)

HotpotQA n-gram Ratios:
               1         2         4         8        16        32        64       128       256    Top 1%  Total Count
1-gram  0.061074  0.110454  0.175492  0.267616  0.368172  0.446158  0.508694  0.571567  0.634060  0.742319        85455
2-gram  0.011113  0.017692  0.028908  0.043062  0.059473  0.082975  0.116189  0.155650  0.199684  0.445840       523428
3-gram  0.002774  0.004643  0.007121  0.011354  0.017336  0.025421  0.035185  0.048569  0.065058  0.237728      1018623
4-gram  0.001226  0.002201  0.004111  0.006555  0.008834  0.012189  0.016696  0.022321  0.029666  0.127703      1294261
5-gram  0.001020  0.001784  0.002578  0.003459  0.004729  0.006417  0.008575  0.011326  0.015188  0.072521      1380837
6-gram  0.000808  0.001016  0.001390  0.001904  0.002495  0.003254  0.004389  0.005946  0.008028  0.044717      1368190
7-gram  0.000077  0.000128  0.000226  0.000411  0.000715  0.001179  0.001784  0.002630  0.003848  0.030164      1309324
8-gram  0.000042