In [1]:
import joblib
import pandas as pd
import numpy as np
import spacy
import os
from src.classes.qadataset import QADataset
from datasets import load_dataset, Dataset
from collections import defaultdict
import random
from tqdm.auto import tqdm
from typing import List, Dict, Tuple, Union
import re
from copy import deepcopy

In [8]:
# Load the "test" split of both QA benchmarks from the Hub, each as a DataFrame.
trivia_dataset, nq_dataset = (
    pd.DataFrame(load_dataset(repo_id, split="test"))
    for repo_id in ("Seongill/trivia", "Seongill/nq")
)

In [9]:
# Publish top-k truncated variants of each benchmark.
# For every k, each row keeps only its first k entries of `ctxs` and gains a
# `has_answer` flag that is True when any kept context is marked `hasanswer`.
# The share of rows with `has_answer` is printed after each upload.
for topk in [3, 5, 10]:
    for name, source_df in zip(["Trivia", "NQ"], [trivia_dataset, nq_dataset]):
        # Slice once and derive the flag from the same truncated lists so the
        # two new columns cannot drift apart.
        top_ctxs = source_df["ctxs"].apply(lambda ctxs: ctxs[:topk])
        has_answer = top_ctxs.apply(lambda ctxs: any(c["hasanswer"] for c in ctxs))
        # drop() returns a new frame, so the source frame is left untouched and
        # no explicit copy is required. Re-adding `ctxs` afterwards places it
        # after the remaining original columns, exactly as before.
        truncated_df = source_df.drop(columns=["ctxs"])
        truncated_df["ctxs"] = top_ctxs
        truncated_df["has_answer"] = has_answer
        Dataset.from_pandas(truncated_df).push_to_hub(f"{name}_missing_{topk}")
        print(f"{name} {topk} {truncated_df.has_answer.mean()}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Trivia 3 0.6438610448156987


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

NQ 3 0.53601108033241


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Trivia 5 0.6964554052859542


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

NQ 5 0.6213296398891966


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Trivia 10 0.7513480067179351


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

NQ 10 0.7096952908587257


In [32]:
# Fetch the "test" split of both benchmarks again and rebind the two frames.
trivia_dataset, nq_dataset = map(
    lambda repo_id: pd.DataFrame(load_dataset(repo_id, split="test")),
    ["Seongill/trivia", "Seongill/nq"],
)

In [34]:
# Add a `has_answer` column to both frames: True when the first-listed context
# of the row is marked `hasanswer` (an empty context list yields False).
for qa_frame in (nq_dataset, trivia_dataset):
    qa_frame["has_answer"] = qa_frame["ctxs"].apply(
        lambda ctxs: any([ctx["hasanswer"] for ctx in ctxs[:1]])
    )

In [36]:
# Fraction of rows flagged `has_answer`, as an (NQ, Trivia) pair.
(nq_dataset["has_answer"].mean(), trivia_dataset["has_answer"].mean())

(0.35512465373961216, 0.4724653054008663)

In [35]:
# Print, for NQ and then Trivia, the number of rows without and with
# `has_answer`, one value per line with a blank line between the two datasets.
nq_hits = nq_dataset.has_answer.sum()
trivia_hits = trivia_dataset.has_answer.sum()
print(
    len(nq_dataset) - nq_hits,
    nq_hits,
    "",
    len(trivia_dataset) - trivia_hits,
    trivia_hits,
    sep="\n",
)

2328
1282

5968
5345


In [46]:
# Build, for each benchmark, a 5-context variant in which extra rows are
# deliberately stripped of their answer-bearing contexts.
#
# Per row, in frame order:
#   * no `hasanswer` context among the first 5 -> keep the first 5, label False,
#     and consume one unit of the `cutoff` budget;
#   * otherwise, if the row has at most 5 `hasanswer` contexts overall and the
#     budget is still positive -> keep the first 5 contexts that are NOT marked
#     `hasanswer`, label False, and consume one unit of budget;
#   * otherwise -> keep the first 5 unchanged, label True.
#
# NOTE(review): `cutoff` is seeded from the `has_answer` column already present
# on the frame. In this notebook that column is computed from the top-1 context
# only, while this loop works on the top 5, and rows that naturally miss the
# answer also consume budget. The resulting number of False rows is therefore
# not simply twice the natural top-5 misses -- confirm this is intended before
# publishing under the "_double" name.
# NOTE(review): a stripped row can end up with fewer than 5 contexts when it has
# fewer than 5 contexts not marked `hasanswer` -- confirm downstream code copes.
n_ctxs = 5           # number of contexts kept per question
max_answer_ctxs = 5  # rows with more answer-bearing contexts are never stripped
for name, dataset in zip(["Trivia", "NQ"], [trivia_dataset, nq_dataset]):
    cutoff = len(dataset) - dataset.has_answer.sum()
    print(f"CUTOFF {cutoff}")
    new_ctxs = []
    has_ans = []
    # Only the `ctxs` column is needed, so iterate it directly instead of
    # materializing a Series per row with iterrows().
    for ctxs in dataset["ctxs"]:
        top_ctxs = ctxs[:n_ctxs]
        if not any(c["hasanswer"] for c in top_ctxs):
            new_ctxs.append(top_ctxs)
            has_ans.append(False)
            cutoff -= 1
        elif (sum(c["hasanswer"] for c in ctxs) <= max_answer_ctxs) and (cutoff > 0):
            new_ctxs.append([c for c in ctxs if not c["hasanswer"]][:n_ctxs])
            has_ans.append(False)
            cutoff -= 1
        else:
            new_ctxs.append(top_ctxs)
            has_ans.append(True)
    # Work on a copy so the source frame keeps its original columns.
    new_dataset = dataset.copy()
    new_dataset["ctxs"] = new_ctxs
    new_dataset["has_answer"] = has_ans
    print(f"{name} {len(new_dataset) - new_dataset.has_answer.sum()}")
    print(f"{name} {new_dataset.has_answer.mean()}")
    # Upload intentionally left disabled:
    #Dataset.from_pandas(new_dataset).push_to_hub(f"{name}_missing_5_double")


CUTOFF 5968
Trivia 7080
Trivia 0.37417130734553167
CUTOFF 2328
NQ 2697
NQ 0.2529085872576177


In [24]:
# Number of rows in `new_dataset` that are not flagged `has_answer`.
new_dataset.shape[0] - new_dataset["has_answer"].sum()

6868

In [1]:
from datasets import load_dataset

# Load every split of the published repo, then keep its "train" split.
trivia_missing_5_splits = load_dataset("Seongill/Trivia_missing_5")
dataset = trivia_missing_5_splits["train"]

In [2]:
import pandas as pd
# Materialize the loaded `dataset` as a pandas DataFrame for inspection.
df = pd.DataFrame(dataset)

In [5]:
# Show the `question` text of the first two rows as a plain list.
df["question"].iloc[[0, 1]].tolist()

['Who was the man behind The Chipmunks?',
 'What star sign is Jamie Lee Curtis?']

In [6]:
# Shuffle with a fixed seed and keep the first third of the rows (floor
# division). `dataset` is rebound to the smaller sample, so re-running this
# cell shrinks it again.
subset_size = len(dataset) // 3
dataset = dataset.shuffle(seed=42).select(range(subset_size))
dataset

Dataset({
    features: ['question', 'answers', 'ctxs', 'has_answer'],
    num_rows: 3771
})

In [10]:
# Share of rows in the current `dataset` whose `has_answer` flag is set.
sum(dataset["has_answer"]) / dataset.num_rows

0.699814372845399

In [11]:
# Upload the current `dataset` to the Hub under the repo name "Trivia_missing_5_small".
dataset.push_to_hub("Trivia_missing_5_small")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]