In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- configuration -------------------------------------------------------
# Hub identifier of the source dataset; later cells branch on this value.
ds_name = "AkariAsai/PopQA"
# When True, the final cell uploads the processed dataset to the Hub.
push_to_hub = False

# --- load ----------------------------------------------------------------
# Only the "test" split is requested from the source dataset.
orig_dataset = load_dataset(ds_name, split="test")

print(orig_dataset)

Found cached dataset csv (/mnt/ssd-2/hf_cache/AkariAsai___csv/AkariAsai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Dataset({
    features: ['id', 'subj', 'prop', 'obj', 'subj_id', 'prop_id', 'obj_id', 's_aliases', 'o_aliases', 's_uri', 'o_uri', 's_wiki_title', 'o_wiki_title', 's_pop', 'o_pop', 'question', 'possible_answers'],
    num_rows: 14267
})


In [3]:
import numpy as np

# Percentile of subject popularity ("s_pop") used as the lower cutoff when
# selecting which rows of the source dataset to keep.
pop_percentile = 90

# Question templates, keyed by the relation's "prop_id".
# The single "{}" placeholder is filled with the subject name via str.format.
q_templates = {
    22: "what is {}'s occupation?",
    218: "in what city was {} born?",
    91: "what genre is {}?",
    257: "who is the father of {}?",
    182: "in what country is {}?",
    164: "who was the producer of {}?",
    526: "who was the director of {}?",
    97: "what is {} the capital of?",
    533: "who was the screenwriter for {}?",
    639: "who was the composer of {}?",
    472: "what color is {}?",
    106: "what is the religion of {}?",
    560: "what sport does {} play?",
    484: "who is the author of {}?",
    292: "who is the mother of {}?",
    422: "what is the capital of {}?",
}

# Declarative-statement templates, keyed by the relation's "prop_id".
# Each template is an unfinished sentence: the subject name replaces "{}" and
# the (true or distractor) object is meant to follow the final word.
s_templates = {
    22: "{}'s occupation is",
    218: "the city of birth of {} is",
    91: "the genre of {} is",
    257: "the father of {} is",
    182: "{} is located in the country",
    164: "the producer of {} was",
    526: "the director of {} was",
    97: "{} is the capital of",
    533: "the screenwriter for {} was",
    639: "the composer of {} was",
    472: "the color of {} is",
    106: "the religion of {} is",
    560: "the sport played by {} is",
    484: "the author of {} is",
    292: "the mother of {} is",
    422: "the capital of {} is",
}

# turn PopQA into a binary dataset with distractors
if ds_name == "AkariAsai/PopQA":
    from datasets import DatasetDict

    # Keep only rows whose subject popularity reaches the configured
    # percentile, then shuffle with a fixed seed so the split is repeatable.
    s_pop_cutoff = np.percentile(orig_dataset["s_pop"], pop_percentile)
    pop_ds = orig_dataset.filter(
        lambda row: row["s_pop"] >= s_pop_cutoff
    ).shuffle(seed=633)

    # 70% train / 15% validation / remainder test, taken as contiguous
    # slices of the shuffled dataset.
    n = len(pop_ds)
    n_train = int(0.7 * n)
    n_val = int(0.15 * n)
    pop_ds_dict = DatasetDict(
        {
            "train": pop_ds.select(range(0, n_train)),
            "validation": pop_ds.select(range(n_train, n_train + n_val)),
            "test": pop_ds.select(range(n_train + n_val, n)),
        }
    )

    def add_distractor(example):
        """Pick a random wrong object for ``example`` from the same relation.

        Candidates are the rows of the enclosing scope's ``pop_ds`` that share
        the example's ``prop_id``, are a different row (``id``), and whose
        object differs from the example's own object (both ``obj_id`` and
        ``obj``). Without the object check a row with the same answer could be
        drawn and later labelled as false even though it is correct.

        The draw uses NumPy's global random state; seed it with
        ``np.random.seed`` beforehand if reproducible distractors are needed.

        Args:
            example: mapping with at least ``id``, ``prop_id``, ``obj`` and
                ``obj_id``.

        Returns:
            dict with keys ``dist_obj``, ``dist_obj_id``, ``dist_o_pop`` and
            ``dist_o_aliases`` copied from the chosen row. When no candidate
            exists, the placeholder ``"42"`` / ``None`` / ``None`` / ``[]`` is
            returned and a message is printed.
        """
        distractor_candidates = pop_ds.filter(
            lambda x: x["prop_id"] == example["prop_id"]
            and x["id"] != example["id"]
            and x["obj_id"] != example["obj_id"]
            and x["obj"] != example["obj"]
        )

        n_candidates = len(distractor_candidates)
        if n_candidates == 0:
            # Explicit empty check instead of relying on a ValueError raised
            # somewhere inside the sampling call.
            print("No distractor found for example", example["id"], "filled with \"42\"")
            return {"dist_obj": "42", "dist_obj_id": None, "dist_o_pop": None, "dist_o_aliases": []}

        # Draw a row index rather than handing the whole dataset to
        # np.random.choice, which would first convert every row to an array.
        distractor = distractor_candidates[int(np.random.randint(n_candidates))]
        return {
            "dist_obj": distractor["obj"],
            "dist_obj_id": distractor["obj_id"],
            "dist_o_pop": distractor["o_pop"],
            "dist_o_aliases": distractor["o_aliases"],
        }

    def make_binary(examples):
        """Expand each input row into a true row and a distractor row.

        Works for any batch size (the original read only the first element of
        every column, silently dropping the rest when ``batch_size > 1`` and
        raising ``IndexError`` on an empty batch).

        Args:
            examples: batch as a mapping of column name -> list of values;
                the columns ``prop_id``, ``subj``, ``obj`` and ``dist_obj``
                are read.

        Returns:
            dict of equal-length lists ``question``, ``statement``, ``object``
            and ``label``. Every input row contributes two consecutive output
            rows with the same question/statement: first the true object
            (label 1), then the distractor object (label 0).
        """
        questions = []
        statements = []
        objects = []
        labels = []

        rows = zip(examples["prop_id"], examples["subj"], examples["obj"], examples["dist_obj"])
        for prop_id, subj, obj, dist_obj in rows:
            # Both output rows share the text built from the relation template.
            q = q_templates[prop_id].format(subj)
            s = s_templates[prop_id].format(subj)

            questions.extend([q, q])
            statements.extend([s, s])
            objects.extend([obj, dist_obj])  # true object first, then distractor
            labels.extend([1, 0])

        return {"question": questions, "statement": statements, "object": objects, "label": labels}

    # Step 1: attach the distractor columns to every row of every split.
    dist_ds = pop_ds_dict.map(add_distractor)

    # Step 2: rebuild each split from make_binary's output. The batched
    # interface is what allows it to return two rows per input row; all
    # original columns are dropped in favour of the ones it returns.
    ds_dict = dist_ds.map(
        make_binary,
        batched=True,
        batch_size=1,
        remove_columns=dist_ds["train"].column_names,
    )


Loading cached processed dataset at /mnt/ssd-2/hf_cache/AkariAsai___csv/AkariAsai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-4e41470b4a1a74e6.arrow
Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/AkariAsai___csv/AkariAsai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-9eb0485fa1451999.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/AkariAsai___csv/AkariAsai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-fe04653ce43770dc.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/AkariAsai___csv/AkariAsai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-9c9c89dc88873a06.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/AkariAsai___csv/AkariAsai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853

                                                               

In [4]:
# Sanity check: look at the first ten rows of the training split.
# Slicing returns a dict of column name -> list of values.
ds = ds_dict["train"]
ds[:10]

{'question': ['who was the producer of Hugo?',
  'who was the producer of Hugo?',
  'who is the father of Mary, Queen of Scots?',
  'who is the father of Mary, Queen of Scots?',
  'what is Prague the capital of?',
  'what is Prague the capital of?',
  'what genre is Simon Le Bon?',
  'what genre is Simon Le Bon?',
  'what is the capital of Kingdom of Italy?',
  'what is the capital of Kingdom of Italy?'],
 'statement': ['the producer of Hugo was',
  'the producer of Hugo was',
  'the father of Mary, Queen of Scots is',
  'the father of Mary, Queen of Scots is',
  'Prague is the capital of',
  'Prague is the capital of',
  'the genre of Simon Le Bon is',
  'the genre of Simon Le Bon is',
  'the capital of Kingdom of Italy is',
  'the capital of Kingdom of Italy is'],
 'object': ['Johnny Depp',
  'Kevin Spacey',
  'James V of Scotland',
  'Earl Woods',
  'Czech Socialist Republic',
  'Kingdom of Essex',
  'pop music',
  'soap opera',
  'Turin',
  'Pago Pago'],
 'label': [1, 0, 1, 0, 1, 0

In [6]:
# Re-type the "label" column as a ClassLabel with two named classes,
# in index order: 0 -> "false", 1 -> "true".
from datasets import ClassLabel

feat_label = ClassLabel(
    num_classes=2,
    names=["false", "true"],
)
ds_dict = ds_dict.cast_column("label", feat_label)

                                                                    

In [7]:
# Write all splits to a local folder whose name records the popularity
# percentile that was used, then display the resulting path.
dirname = "./custom-datasets/"
main_name = f"popqa_{pop_percentile}"
save_path = f"{dirname}{main_name}"
ds_dict.save_to_disk(save_path)
save_path

                                                                                               

'./custom-datasets/popqa_90'

In [8]:
# polished version
# Nothing happens unless the source dataset is PopQA.
is_popqa = ds_name == "AkariAsai/PopQA"

if is_popqa:
    ds_dict.save_to_disk(dirname + main_name)

if is_popqa and push_to_hub:
    # push to HuggingFace datasets (as a public dataset repository)
    ds_dict.push_to_hub(main_name, private=False)

                                                                                               

In [28]:
from datasets import load_dataset

# Load the copy of this dataset that is published under the "atmallen"
# namespace on the Hub.
non_err_ds = load_dataset(f"atmallen/{main_name}")

Downloading readme: 100%|██████████| 651/651 [00:00<00:00, 4.55MB/s]


Downloading and preparing dataset None/None to /mnt/ssd-2/hf_cache/atmallen___parquet/atmallen--popqa_90-595062406061ab66/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 56.3k/56.3k [00:00<00:00, 978kB/s]
Downloading data: 100%|██████████| 15.4k/15.4k [00:00<00:00, 48.0MB/s]
Downloading data: 100%|██████████| 15.4k/15.4k [00:00<00:00, 50.6MB/s]
Downloading data files: 100%|██████████| 3/3 [00:04<00:00,  1.59s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2255.00it/s]
                                                                           

Dataset parquet downloaded and prepared to /mnt/ssd-2/hf_cache/atmallen___parquet/atmallen--popqa_90-595062406061ab66/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 1110.58it/s]


In [31]:
# Scan the validation split for rows whose object mentions "Chris Hughes"
# and print each match's question followed by its label.
for row in non_err_ds["validation"]:
    if "Chris Hughes" not in row["object"]:
        continue
    print(row["question"], row["label"], sep="\n")

Who is the author of It?
0


In [1]:
from datasets import load_dataset

# Load the published dataset from the Hub; no split is given, so every
# available split is loaded.
orig_ds = load_dataset("atmallen/popqa_90")
def lower_first(x):
    """Return ``x`` with its first character lower-cased.

    An empty string is returned unchanged (indexing ``x[0]`` would raise
    ``IndexError``); all characters after the first are left as they are.
    """
    # NOTE(review): this also lower-cases a leading proper noun, e.g.
    # "Prague is the capital of" -> "prague is the capital of". Confirm that
    # is intended for statements that start with the subject name.
    return x[:1].lower() + x[1:]
# Rewrite the two text columns so that each starts with a lower-case letter;
# the other columns are not touched by the mapped function.
ds = orig_ds.map(
    lambda ex: {
        "question": lower_first(ex["question"]),
        "statement": lower_first(ex["statement"]),
    }
)

Found cached dataset parquet (/mnt/ssd-2/hf_cache/atmallen___parquet/atmallen--popqa_90-595062406061ab66/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 336.46it/s]
                                                                  

In [5]:
# Inspect the first ten training rows after the lower-casing step.
ds["train"][:10]

{'question': ['who was the producer of Hugo?',
  'who was the producer of Hugo?',
  'who is the father of Mary, Queen of Scots?',
  'who is the father of Mary, Queen of Scots?',
  'what is Prague the capital of?',
  'what is Prague the capital of?',
  'what genre is Simon Le Bon?',
  'what genre is Simon Le Bon?',
  'what is the capital of Kingdom of Italy?',
  'what is the capital of Kingdom of Italy?'],
 'statement': ['the producer of Hugo was',
  'the producer of Hugo was',
  'the father of Mary, Queen of Scots is',
  'the father of Mary, Queen of Scots is',
  'prague is the capital of',
  'prague is the capital of',
  'the genre of Simon Le Bon is',
  'the genre of Simon Le Bon is',
  'the capital of Kingdom of Italy is',
  'the capital of Kingdom of Italy is'],
 'object': ['Johnny Depp',
  'Kevin Spacey',
  'James V of Scotland',
  'Earl Woods',
  'Czech Socialist Republic',
  'Kingdom of Essex',
  'pop music',
  'soap opera',
  'Turin',
  'Pago Pago'],
 'label': [1, 0, 1, 0, 1, 0

In [4]:
# Upload the lower-cased dataset (all of its splits) to the Hub repository
# named "popqa_90".
ds.push_to_hub("popqa_90")

Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 790.86ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  7.14it/s]
Pushing split validation to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1151.02ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  7.04it/s]
Pushing split test to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1252.03ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.60it/s