## Imports

In [4]:
from collections import Counter

from datasets import load_dataset, DatasetDict, ClassLabel, load_dataset_builder

In [5]:
# List the local `datasets` directory (long format, including hidden entries,
# human-readable sizes) to see which files are available on disk.
!ls -lah datasets

total 8.5M
drwxr-xr-x 4 augustas Domain Users  33K Jun 28 09:40 .
drwxr-xr-x 4 augustas Domain Users  33K Jun 28 09:41 ..
drwxr-xr-x 3 augustas Domain Users  25K Jun 19 16:07 burns_datasets_VINC_ag_news_ppo_training_raw
drwxr-xr-x 3 augustas Domain Users  25K Jun 19 15:59 burns_datasets_VINC_ag_news_test_raw
-rw-r--r-- 1 augustas Domain Users 188K Jun 19 16:00 burns_datasets_VINC_ag_news_train.parquet
-rw-r--r-- 1 augustas Domain Users 187K Jun 19 16:12 burns_datasets_VINC_ag_news_validation.parquet
-rw-r--r-- 1 augustas Domain Users 3.5M Jun 28 11:59 burns_datasets_VINC_individual_train.parquet
-rw-r--r-- 1 augustas Domain Users 3.3M Jun 28 12:01 burns_datasets_VINC_individual_validation.parquet
-rw-r--r-- 1 augustas Domain Users 1.2M Jun 19 16:14 ppo_training_dataset.parquet


In [6]:
# Inspect the `label` feature declared in the Hub dataset's builder metadata.
vinc_builder = load_dataset_builder("AugustasM/burns-datasets-VINC")
vinc_builder.info.features["label"]

ClassLabel(names=['neg', 'pos'], id=None)

In [7]:
# Local parquet exports, keyed by the split name each file is loaded as.
parquet_data_files = dict(
    train="datasets/burns_datasets_VINC_individual_train.parquet",
    validation="datasets/burns_datasets_VINC_individual_validation.parquet",
)

# Passing a {split: path} mapping yields one split per key.
my_dataset = load_dataset("parquet", data_files=parquet_data_files)
my_dataset

Found cached dataset parquet (/admin/home-augustas/.cache/huggingface/datasets/parquet/default-de975713543d936a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'original_dataset', 'template_name'],
        num_rows: 12400
    })
    validation: Dataset({
        features: ['text', 'label', 'original_dataset', 'template_name'],
        num_rows: 10877
    })
})

In [8]:
# Label distribution of each split, in (train, validation) order.
tuple(Counter(my_dataset[split_name]["label"]) for split_name in ("train", "validation"))

(Counter({1: 6214, 0: 6186}), Counter({0: 5532, 1: 5345}))

In [9]:
# Number of training rows contributed by each source dataset.
train_origin_column = my_dataset["train"]["original_dataset"]
Counter(train_origin_column)

Counter({'ag_news': 1500,
         'amazon_polarity': 1500,
         'dbpedia_14': 1500,
         'glue/qnli': 1500,
         'imdb': 1500,
         'piqa': 1500,
         'super_glue/boolq': 1500,
         'super_glue/rte': 1500,
         'super_glue/copa': 400})

In [10]:
# Number of validation rows contributed by each source dataset.
validation_origin_column = my_dataset["validation"]["original_dataset"]
Counter(validation_origin_column)

Counter({'ag_news': 1500,
         'amazon_polarity': 1500,
         'dbpedia_14': 1500,
         'glue/qnli': 1500,
         'imdb': 1500,
         'piqa': 1500,
         'super_glue/boolq': 1500,
         'super_glue/rte': 277,
         'super_glue/copa': 100})

In [11]:
# Both splits must draw from exactly the same set of source datasets.
train_origins = set(my_dataset["train"]["original_dataset"])
validation_origins = set(my_dataset["validation"]["original_dataset"])
assert train_origins == validation_origins

# Order the sources by their short name, i.e. ignoring any "namespace/" prefix.
dataset_names = sorted(train_origins, key=lambda full_name: full_name.rsplit("/", 1)[-1])
dataset_names

['ag_news',
 'amazon_polarity',
 'super_glue/boolq',
 'super_glue/copa',
 'dbpedia_14',
 'imdb',
 'piqa',
 'glue/qnli',
 'super_glue/rte']

In [12]:
# Show every source under its short name (the part after the last "/").
for dataset_name in dataset_names:
    # A name without "/" comes back unchanged, so no guard is needed.
    dataset_name = dataset_name.rsplit("/", 1)[-1]
    print(f"Dataset: {dataset_name}")

Dataset: ag_news
Dataset: amazon_polarity
Dataset: boolq
Dataset: copa
Dataset: dbpedia_14
Dataset: imdb
Dataset: piqa
Dataset: qnli
Dataset: rte


In [13]:
# For every source dataset, report the majority-class baseline on its
# validation rows followed by the number of rows it has in each split.
for dataset_name in dataset_names:

    def from_current_source(example):
        """Return True when the row originates from the source being processed."""
        return example["original_dataset"] == dataset_name

    train_dataset = my_dataset["train"].filter(from_current_source)
    validation_dataset = my_dataset["validation"].filter(from_current_source)

    # Accuracy of always predicting the most frequent validation label.
    counts = Counter(validation_dataset["label"])
    majority_size = counts.most_common(1)[0][1]
    majority_share = majority_size / sum(counts.values()) * 100
    print(f"Majority class accuracy: {majority_share:.2f}%")

    print(f"{dataset_name}: {train_dataset.num_rows} train, {validation_dataset.num_rows} validation")
    print("----------------------------------")

Loading cached processed dataset at /admin/home-augustas/.cache/huggingface/datasets/parquet/default-de975713543d936a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-6a290448cb40c3f7.arrow
Loading cached processed dataset at /admin/home-augustas/.cache/huggingface/datasets/parquet/default-de975713543d936a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-747ef40c16d99340.arrow
Loading cached processed dataset at /admin/home-augustas/.cache/huggingface/datasets/parquet/default-de975713543d936a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-f4392ddd412c9309.arrow
Loading cached processed dataset at /admin/home-augustas/.cache/huggingface/datasets/parquet/default-de975713543d936a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-30b07705f5e2b8ca.arrow
Loading cached processed dataset at /admin/home-augustas/.cache/huggingface/datasets/parquet/default-de975713543d936a/0.0.0/14a0

Majority class accuracy: 50.60%
ag_news: 1500 train, 1500 validation
----------------------------------
Majority class accuracy: 52.00%
amazon_polarity: 1500 train, 1500 validation
----------------------------------
Majority class accuracy: 50.20%
super_glue/boolq: 1500 train, 1500 validation
----------------------------------
Majority class accuracy: 51.00%
super_glue/copa: 400 train, 100 validation
----------------------------------
Majority class accuracy: 51.80%
dbpedia_14: 1500 train, 1500 validation
----------------------------------
Majority class accuracy: 50.40%
imdb: 1500 train, 1500 validation
----------------------------------
Majority class accuracy: 51.67%
piqa: 1500 train, 1500 validation
----------------------------------
Majority class accuracy: 50.40%
glue/qnli: 1500 train, 1500 validation
----------------------------------
Majority class accuracy: 50.18%
super_glue/rte: 1500 train, 277 validation
----------------------------------


In [13]:
# Build one {train, validation} DatasetDict per source dataset, with `label`
# typed as ClassLabel(names=["neg", "pos"]), and optionally publish it.

# Uploading is an explicit opt-in so that re-running the notebook never pushes
# to the Hub by accident. Set to True to publish the per-source datasets.
PUSH_TO_HUB = False

for dataset_name in dataset_names:
    print(f"Dataset: {dataset_name}")

    # Keep only the rows of the current source; caching is bypassed so the
    # filter is always recomputed for the current value of `dataset_name`.
    train_dataset = my_dataset["train"].filter(lambda example: example["original_dataset"] == dataset_name, load_from_cache_file=False)
    validation_dataset = my_dataset["validation"].filter(lambda example: example["original_dataset"] == dataset_name, load_from_cache_file=False)

    print(Counter(train_dataset["label"]))
    print(Counter(validation_dataset["label"]))

    # `cast_column` returns a new dataset whose data and schema are updated
    # together. This replaces mutating `train_dataset.features` in place and
    # re-mapping with that (train-derived) schema for both splits.
    label_feature = ClassLabel(names=["neg", "pos"])
    train_dataset = train_dataset.cast_column("label", label_feature)
    validation_dataset = validation_dataset.cast_column("label", label_feature)

    new_dataset = DatasetDict({"train": train_dataset, "validation": validation_dataset})
    print(new_dataset["train"].features["label"])
    print(new_dataset["validation"].features["label"])

    # Repository names use only the short name (text after the last "/").
    # A separate variable is used so the loop variable is not overwritten.
    repo_suffix = dataset_name.split("/")[-1]
    if PUSH_TO_HUB:
        new_dataset.push_to_hub(f"AugustasM/burns-datasets-VINC-individual-{repo_suffix}", private=True)

Dataset: ag_news


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({0: 773, 1: 727})
Counter({0: 759, 1: 741})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset: amazon_polarity


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({1: 765, 0: 735})
Counter({0: 780, 1: 720})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset: super_glue/boolq


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({1: 767, 0: 733})
Counter({0: 753, 1: 747})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset: super_glue/copa


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({1: 205, 0: 195})
Counter({1: 51, 0: 49})


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset: dbpedia_14


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({1: 753, 0: 747})
Counter({0: 777, 1: 723})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset: imdb


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({0: 768, 1: 732})
Counter({1: 756, 0: 744})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset: piqa


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({1: 771, 0: 729})
Counter({0: 775, 1: 725})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset: glue/qnli


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({0: 752, 1: 748})
Counter({0: 756, 1: 744})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset: super_glue/rte


Filter:   0%|          | 0/12400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10877 [00:00<?, ? examples/s]

Counter({0: 754, 1: 746})
Counter({0: 139, 1: 138})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Pushing split train to the Hub.


ClassLabel(names=['neg', 'pos'], id=None)
ClassLabel(names=['neg', 'pos'], id=None)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]