In [1]:
import random

from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, PreTrainedTokenizerBase

In [2]:
def create_warmup_measure_dataset(
    tokenizer: PreTrainedTokenizerBase,
    output_path: str,
    sample_per_bucket: int = 250,
    warmup_samples: int = 20,
    seed: int = 42,
) -> None:
    """Build a length-stratified benchmark dataset and save it to disk.

    The "test" split of ``agentlans/high-quality-english-sentences`` is
    tokenized (without special tokens) and every sentence is assigned to one
    of four token-length buckets: A (0-15), B (16-30), C (31-50), D (51-100).
    Sentences longer than 100 tokens match no bucket and are left out.

    ``sample_per_bucket`` sentences are drawn at random from each bucket for
    the "measurement" split. The "warmup" split consists of ``warmup_samples``
    additional sentences that are drawn from bucket A only. Both splits are
    sorted by ascending token length before being stored as a ``DatasetDict``.

    Args:
        tokenizer: Tokenizer used to measure sentence lengths in tokens.
        output_path: Directory the ``DatasetDict`` is written to.
        sample_per_bucket: Number of measurement sentences per bucket.
        warmup_samples: Number of warm-up sentences (taken from bucket A).
        seed: Seed of the private random generator used for sampling.

    Raises:
        ValueError: If a bucket holds fewer sentences than have to be drawn
            from it.
    """
    ds = load_dataset("agentlans/high-quality-english-sentences", split="test")

    token_lengths = [len(tokenizer.encode(x["text"], add_special_tokens=False)) for x in ds]

    # Inclusive token-length ranges; "indices" collects matching row numbers.
    buckets = {
        "A": {"min": 0, "max": 15, "indices": []},
        "B": {"min": 16, "max": 30, "indices": []},
        "C": {"min": 31, "max": 50, "indices": []},
        "D": {"min": 51, "max": 100, "indices": []},
    }

    for idx, length in enumerate(token_lengths):
        for b in buckets.values():
            if b["min"] <= length <= b["max"]:
                b["indices"].append(idx)
                break

    # A private generator produces the same draws as seeding the module-level
    # one, but leaves the global random state of the notebook untouched.
    rng = random.Random(seed)

    warmup_indices = []
    final_indices = []
    for name, b in buckets.items():
        # Only bucket A is oversampled; the surplus becomes the warm-up split.
        extra = warmup_samples if name == "A" else 0
        needed = sample_per_bucket + extra
        if len(b["indices"]) < needed:
            raise ValueError(
                f"Bucket {name!r} ({b['min']}-{b['max']} tokens) contains only "
                f"{len(b['indices'])} sentences, but {needed} are required."
            )
        drawn = rng.sample(b["indices"], needed)
        warmup_indices.extend(drawn[:extra])
        final_indices.extend(drawn[extra:])

    # sorted() is stable, so sentences of equal length keep their draw order.
    warmup_indices = sorted(warmup_indices, key=lambda i: token_lengths[i])
    final_indices = sorted(final_indices, key=lambda i: token_lengths[i])

    dataset_dict = DatasetDict({
        "warmup": ds.select(warmup_indices),
        "measurement": ds.select(final_indices),
    })

    dataset_dict.save_to_disk(output_path)
    print(f"Datensatz erfolgreich gespeichert unter: {output_path}")


In [3]:
# Configuration: which tokenizer measures the sentence lengths and where the
# resulting benchmark dataset is stored.
model_name = "openai-community/gpt2-xl"
output_path = "../output/benchmark_dataset"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=False,
)

# Builds the warm-up and measurement splits and writes them to output_path.
create_warmup_measure_dataset(tokenizer, output_path)

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Datensatz erfolgreich gespeichert unter: ../output/benchmark_dataset


In [4]:
# Read the saved DatasetDict back from disk and print its summary
# (split names, features and row counts).
dataset = load_from_disk(output_path)
print(dataset)

DatasetDict({
    warmup: Dataset({
        features: ['text'],
        num_rows: 20
    })
    measurement: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})
