In [92]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
from datasets import load_dataset

from transformers import AutoTokenizer

In [94]:
def get_tokenizer(model_str: str, **kwargs):
    """Load the tokenizer for ``model_str``, preferring the fast implementation.

    Args:
        model_str: Model identifier handed to ``AutoTokenizer.from_pretrained``.
        **kwargs: Forwarded unchanged to ``AutoTokenizer.from_pretrained``.
            ``verbose`` (default ``True``) additionally controls whether the
            fallback notice is printed.

    Returns:
        The loaded tokenizer. If it has no ``pad_token``, its ``eos_token`` is
        assigned as the pad token before returning.
    """

    def _load(fast: bool):
        # Single place that knows how the tokenizer is constructed.
        return AutoTokenizer.from_pretrained(model_str, use_fast=fast, **kwargs)

    try:
        loaded = _load(True)
    except Exception as err:
        # Best-effort fallback: any failure of the fast tokenizer triggers a
        # second attempt with the slow one.
        if kwargs.get("verbose", True):
            print(f"Falling back to slow tokenizer; fast one failed: '{err}'")
        loaded = _load(False)

    # Reuse EOS for padding when no pad token is defined.
    has_pad_token = getattr(loaded, "pad_token", None) is not None
    if not has_pad_token:
        loaded.pad_token = loaded.eos_token

    return loaded

# Build the GPT-J tokenizer and display which tokenizer class was loaded.
tokenizer = get_tokenizer("EleutherAI/gpt-j-6B")
type(tokenizer)

Using pad_token, but it is not set yet.


transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast

In [96]:
# Maximum sequence length reported by the tokenizer for this model.
tokenizer.model_max_length

2048

In [97]:
# Load the training split and drop every example whose source dataset is PIQA.
dataset = load_dataset(
    "AugustasM/burns-ppo-training-dataset", split="train"
).filter(lambda x: x["original_dataset"] != "piqa")
# Optional extra filter, currently disabled:
# dataset = dataset.filter(lambda x: x["original_dataset"] != "super_glue/copa")
dataset

Found cached dataset parquet (/admin/home-augustas/.cache/huggingface/datasets/AugustasM___parquet/AugustasM--burns-ppo-training-dataset-acfd6af5c398b8b6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
Loading cached processed dataset at /admin/home-augustas/.cache/huggingface/datasets/AugustasM___parquet/AugustasM--burns-ppo-training-dataset-acfd6af5c398b8b6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-5dcada4795f777c4.arrow


Dataset({
    features: ['original_dataset', 'template_name', 'prompt', 'best_response'],
    num_rows: 8718
})

In [98]:
# GPT-J 6B prompts fit without truncation; re-check this for other models.
def tokenize(batch, max_length=1024):
    """Tokenize the prompts of a batch, padding each one to ``max_length``.

    Uses the module-level ``tokenizer``. No truncation is requested, so the
    caller is responsible for choosing a ``max_length`` that fits every prompt.

    Args:
        batch: Mapping with a ``"prompt"`` entry holding the texts to encode.
        max_length: Length every encoded prompt is padded to.

    Returns:
        Whatever the tokenizer returns for the batch, requested as PyTorch
        tensors.
    """
    encode_options = {
        "padding": "max_length",
        "max_length": max_length,
        "return_tensors": "pt",
    }
    prompts = batch["prompt"]
    return tokenizer(prompts, **encode_options)

In [99]:
tokenizer = get_tokenizer("EleutherAI/gpt-j-6B")

# Longest prompt in tokens; used below as the common padding length.
# Plain token-id lists are enough for counting, so no tensors are built here.
prompt_max_len = max(
    len(tokenizer(row["prompt"])["input_ids"]) for row in dataset
)
print(prompt_max_len)

# Pad every prompt to the longest one, then expose only the model inputs
# (`input_ids`, `attention_mask`) as torch tensors.
processed_dataset = dataset.map(
    tokenize, batched=True, fn_kwargs={"max_length": prompt_max_len}
)
processed_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
processed_dataset

Using pad_token, but it is not set yet.
Loading cached processed dataset at /admin/home-augustas/.cache/huggingface/datasets/AugustasM___parquet/AugustasM--burns-ppo-training-dataset-acfd6af5c398b8b6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-0ac5a196cc334eea.arrow


572


Dataset({
    features: ['original_dataset', 'template_name', 'prompt', 'best_response', 'input_ids', 'attention_mask'],
    num_rows: 8718
})

In [100]:
# Every padded row should contain exactly as many tokens as the first one.
first_len = processed_dataset[0]["input_ids"].shape[0]
print(
    first_len,
    all(len(row["input_ids"]) == first_len for row in processed_dataset),
    sep="\n",
)

572
True


In [104]:
# A single distinct length means every row was padded to the same size.
len({row["input_ids"].shape[0] for row in processed_dataset}) == 1

True

In [101]:
tokenizer = get_tokenizer("EleutherAI/gpt-j-6B")

# Longest target response, in tokens. Stored under its own name so the prompt
# padding length (`prompt_max_len`) is not overwritten by a response statistic.
response_max_len = max(
    len(tokenizer(row["best_response"])["input_ids"]) for row in dataset
)
print(response_max_len)

Using pad_token, but it is not set yet.


11


In [105]:
tokenizer = get_tokenizer("EleutherAI/gpt-j-6B")

# Shortest target response, in tokens. Stored under its own name: this is a
# minimum over responses, not the prompt padding length (`prompt_max_len`).
response_min_len = min(
    len(tokenizer(row["best_response"])["input_ids"]) for row in dataset
)
print(response_min_len)

Using pad_token, but it is not set yet.


1


In [79]:
# Show the first example as the model would see it: the prompt immediately followed by its target response.
print(dataset[0]["prompt"], dataset[0]["best_response"], sep="")

Consider the following example: ''' 49ers #39; home to be renamed Monster Park after stereo cable company Some fans think 44-year-old Candlestick Park is already a dinosaur. Now the San Francisco 49ers #39; home stadium has the name to match. ''' Choice 1: Sports. Choice 2: World politics.Between choice 1 and choice 2, what is the topic of this example? Sports


## Trash bin

In [59]:
tokenizer = get_tokenizer("EleutherAI/gpt-j-6B")

# Find the longest prompt in the dataset, its token length, and the dataset it
# came from. The running maximum and the result are kept in the variables
# initialised here, so the cell does not depend on names left by other cells.
prompt_max_len = 0
longest_prompt = ""
longest_dataset = ""
for row in dataset:
    n_tokens = len(tokenizer(row["prompt"])["input_ids"])
    if n_tokens > prompt_max_len:
        prompt_max_len = n_tokens
        longest_prompt = row["prompt"]
        longest_dataset = row["original_dataset"]

prompt_max_len, longest_dataset, longest_prompt

Using pad_token, but it is not set yet.


572


In [53]:
tokenizer = get_tokenizer("EleutherAI/gpt-j-6B")

# Find the longest best_response, its token length, and the dataset it came from.
max_len = 0
longest_best_response = ""
longest_dataset = ""
for row in dataset:
    response = row["best_response"]
    # Token-id lists are sufficient for counting; no tensors are needed.
    n_tokens = len(tokenizer(response)["input_ids"])
    if n_tokens > max_len:
        max_len = n_tokens
        longest_best_response = response
        longest_dataset = row["original_dataset"]

max_len, longest_dataset, longest_best_response

Using pad_token, but it is not set yet.


Yes


(11, 'super_glue/copa', 'I shut off the light in the unoccupied room.')