In [None]:
pip install "transformers>=4.41" tokenizers datasets accelerate torch

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

specials = ["<bos>", "<eos>", "<pad>"]
trainer = WordLevelTrainer(special_tokens=specials)

tok = Tokenizer(WordLevel(unk_token=None))   # we know every token
tok.pre_tokenizer = Whitespace()
tok.train_from_iterator(["apple", "banana"], trainer)
tok.save("tiny-tokenizer.json")

In [3]:
from datasets import Dataset

def wrap(word):
    return {"text": f"<bos> {word} <eos>"}

train_ds = Dataset.from_list([wrap("apple"), wrap("banana")])
val_ds   = Dataset.from_list([wrap("banana")])


In [7]:
train_ds.save_to_disk("train.arrow")  # Save train_ds to train.arrow
val_ds.save_to_disk("val.arrow")    # Save val_ds to val.arrow

Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

In [4]:
from transformers import PreTrainedTokenizerFast
hf_tok = PreTrainedTokenizerFast(tokenizer_file="tiny-tokenizer.json",
                                 bos_token="<bos>", eos_token="<eos>", pad_token="<pad>")

def encode(batch):
    return hf_tok(batch["text"], padding="max_length", truncation=True,
                  max_length=16, return_tensors="pt")

train_ds = train_ds.map(encode, batched=True, remove_columns=["text"])
val_ds   = val_ds.map(encode, batched=True, remove_columns=["text"])


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [31]:
from transformers import GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments
import torch, datasets, json

# load the YAML
import yaml, argparse
with open("/content/Tiny-GPT-ish-LM.yaml") as f:
    cfg = yaml.safe_load(f)

# config = GPT2Config(**{k: cfg[k] for k in
#                        ["vocab_size","n_positions","n_embd",
#                         "n_layer","n_head","n_inner"]})
config = GPT2Config(
    vocab_size=len(hf_tok),     # ← makes it 5
    n_positions=cfg["n_positions"],
    n_embd=cfg["n_embd"],
    n_layer=cfg["n_layer"],
    n_head=cfg["n_head"],
    n_inner=cfg["n_inner"],
)

model = GPT2LMHeadModel(config)

# pin <pad> to zero loss
model.config.pad_token_id = cfg["pad_token_id"]

train_ds = datasets.load_from_disk("train.arrow")
val_ds   = datasets.load_from_disk("val.arrow")

In [32]:
# Instead of loading from disk, recreate the original datasets
from datasets import Dataset

def wrap(word):
    return {"text": f"<bos> {word} <eos>"}

train_ds = Dataset.from_list([wrap("apple"), wrap("banana")])  # Recreate train_ds
val_ds   = Dataset.from_list([wrap("banana")])              # Recreate val_ds

# Tokenize the dataset within the training loop using the tokenizer
from transformers import PreTrainedTokenizerFast
hf_tok = PreTrainedTokenizerFast(tokenizer_file="tiny-tokenizer.json",
                                 bos_token="<bos>", eos_token="<eos>", pad_token="<pad>")

def tokenize_function(examples):
    return hf_tok(examples["text"], padding="max_length", truncation=True, max_length=16)

tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
tokenized_val_ds = val_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [36]:
def encode(batch):
    enc = hf_tok(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=16,
    )
    enc["labels"] = [
        [(tid if tid != hf_tok.pad_token_id else -100) for tid in seq]
        for seq in enc["input_ids"]
    ]
    return enc

train_ds = train_ds.map(encode, batched=True, remove_columns=["text"])
val_ds   = val_ds.map(encode, batched=True, remove_columns=["text"])

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [37]:
tokenized_train_ds

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2
})

In [38]:
train_ds

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 2
})

In [None]:
args = TrainingArguments(
        output_dir="tiny-gpt",
        per_device_train_batch_size=cfg["trainer"]["per_device_train_batch_size"],
        per_device_eval_batch_size=cfg["trainer"]["per_device_eval_batch_size"],
        num_train_epochs=cfg["trainer"]["num_train_epochs"],
        learning_rate=cfg["trainer"]["learning_rate"],
        # Using evaluation_strategy for newer versions, fallback to do_eval for older
        # evaluation_strategy="epoch" if "evaluation_strategy" in TrainingArguments.__init__.__code__.co_varnames else None,
        # do_eval= True if "evaluation_strategy" not in TrainingArguments.__init__.__code__.co_varnames else None,
        # Using save_strategy for newer versions, fallback to save_total_limit for older
        save_strategy="epoch" if "save_strategy" in TrainingArguments.__init__.__code__.co_varnames else None,
        save_total_limit=1 if "save_strategy" not in TrainingArguments.__init__.__code__.co_varnames else None,
        # evaluation_strategy="epoch",
        # save_strategy="epoch",
        logging_steps=cfg["trainer"]["logging_steps"],
        weight_decay=cfg["trainer"]["weight_decay"],
        lr_scheduler_type=cfg["trainer"]["lr_scheduler_type"],
        warmup_steps=cfg["trainer"]["warmup_steps"])

Trainer(model=model,
        args=args,
        train_dataset=tokenized_train_ds, #train_ds,
        eval_dataset=tokenized_val_ds).train() #val_ds).train()

In [40]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tok,
    mlm=False,            # causal LM, not masked LM
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
)
trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,0.8099
100,0.4621
150,0.3342
200,0.2916
250,0.2745
300,0.2649
350,0.2607
400,0.2588
450,0.2575
500,0.2572


TrainOutput(global_step=500, training_loss=0.347130407333374, metrics={'train_runtime': 16.1076, 'train_samples_per_second': 62.082, 'train_steps_per_second': 31.041, 'total_flos': 2445312000.0, 'train_loss': 0.347130407333374, 'epoch': 500.0})

In [43]:
model.save_pretrained("tiny-gpt")


In [44]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
tok   = PreTrainedTokenizerFast(tokenizer_file="tiny-tokenizer.json",
                                bos_token="<bos>", eos_token="<eos>", pad_token="<pad>")
model = GPT2LMHeadModel.from_pretrained("tiny-gpt")
# tok = PreTrainedTokenizerFast(tokenizer_file="tiny-tokenizer.json",
#                               bos_token="<bos>", eos_token="<eos>", pad_token="<pad>")
# model = GPT2LMHeadModel.from_pretrained("tiny-gpt")

prompt = tok("<bos>", return_tensors="pt")
out = model.generate(**prompt, max_length=8, do_sample=True, top_k=1)
print(tok.decode(out[0]))
# -> "<bos> apple apple apple ..."  or  "<bos> banana banana ..."


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<bos> apple <eos> <eos> <eos> <eos> <eos> <eos>
