In [1]:
# !pip install transformers accelerate dataset

In [2]:
import os

from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments

Load a dataset from Huggingface

In [3]:
ds = load_dataset('MohamedRashad/characters_backstories')

Found cached dataset parquet (/home/tcapelle/.cache/huggingface/datasets/MohamedRashad___parquet/MohamedRashad--characters_backstories-6398ba4bb1a6e421/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 2322
    })
})

As this dataset has no validation split, we will create one:

In [5]:
ds = ds["train"].train_test_split(test_size=0.2)

In [6]:
model_checkpoint = "roneneldan/TinyStories-33M"  # distilgpt2

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

We can now call the tokenizer on all our texts. This is very simple, using the map method from the Datasets library. First we define a function that call the tokenizer on our texts:

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["target"])

Then we apply it to all the splits in our `datasets` object, using `batched=True` and 4 processes to speed up the preprocessing. We won't need the `text` column afterward, so we discard it.

In [9]:
ds["train"][232]

{'text': 'Generate Backstory based on following information\nCharacter Name: Vaskir Tempus\nCharacter Race: Yuan-ti abomination\nCharacter Class: Paladin of treachery/ goo bladelock\n\nOutput:\n',
 'target': 'Vaskir is an exiled yuan ti who forsook his religion to worship the great old one dendar. he is a master swordsman who wields a greatsword in tandem with a longsword, effectively dualwielding the huge blade. He is chaotic evil, believing that government and law holds back all of humanity from their goals, keeping them oppressed and subjugated under the foot of the highest ruler'}

we want to grab the characters backstories in the `target` column

In [10]:
tokenized_datasets = ds.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text", "target"])

Map (num_proc=4):   0%|          | 0/1857 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2264 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2812 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2573 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2952 > 2048). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/465 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2661 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4725 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2464 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3121 > 2048). Running this sequence through the model will result in indexing errors


In [11]:
tokenized_datasets["train"][1]["input_ids"][0:10]

[13]

In [12]:
tokenizer.decode(tokenized_datasets["train"][1]["input_ids"])

'.'

In [13]:
block_size = 256

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

First note that we duplicate the inputs for our labels. This is because the model of the 🤗 Transformers library apply the shifting to the right, so we don't need to do it manually.

Also note that by default, the `map` method will send a batch of 1,000 examples to be treated by the preprocessing function. So here, we will drop the remainder to make the concatenated tokenized texts a multiple of `block_size` every 1,000 examples. You can adjust this behavior by passing a higher batch size (which will also be processed slower). You can also speed-up the preprocessing by using multiprocessing:

In [14]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/1857 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/465 [00:00<?, ? examples/s]

In [15]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [16]:
os.environ["WANDB_PROJECT"] = "tiny-stories-characters"

In [17]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-characters-backstories",
    report_to="wandb",
    logging_steps=1,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
)

In [19]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m ([33mdeeplearning-ai-temp[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,5.7958,5.142995
2,4.8871,5.009582
3,4.8869,4.998519


TrainOutput(global_step=816, training_loss=5.0707503650702686, metrics={'train_runtime': 92.5014, 'train_samples_per_second': 70.572, 'train_steps_per_second': 8.821, 'total_flos': 284203589566464.0, 'train_loss': 5.0707503650702686, 'epoch': 3.0})

## Generate

In [21]:
model = trainer.model
device = next(model.parameters()).device

In [27]:
prompt = "The hero was half human and cat, his strenghts were"

input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

In [28]:
input_ids

tensor([[  464,  4293,   373,  2063,  1692,   290,  3797,    11,   465, 43071,
           456,   912,   547]], device='cuda:0')

In [29]:
output = model.generate(input_ids, max_length = 128, num_beams=1)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The hero was half human and cat, his strenghts were the first to be. He was the only one who had a lot of power, and he was the only one who had a lot of power. He was a great wizard, and he was the only one who could do it. He was a great wizard, and he was the greatest wizard in the world. He was a great wizard, and he was the greatest wizard in the world. He was a great wizard, and he was the greatest wizard in the world. He was a great wizard, and he was the greatest wizard in the world. He was a great
