In [28]:
!pip install transformers datasets accelerate torch



In [29]:
import torch
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [30]:
model_name = "distilgpt2"   # safer than gpt2 for Colab

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: distilgpt2
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
transformer.h.{0, 1, 2, 3, 4, 5}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [31]:
stories = [
    "Once upon a time a robot learned human emotions.",
    "A dragon opened a bakery in a small town.",
    "An AI became friend of an astronaut in space.",
    "A magical forest could talk to lost travelers.",
    "A boy found a book that predicts the future."
]

dataset = Dataset.from_dict({"text": stories})

In [32]:
def tokenize_function(examples):
    encodings = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=64
    )

    encodings["labels"] = encodings["input_ids"].copy()
    return encodings

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [33]:
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

In [34]:
training_args = TrainingArguments(
    output_dir="./story-model",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=1,
    save_steps=10,
    learning_rate=5e-5,
    report_to="none"
)

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

In [36]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,9.441807
2,5.173274
3,3.229376
4,2.128618
5,1.617899
6,0.881986
7,0.894068
8,0.85002
9,0.752753


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=9, training_loss=2.7744223872820535, metrics={'train_runtime': 10.9802, 'train_samples_per_second': 1.366, 'train_steps_per_second': 0.82, 'total_flos': 244965703680.0, 'train_loss': 2.7744223872820535, 'epoch': 3.0})

In [38]:
def generate_story(prompt):

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=80,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [39]:
print(generate_story("Once upon a time"))

Once upon a time we were looking at things and what″ we were aiming for that are now going to be on the horizon to meet you and be able to take your experiences and become friends.
