# In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# 1. Load the pretrained tokenizer (the "Dictionary") and language model (the "Brain").
PRETRAINED_NAME = "gpt2"  # one checkpoint id shared by both objects so they stay in sync
tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_NAME)
model = GPT2LMHeadModel.from_pretrained(PRETRAINED_NAME)

# 2. Process your Shakespeare file
def load_dataset(file_path, tokenizer, block_size=128, *, overwrite_cache=False):
    """Build a ``TextDataset`` of token blocks from a plain-text file.

    Args:
        file_path: Path of the text file to tokenize.
        tokenizer: Tokenizer forwarded to ``TextDataset`` to encode the file.
        block_size: Block length forwarded to ``TextDataset``.
        overwrite_cache: Forwarded to ``TextDataset``. The default ``False``
            keeps the previous behaviour of this helper. Pass ``True`` after
            the contents of ``file_path`` have changed: per the library
            implementation, ``TextDataset`` stores tokenized features on disk
            under a name built from the tokenizer class, the block size and
            the file name (not the file contents), so a re-downloaded or
            edited file would otherwise silently reuse stale features.

    Returns:
        The ``TextDataset`` instance built from ``file_path``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

# Turn the 'data.txt' we just downloaded into the training dataset.
train_dataset = load_dataset(file_path="data.txt", tokenizer=tokenizer)

# Batch builder for the Trainer; mlm=False leaves masked-language-model masking off.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

# 3. Training rules for a short run (intended for the Colab GPU).
# NOTE(review): no `report_to` is given, so the library default decides which
# logging integrations are enabled — confirm none of them prompts for
# credentials in the target environment.
training_args = TrainingArguments(
    # Output location; existing contents of this folder may be overwritten.
    overwrite_output_dir=True,
    output_dir="./gpt2-shakespeare",
    # Workload: a single epoch as a quick test, 4 samples per device per step.
    per_device_train_batch_size=4,
    num_train_epochs=1,
    # Intervals handed to the Trainer for logging and for saving checkpoints.
    logging_steps=10,
    save_steps=100,
)

# 4. The Trainer: ties together the model, the training rules, the dataset
#    and the collator that assembles batches.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# 5. Run the fine-tuning loop.
print("Starting training... this might take 2-5 minutes.")
trainer.train()

# 6. Write the fine-tuned model and its tokenizer into the same folder so the
#    folder can be loaded back as one unit.
export_dir = "./gpt2-shakespeare"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)
print("Done! Model is saved in the 'gpt2-shakespeare' folder.")