In [None]:
# Show which Python interpreter backs this kernel. sys.executable is used instead
# of the Windows-only `where` shell command so the cell also runs on Linux/Colab.
import sys
print(sys.executable)


In [None]:
!pip install transformers datasets



In [None]:
import datasets
import transformers
import torch

# Record the library versions in use so a run can be reproduced later.
print(f"datasets version: {datasets.__version__}")
print(f"transformers version {transformers.__version__}")
print(f"torch version: {torch.__version__}")


datasets version: 4.0.0
transformers version 4.55.2
torch version: 2.8.0+cu126


The block below loads one of the datasets hosted on the Hugging Face Hub using the `datasets` library. The dataset originally ships with only a single train split, so we split it into train and test sets ourselves.

In [None]:
from datasets import load_dataset

# The hub dataset provides only a "train" split, so a test set is held out locally.
dataset = load_dataset("mkessle/public-domain-poetry")

# Hold out 10% of the poems; the fixed seed makes the split repeatable.
train_test_split = dataset["train"].train_test_split(seed=42, test_size=0.1)
train_dataset, test_dataset = train_test_split["train"], train_test_split["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


poem-data.zip:   0%|          | 0.00/47.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38521 [00:00<?, ? examples/s]

Train dataset size: 34668
Test dataset size: 3853


Here we need to tokenize the dataset. This is the only preprocessing stage we perform, since the data we obtained is already preprocessed. We will truncate sequences to a fixed length.

We also apply `map`, which transforms the dataset into a new, tokenized one. This step takes some time.



In [None]:
from transformers import AutoTokenizer

# Load the pretrained GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token   # Reuse the end-of-sequence token for padding (presumably because GPT-2 defines no pad token of its own)


def tokenize_function(examples):
  """Tokenize a batch of poems into fixed-length GPT-2 inputs.

  Args:
    examples: Batch mapping that holds the raw poems under the "Poem Text" key.

  Returns:
    The tokenizer output for the batch, with every sequence truncated or
    padded to exactly 128 tokens.
  """
  # Entries that are not strings (e.g. None) are tokenized as empty text.
  poems = [poem if isinstance(poem, str) else "" for poem in examples["Poem Text"]]
  return tokenizer(poems, max_length=128, padding="max_length", truncation=True)


# Tokenize both splits in batches, dropping the original columns so that only
# the tokenizer outputs remain. The train split is processed first, then test.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, test_dataset)
)

# Quick check
print(tokenized_train[0])
print(tokenized_test[0])

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/34668 [00:00<?, ? examples/s]

Map:   0%|          | 0/3853 [00:00<?, ? examples/s]

{'input_ids': [33477, 1849, 1026, 318, 407, 1464, 10787, 284, 910, 284, 534, 1664, 11, 366, 5703, 6004, 284, 428, 9707, 1, 393, 366, 2061, 466, 345, 892, 286, 428, 329, 257, 22672, 1701, 329, 530, 460, 1239, 307, 1654, 326, 262, 22054, 481, 2754, 262, 2300, 287, 262, 976, 835, 326, 262, 1560, 263, 857, 13, 6430, 994, 318, 257, 1339, 326, 1838, 281, 6631, 284, 428, 922, 3896, 11, 290, 314, 5529, 326, 340, 318, 287, 3872, 7932, 11, 290, 11, 3584, 340, 468, 262, 5585, 286, 852, 257, 277, 540, 11, 340, 318, 287, 3950, 4112, 1109, 13, 198, 201, 198, 33477, 1849, 1858, 373, 1752, 281, 4457, 1468, 20161, 12, 21048, 543, 281, 39610, 11, 326, 18288, 6512, 543, 1629, 1773, 385, 58, 16, 60, 2753], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

The next step is to set up the data collator for language modeling, which will handle batching during training.


In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator that assembles the tokenized examples into batches during training.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # mlm=False: GPT-2 is a causal LM, not a masked LM


The next step is to load the GPT-2 model and resize its token embeddings to account for the padding token we set.

In [None]:
from transformers import AutoModelForCausalLM

# Load the pretrained GPT-2 checkpoint as a causal language model.
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
# NOTE(review): the pad token reuses EOS, so this is presumably a no-op here — confirm.
model.resize_token_embeddings(len(tokenizer))

# Print the model configuration for inspection.
print(model.config)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.55.2",
  "use_cache": true,
  "vocab_size": 50257
}



In [None]:
import os
# Disable the Weights & Biases logging integration via environment variable.
# NOTE(review): transformers reports this variable as deprecated (removal planned
# for v5); passing report_to="none" to TrainingArguments is the suggested replacement.
os.environ["WANDB_DISABLED"] = "true"

Now we reach the training block. So far we have done no preprocessing beyond tokenization and have not explored the data. All we did was set up the main building blocks needed to fine-tune GPT-2: downloading the dataset, tokenizing it, creating the data collator (which assembles the tokenized examples into batches during training), and downloading the pretrained model.

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM


# Start from fresh pretrained weights so that re-running this cell never
# continues from an already fine-tuned model.
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    # Without an eval strategy the Trainer never evaluates, so the eval_dataset
    # passed below would go unused; evaluate once at the end of each epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=5e-5,
    # Supported way to switch off logging integrations such as Weights & Biases;
    # the WANDB_DISABLED environment variable is deprecated.
    report_to="none",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

# Kept as the last expression so the resulting TrainOutput is displayed.
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,4.3172
100,3.8913
150,3.6895
200,3.6439
250,3.5452
300,3.6264
350,3.5362
400,3.5402
450,3.6013
500,3.4978


TrainOutput(global_step=17334, training_loss=3.2917588650870795, metrics={'train_runtime': 1782.5415, 'train_samples_per_second': 19.449, 'train_steps_per_second': 9.724, 'total_flos': 2264618041344000.0, 'train_loss': 3.2917588650870795, 'epoch': 1.0})

In [None]:
import shutil

# 1. Save the fine-tuned model and the tokenizer into the same folder.
trainer.save_model("./gpt2-poetry")
tokenizer.save_pretrained("./gpt2-poetry")

# 2. Compress the folder into gpt2-poetry.zip. shutil is used instead of the
#    `zip` shell command so this step also works where no zip binary is installed.
#    base_dir keeps the "gpt2-poetry/" prefix on every entry inside the archive.
archive_path = shutil.make_archive("gpt2-poetry", "zip", root_dir=".", base_dir="gpt2-poetry")

# 3. Offer the archive as a browser download when running on Google Colab;
#    elsewhere the zip file is simply left in the working directory.
try:
    from google.colab import files
except ImportError:
    print(f"Not running on Colab; archive saved at {archive_path}")
else:
    files.download(archive_path)


  adding: gpt2-poetry/ (stored 0%)
  adding: gpt2-poetry/tokenizer.json (deflated 82%)
  adding: gpt2-poetry/model.safetensors (deflated 7%)
  adding: gpt2-poetry/vocab.json (deflated 59%)
  adding: gpt2-poetry/special_tokens_map.json (deflated 60%)
  adding: gpt2-poetry/merges.txt (deflated 53%)
  adding: gpt2-poetry/generation_config.json (deflated 24%)
  adding: gpt2-poetry/training_args.bin (deflated 53%)
  adding: gpt2-poetry/config.json (deflated 51%)
  adding: gpt2-poetry/tokenizer_config.json (deflated 54%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>