In [5]:
# Install dependencies
!pip install transformers datasets torch -q

import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Load pretrained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Fix: Add pad_token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))

# Prepare small custom dataset
custom_data = [
    {"text": "The Open Source community has greatly accelerated the development of LLMs."},
    {"text": "Fine-tuning pretrained models can save significant resources."},
    {"text": "Deployment of optimized models requires less computational power."}
]
dataset = Dataset.from_list(custom_data)

# Tokenize and add labels
def tokenize_fn(example):
    tokens = tokenizer(example["text"], truncation=True, padding='max_length', max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(tokenize_fn)

# Fine-tuning setup
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=1,
    logging_dir='./logs',
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train the model
trainer.train()

# Inference
prompt = "Leveraging open-source models"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

outputs = model.generate(**inputs, max_length=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated text:", generated_text)


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Step,Training Loss
1,9.5202
2,6.511
3,4.4482
4,3.1391
5,1.9115
6,1.2573


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text: Leveraging open-source models of the future.
