In [None]:
import inspect

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load dataset
dataset = load_dataset("yahma/alpaca-cleaned")

# Load tokenizer and model
model_name = "gpt2"
# Pad on the right for training: GPT-2 uses absolute position embeddings, so
# left padding would push every real token to a late position index.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right")
# GPT-2 doesn't have a pad token by default, so EOS is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Format data into a conversational format (system, user, assistant)
def format_conversational(example):
    """Turn one instruction record into a list of chat turns.

    The result always opens with a fixed system prompt followed by the
    instruction as a user turn. A truthy ``input`` field becomes a second
    user turn, and the ``output`` field closes the list as the assistant
    turn.

    Args:
        example: Mapping with ``instruction`` and ``output`` keys and an
            optional ``input`` key.

    Returns:
        A dict with a single ``"conversation"`` key holding the ordered
        list of ``{"role", "content"}`` dicts.
    """
    turns = [
        ("system", "You are a helpful assistant."),
        ("user", example["instruction"]),
    ]

    # An absent or empty input contributes no extra turn.
    extra_context = example.get("input")
    if extra_context:
        turns.append(("user", extra_context))

    turns.append(("assistant", example["output"]))
    return {
        "conversation": [
            {"role": speaker, "content": text} for speaker, text in turns
        ]
    }

# Apply conversational formatting
dataset = dataset.map(format_conversational)


def _tokenize_conversations(batch):
    """Tokenize a batch of conversations, one token sequence per example.

    With ``batched=True`` the ``"conversation"`` column arrives as a list of
    conversations (each a list of turn dicts). Every conversation is
    flattened into a single "role: content" transcript, one turn per line,
    so each example yields exactly one padded/truncated sequence.

    Args:
        batch: Batch mapping holding a ``"conversation"`` column.

    Returns:
        The tokenizer output for the batch, padded/truncated to 512 tokens.
    """
    transcripts = [
        "\n".join(f"{turn['role']}: {turn['content']}" for turn in conversation)
        for conversation in batch["conversation"]
    ]
    return tokenizer(
        transcripts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Tokenize the dataset
tokenized_dataset = dataset.map(_tokenize_conversations, batched=True)

# Data collator (mlm=False: causal language modeling rather than masked LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-conversational-tuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=5e-5,
    # No evaluation is run. "no" is already the default strategy, so the
    # option is left unset instead of passing `evaluation_strategy`, which
    # newer transformers releases renamed to `eval_strategy`.
    fp16=torch.cuda.is_available(),  # Use float16 if on GPU
)

# Trainer
# Newer transformers releases take the tokenizer as `processing_class`, while
# older ones only accept `tokenizer`. Use whichever keyword the installed
# Trainer declares so the script runs on both.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

# Start training
trainer.train()
