## Required Libraries

In [2]:
%pip install transformers datasets torch


Note: you may need to restart the kernel to use updated packages.


# Dataset PreProcessing

In [3]:
from datasets import Dataset
from transformers import AutoTokenizer


with open(r"C:\Users\lenovo\Documents\GitHub\Story_Generator_using_Gen_AI\Stories.txt", "r", encoding="utf-8") as f:
    poems = [p.strip() for p in f.read().split("\n\n") if len(p.strip().split()) > 10]

dataset = Dataset.from_dict({"text": poems})
dataset = dataset.train_test_split(test_size=0.1)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=110
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [4]:
%pip install hf_xet

Note: you may need to restart the kernel to use updated packages.


# Fine Tuning GPT2 Model

In [10]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Load model (use smaller variant if available)
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Optimized training arguments for prototyping
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-prototype",
    num_train_epochs=1,                  
    per_device_train_batch_size=16,      # Increased batch size
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,       # Helps with memory
    learning_rate=5e-5,                  # Slightly higher for faster convergence
    warmup_steps=10,                    # Reduced warmup
    max_steps=100,                      # Limit training steps
    logging_steps=10,
    save_steps=100,
    eval_steps=50,
    fp16=True,                          # Enable mixed precision if GPU available
    optim="adamw_torch",                # Better optimizer
    report_to="none",                   # Disable logging services
    dataloader_num_workers=2,           # Reduced workers
)

# Get actual dataset sizes
train_size = len(tokenized_datasets["train"])
test_size = len(tokenized_datasets["test"])

# Use smaller subsets (10% or max available)
train_subset = tokenized_datasets["train"].select(range(min(50, train_size)))
eval_subset = tokenized_datasets["test"].select(range(min(5, test_size)))  # Very small eval set

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

# Train
trainer.train()

# Save only the model (no tokenizer needed if unchanged)
model.save_pretrained("./gpt2-finetuned-prototype")



Step,Training Loss
10,4.6168
20,3.4289
30,2.7742
40,2.2618
50,1.8668
60,1.5689
70,1.3444
80,1.1844
90,1.0803
100,1.0221


In [None]:
%pip install accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install --upgrade jupyter ipywidgets


Note: you may need to restart the kernel to use updated packages.
