In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

In [2]:
model_name = "gpt2"
data_path  = "data\\shakespeare.txt" 
device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer  = GPT2Tokenizer.from_pretrained(model_name)
model      = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# Fine-Tune With Transformers

In [3]:
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

In [None]:
train_dataset = load_dataset(data_path, tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [5]:
training_args = TrainingArguments(
    output_dir=".models/gpt2-shakespeare",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,  # adjust for your GPU
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [6]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,4.0211
200,3.8057
300,3.7275
400,3.5924
500,3.5979
600,3.6281
700,3.5722
800,3.5639
900,3.5616
1000,3.6296


TrainOutput(global_step=3960, training_loss=3.328194246388445, metrics={'train_runtime': 613.4897, 'train_samples_per_second': 12.91, 'train_steps_per_second': 6.455, 'total_flos': 517358223360000.0, 'train_loss': 3.328194246388445, 'epoch': 3.0})

# Fine-Tune with pytorch

In [3]:
import math, os, random
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
import torch

batch_size     = 2
context_length = 128
embedding_dim  = 768
lr             = 5e-5
grad_accum_steps = 4
epochs         = 3
seed           = 42
random.seed(seed); torch.manual_seed(seed)

<torch._C.Generator at 0x228073025b0>

In [4]:
vocab_size = tokenizer.vocab_size

# ----------------- Data -----------------
def load_data(path):
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    ids = tokenizer.encode(text)
    n = int(0.9 * len(ids))
    return ids[:n], ids[n:]

class TokenDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, block_size=128):
        n = (len(tokens) // block_size) * block_size
        tokens = tokens[:n]
        self.blocks = tokens.view(-1, block_size)

    def __len__(self):
        return self.blocks.size(0)

    def __getitem__(self, idx):
        block = self.blocks[idx]
        return {"input_ids": block, "labels": block}

In [5]:
train_ids, val_ids = load_data(data_path)
train_ds = TokenDataset(torch.tensor(train_ids), context_length)
val_ds   = TokenDataset(torch.tensor(val_ids), context_length)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, drop_last=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
num_training_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
)

for epoch in range(epochs):
    step = 0
    print("epoch: ", epoch+1)
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        if (step+1) % grad_accum_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        if step % 100 == 0:
            print(f"step {step:5d} | train loss {loss.item():.4f}")
        step+=1
torch.save(model.state_dict(), "models\\gpt2-pytorch-finetune.pt")
print("done.")

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


epoch:  1


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


step     0 | train loss 5.1439
step   100 | train loss 4.4551
step   200 | train loss 3.3930
step   300 | train loss 4.7157
step   400 | train loss 3.9994
step   500 | train loss 3.6591
step   600 | train loss 3.4069
step   700 | train loss 4.0664
step   800 | train loss 3.6372
step   900 | train loss 3.1362
step  1000 | train loss 4.1606
step  1100 | train loss 4.3084
epoch:  2
step     0 | train loss 3.3242
step   100 | train loss 3.7773
step   200 | train loss 3.4595
step   300 | train loss 3.1887
step   400 | train loss 3.4277
step   500 | train loss 3.5998
step   600 | train loss 2.8853
step   700 | train loss 4.1786
step   800 | train loss 3.4793
step   900 | train loss 3.5682
step  1000 | train loss 3.3923
step  1100 | train loss 4.2079
epoch:  3
step     0 | train loss 3.7049
step   100 | train loss 3.5700
step   200 | train loss 3.1907
step   300 | train loss 3.6696
step   400 | train loss 3.1983
step   500 | train loss 3.4509
step   600 | train loss 3.4739
step   700 | train 

# Test Fine-Tuned models

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-shakespeare")
model = GPT2LMHeadModel.from_pretrained("./gpt2-shakespeare")

In [7]:
def generate_text(model, tokenizer, prompt, max_new_tokens=50, temperature=1.0, top_k=50):
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt")
    
    with torch.no_grad():
        output = model.generate(
            **(inputs.to(device)),
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            do_sample=True,   # random sampling instead of greedy
            pad_token_id=tokenizer.eos_token_id
        )
    
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# transformer fine-tune
generate_text(model, tokenizer, "WARWICK: \nTrust me, my lord")

"WARWICK: \nTrust me, my lord, that I know that we both intend.\n\nFirst Lord:\nWe'll go with them; see how they proceed.\n\nKING RICHARD III:\nNow, in this first place, it shall be understood\nThat you and"

In [8]:
# pytorch fine-tune
# mine is also faster (hf transformers is writen to handle larger datasets) 
generate_text(model, tokenizer, "WARWICK: \nTrust me, my lord")

'WARWICK: \nTrust me, my lord, you cannot be true prince,\nBut a liar. When I speak it,\nA lie? A prince?\n\nWARWICK:\nA little, a little; not so much as a prince.\n\nKING HENRY'

In [17]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

Total parameters: 124439808
Trainable parameters: 124439808
