In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling




In [21]:
# Shared setup: checkpoint name, corpus location, compute device, tokenizer and model.
model_name = "gpt2"
# Forward slashes resolve on Windows as well as POSIX, unlike a hard-coded "\\" separator.
data_path  = "data/shakespeare.txt"
device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer  = GPT2Tokenizer.from_pretrained(model_name)
# Put the model on the selected device up front: later cells move input tensors
# to `device`, and a model left on the CPU would then fail with a device mismatch.
model      = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# Fine-Tune with Hugging Face Transformers (Trainer API)

In [3]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` for the text file at ``file_path``.

    Args:
        file_path: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer instance, forwarded as ``tokenizer``.
        block_size: Forwarded as ``block_size`` (defaults to 128).

    Returns:
        The ``TextDataset`` constructed from the three arguments.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

In [None]:
# Turn the corpus file into training examples using load_dataset's default block size.
train_dataset = load_dataset(file_path=data_path, tokenizer=tokenizer)
# mlm=False switches off masked-language-model masking in the collator.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)



In [5]:
# Settings for the Trainer run, grouped by what they control.
training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir=".models/gpt2-shakespeare",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # Amount of training.
    num_train_epochs=3,
    per_device_train_batch_size=2,  # adjust for your GPU
    # Progress reporting.
    logging_steps=100,
)
# Wire the model, the arguments above, the dataset and the collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [6]:
# Keep the TrainOutput so it can still be shown as the cell's result.
train_output = trainer.train()

# With save_steps set, the Trainer only leaves intermediate checkpoint-* folders
# under output_dir. Persist the final weights and the tokenizer explicitly to the
# directory that the later `from_pretrained("./gpt2-shakespeare")` cell reads from.
trainer.save_model("./gpt2-shakespeare")
tokenizer.save_pretrained("./gpt2-shakespeare")
train_output

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,4.0211
200,3.8057
300,3.7275
400,3.5924
500,3.5979
600,3.6281
700,3.5722
800,3.5639
900,3.5616
1000,3.6296


TrainOutput(global_step=3960, training_loss=3.328194246388445, metrics={'train_runtime': 613.4897, 'train_samples_per_second': 12.91, 'train_steps_per_second': 6.455, 'total_flos': 517358223360000.0, 'train_loss': 3.328194246388445, 'epoch': 3.0})

# Fine-Tune with PyTorch (manual training loop)

In [31]:
import math, os, random
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
import torch

# Hyperparameters for the hand-written training loop.
batch_size       = 2      # samples per DataLoader batch
context_length   = 128    # block length handed to TokenDataset
embedding_dim    = 768    # no use visible in the cells shown here
lr               = 5e-5   # AdamW learning rate
grad_accum_steps = 8      # micro-batches per optimizer update
epochs           = 3      # passes over the training loader

# Seed both Python's and PyTorch's random number generators.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x2253079f390>

In [32]:
# Vocabulary size reported by the tokenizer; no later use is visible in this part of the notebook.
vocab_size = tokenizer.vocab_size

# ----------------- Data -----------------
def load_data(path, train_frac=0.9):
    """Read a UTF-8 text file, tokenize it, and split the token ids in two.

    The text is encoded with the notebook-level ``tokenizer``. The split is
    positional: the first ``train_frac`` share of the ids is the training
    part and the remainder is the validation part.

    Args:
        path: Path of the text file to read.
        train_frac: Share of the token ids placed in the first (training)
            list; must lie in ``[0, 1]``. Defaults to ``0.9``.

    Returns:
        A ``(train_ids, val_ids)`` tuple of token-id sequences.

    Raises:
        ValueError: If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f"train_frac must be within [0, 1], got {train_frac!r}")
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    ids = tokenizer.encode(text)
    # Truncate toward zero so the training part never overshoots the requested share.
    n = int(train_frac * len(ids))
    return ids[:n], ids[n:]

class TokenDataset(Dataset):
    """Sliding-window view over a flat sequence of token ids.

    Item ``i`` is a pair of ``torch.long`` tensors: the ``block`` ids starting
    at position ``i``, and the same window shifted one position to the right.
    """

    def __init__(self, data, block):
        """Store the token-id sequence ``data`` and the window length ``block``."""
        self.data = data
        self.block = block

    def __len__(self):
        """Return ``len(data) - block``, but never less than 1."""
        window_count = len(self.data) - self.block
        return window_count if window_count > 1 else 1

    def __getitem__(self, i):
        """Return the ``(inputs, targets)`` tensor pair for window ``i``."""
        stop = i + self.block
        inputs = torch.tensor(self.data[i:stop], dtype=torch.long)
        targets = torch.tensor(self.data[i + 1:stop + 1], dtype=torch.long)
        return inputs, targets

In [None]:
# ----------------- Data -----------------
train_ids, val_ids = load_data(data_path)
train_ds = TokenDataset(train_ids, context_length)
val_ds   = TokenDataset(val_ids, context_length)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
# Built for evaluation; the training loop below does not consume it.
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, drop_last=True)

# ----------------- Model / optimizer -----------------
# Start from fresh pretrained weights so this run does not silently continue from
# whatever an earlier cell did to `model`. Then place the model on `device`
# (inputs are moved there below) and enable training mode.
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# The scheduler advances once per optimizer update, not once per micro-batch, so its
# horizon is counted in updates. A trailing partial accumulation window still yields
# one update per epoch, hence the ceil.
batches_per_epoch = len(train_loader)
tail_batches = batches_per_epoch % grad_accum_steps
updates_per_epoch = math.ceil(batches_per_epoch / grad_accum_steps)
num_training_steps = updates_per_epoch * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
)

# ----------------- Training -----------------
log_every = 100    # optimizer updates between progress lines
updates_done = 0
optimizer.zero_grad()

for epoch in range(epochs):
    print("epoch: ", epoch+1)
    for step, (xb, yb) in enumerate(train_loader):
        xb, yb = xb.to(device), yb.to(device)

        # yb is already xb shifted by one token. Passing it as `labels=` would shift
        # it a second time inside the model, so compute the loss directly instead.
        logits = model(input_ids=xb).logits
        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), yb.reshape(-1))

        # Scale by the size of the current accumulation window so the summed
        # gradient is the mean over that window. The last window of an epoch
        # may be shorter than grad_accum_steps.
        in_tail = step >= batches_per_epoch - tail_batches
        window = tail_batches if in_tail else grad_accum_steps
        (loss / window).backward()

        window_complete = (step + 1) % grad_accum_steps == 0
        last_batch = step + 1 == batches_per_epoch
        if window_complete or last_batch:
            # Clip the fully accumulated gradient right before it is applied.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            updates_done += 1

            if updates_done % log_every == 0:
                print(f"step {step:5d} | train loss {loss.item():.4f}")

# ----------------- Save -----------------
# Create the target directory if needed and build the path portably.
os.makedirs("models", exist_ok=True)
torch.save(model.state_dict(), os.path.join("models", "gpt2-pytorch-finetune.pt"))
print("done.")

# Test the Fine-Tuned Models

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Reload tokenizer and model from ./gpt2-shakespeare; this directory must already
# contain a saved tokenizer and saved model weights.
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-shakespeare")
# Move the reloaded model to `device` (GPU when available) so it sits on the same
# device as the input tensors used for generation.
model = GPT2LMHeadModel.from_pretrained("./gpt2-shakespeare").to(device)

In [13]:
def generate_text(model, tokenizer, prompt, max_new_tokens=50, temperature=1.0, top_k=50):
    """Sample a continuation of ``prompt`` from ``model`` and return it as text.

    The model is switched to eval mode and generation runs without gradient
    tracking. Inputs are moved to the device the model itself lives on, so the
    function does not depend on any notebook-level ``device`` variable.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate(...)``.
        tokenizer: Tokenizer used to encode ``prompt`` and decode the result.
        prompt: Text to continue.
        max_new_tokens: Forwarded to ``generate`` as ``max_new_tokens``.
        temperature: Forwarded to ``generate`` as ``temperature``.
        top_k: Forwarded to ``generate`` as ``top_k``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
        Sampling is enabled, so repeated calls can return different text.
    """
    model.eval()
    # Follow the model's own device instead of a global, so a CPU model also works
    # on a machine where CUDA is available, and vice versa.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            do_sample=True,   # random sampling instead of greedy
            # Reuse the end-of-sequence id as the padding id for generation.
            pad_token_id=tokenizer.eos_token_id
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [16]:
# Sample one continuation with generate_text's default settings; sampling is on,
# so the text differs between runs.
generate_text(model, tokenizer, prompt="WARWICK: \nTrust me, my lord")

"WARWICK: \nTrust me, my lord, that I know that we both intend.\n\nFirst Lord:\nWe'll go with them; see how they proceed.\n\nKING RICHARD III:\nNow, in this first place, it shall be understood\nThat you and"

In [17]:
# Count every parameter element, and separately those that receive gradients.
total_params = 0
trainable_params = 0
for p in model.parameters():
    element_count = p.numel()
    total_params += element_count
    if p.requires_grad:
        trainable_params += element_count

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

Total parameters: 124439808
Trainable parameters: 124439808
