<a href="https://colab.research.google.com/github/AnasAlhasan/large-models-course/blob/main/OptimizedGPT2Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Fine-tune a tiny GPT model**

In [None]:
!pip install transformers datasets accelerate


In [7]:
#core imports
import os, time, math, torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM

In [8]:
#Device Check
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: ", device)

# show installed transformer version
import transformers
print("Transformers version:", transformers.__version__)

Device:  cuda
Transformers version: 4.56.0


In [10]:
ModelName = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(ModelName)

if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token


Model = AutoModelForCausalLM.from_pretrained(ModelName)
Model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 2)
    (wpe): Embedding(1024, 2)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-1): 2 x GPT2Block(
        (ln_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=6, nx=2)
          (c_proj): Conv1D(nf=2, nx=2)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=8, nx=2)
          (c_proj): Conv1D(nf=2, nx=8)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=2, out_features=50257, bias=False)
)

In [16]:
#making simple dataset
texts = [
    "Artificial intelligence will change the world.",
    "I am from Jordan and I live in irbid city.",
    "Gym time is the best time in the day.",
    "Technology will evolve using AI."
] * 200

print("Dataset size(number of lines): ", len(texts))


Dataset size(number of lines):  800


[40, 716, 422, 8078, 290, 314, 2107, 287, 4173, 14065, 1748, 13]

In [17]:
#creating a sample dataset
class SimpleTextDataset(Dataset):
  def __init__(self, texts, tokenizer, seq_len=64):
    self.examples = []
    for t in texts:
      enc = tokenizer.encode(t, add_special_tokens=True)
      for i in range(0, len(enc), seq_len):
        chunk = enc[i:i+seq_len]
        if len(chunk) < seq_len:
          chunk = chunk + [tokenizer.pad_token_id] * (seq_len - len(chunk))
        self.examples.append(torch.tensor(chunk, dtype=torch.long))

  def __len__(self):
    return len(self.examples)

  def __getitem__(self, idx):
    return self.examples[idx]

SEQ_LENGTH = 64 # could be changed

dataset = SimpleTextDataset(texts, tokenizer,SEQ_LENGTH)
print("Number of training examples(chunks): ", len(dataset))


Number of training examples(chunks):  800


In [31]:
#Loading Data

BATCH_SIZE = 4 #micro-batch size
ACCUM_STEPS = 4 #gradient accumulation steps
EPOCHS = 100
LEARN_RATE = 5e-5

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
print("Dataloader is ready. Batches per epoch: ",math.ceil(len(dataset)/BATCH_SIZE))

Dataloader is ready. Batches per epoch:  200


In [32]:
#Training and Optimizing
optimizer = torch.optim.AdamW(Model.parameters(), lr=LEARN_RATE)
scaler = torch.cuda.amp.GradScaler(enabled=(device.type=="cuda"))

Model.train()
global_step = 0
for epoch in range(EPOCHS):
  epoch_loss =0.0
  optimizer.zero_grad()
  for step, batch in enumerate(dataloader):
    batch = batch.to(device)

    with torch.cuda.amp.autocast(enabled=(device.type=="cuda")):
      outputs = Model(input_ids= batch, labels=batch)
      loss = outputs.loss / ACCUM_STEPS

    scaler.scale(loss).backward()

    if (step + 1) % ACCUM_STEPS == 0:
      scaler.step(optimizer)
      scaler.update()
      optimizer.zero_grad()
      global_step += 1

    epoch_loss += loss.item() * ACCUM_STEPS


  avg_loss = epoch_loss / len(dataloader)
  print(f"Epoch {epoch+1} completed. Avg loss: {avg_loss: .4f} (global steps: {global_step})")

  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=="cuda"))
  with torch.cuda.amp.autocast(enabled=(device.type=="cuda")):


Epoch 1 completed. Avg loss:  10.6715 (global steps: 50)
Epoch 2 completed. Avg loss:  10.6602 (global steps: 100)
Epoch 3 completed. Avg loss:  10.6488 (global steps: 150)
Epoch 4 completed. Avg loss:  10.6372 (global steps: 200)
Epoch 5 completed. Avg loss:  10.6255 (global steps: 250)
Epoch 6 completed. Avg loss:  10.6136 (global steps: 300)
Epoch 7 completed. Avg loss:  10.6015 (global steps: 350)
Epoch 8 completed. Avg loss:  10.5893 (global steps: 400)
Epoch 9 completed. Avg loss:  10.5769 (global steps: 450)
Epoch 10 completed. Avg loss:  10.5644 (global steps: 500)
Epoch 11 completed. Avg loss:  10.5517 (global steps: 550)
Epoch 12 completed. Avg loss:  10.5388 (global steps: 600)
Epoch 13 completed. Avg loss:  10.5257 (global steps: 650)
Epoch 14 completed. Avg loss:  10.5125 (global steps: 700)
Epoch 15 completed. Avg loss:  10.4991 (global steps: 750)
Epoch 16 completed. Avg loss:  10.4856 (global steps: 800)
Epoch 17 completed. Avg loss:  10.4719 (global steps: 850)
Epoch 1

In [34]:
#saving checkpoints
OUT_DIR = "/content/fintuned_tiny_gpt2"
os.makedirs(OUT_DIR, exist_ok=True)
Model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)
print("Save checkpoint to: ", OUT_DIR)


Save checkpoint to:  /content/fintuned_tiny_gpt2


In [37]:
#Inference and Testing
# Reload the model and weights
tokenizer = AutoTokenizer.from_pretrained(OUT_DIR)
Model = AutoModelForCausalLM.from_pretrained(OUT_DIR).to(device)
Model.eval()

prompt = "Jordan"

inputs = tokenizer(prompt, return_tensors="pt").to(device)

#Generate text

with torch.no_grad():
  generated = Model.generate(**inputs, max_length=80, do_sample=True, top_k=50)


print(tokenizer.decode(generated[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Jordanificial am city Jordan ir factors in best I intelligence. using. intelligenceTechnology deflect Jordan change AI world dayArt ir mutual live city usingificial time in change from am


In [38]:
#Benchmarking
import gc


def time_generation(use_amp):
  Model.to(device)
  Model.eval()
  start = time.time()
  with torch.no_grad():
    if use_amp and device.type == "cuda":
      with torch.cuda.amp.autocast():
        _ = Model.generate(**inputs, max_length=80)
    else:
      _ = Model.generate(**inputs, max_length=80)
  return time.time() - start


t_no_amp = time_generation(False)
t_amp = time_generation(True)
print(f"Generation time - no amp: {t_no_amp: .3f}s || with amp: {t_amp: .3f}s")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  with torch.cuda.amp.autocast():
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generation time - no amp:  0.218s || with amp:  0.398s


In [45]:
#Dynamic Quantization (CPU) demonstration


#load model on cpu
model_cpu = AutoModelForCausalLM.from_pretrained(OUT_DIR).to("cpu")
model_cpu.eval()


#apply dynamic quantization to linear layers

model_q = torch.quantization.quantize_dynamic(model_cpu, {torch.nn.Linear}, dtype=torch.qint8)

#compare file sizes

def dir_size(path):
  total = 0
  for dirpath, dirnames, filenames in os.walk(path):
    for f in filenames:
      fp = os.path.join(dirpath, f)
      total += os.path.getsize(fp)
  return total / (1024*1024)


print("Original save model size (MB): ", dir_size(OUT_DIR))
#Saving quantized model temporarily to measure size
Q_DIR = "/content/quantized_model"
os.makedirs(Q_DIR, exist_ok= True)
torch.save(model_q.state_dict(), os.path.join(Q_DIR, "pytorch_model_quantized.pt"))
print("Quantized model state dict saved (MB): ", dir_size(Q_DIR))

Original save model size (MB):  4.985299110412598
Quantized model state dict saved (MB):  0.49916648864746094


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_q = torch.quantization.quantize_dynamic(model_cpu, {torch.nn.Linear}, dtype=torch.qint8)
