In [None]:
! pip install lightning transformers rich -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.3/71.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m660.0/660.0 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.9/69.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
! cp /content/drive/MyDrive/ERAv1/S17/S17.zip .
! unzip S17.zip

! cp -r /content/S17/GPT/data .

In [None]:
import os
import torch
import random
import logging
from torch import nn
from os.path import exists
from dataset import GPTDataset
import lightning.pytorch as pl
from collections import Counter
from torch.nn import functional as F
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformer import DecoderTransformer
from torchvision import transforms, datasets
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import LearningRateMonitor, RichProgressBar
from utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
)

In [None]:
path_do_data = "data/english.txt"
# Read the corpus through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(path_do_data, encoding="utf-8") as corpus_file:
    data_raw = corpus_file.read()
# Pretrained BERT word-piece tokenizer; its vocabulary size fixes the width of
# the language model's output layer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size

In [None]:
# Tokenize the whole corpus once, then hold out the trailing 10% of the token
# sequence for validation (the first 90% is used for training).
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
# NOTE(review): this rebinds the DEVICE name imported from `utils` in this notebook.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Generation prompt: a single sequence holding only token id 0, created
# directly on the target device.
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)

In [None]:
class GPTLightning(pl.LightningModule):
  """Lightning module that trains a `DecoderTransformer` as a next-token language model.

  Besides the usual per-batch train/validation losses, the module logs an
  "iter loss" at the end of every epoch (an average over a few batches drawn
  with `get_batch`) and prints a text sample after each validation epoch.

  Args:
    train_data: encoded training token sequence, handed to `get_batch`.
    val_data: encoded validation token sequence, handed to `get_batch`.
    batch_size: batch size used when drawing batches for the iter-loss estimate.
    seq_len: context length given to the transformer; also the block size
      passed to `get_batch` and the crop length used during generation.
    embed_dim, n_heads, n_layers, ff_size, dropout: forwarded unchanged to
      `DecoderTransformer`.
    vocab_size: number of token ids. The default is bound, at class-definition
      time, to the notebook-level `vocab_size` variable.
    learning_rate: AdamW learning rate.
  """

  def __init__(self, train_data, val_data, batch_size, seq_len=64, embed_dim=128*6, n_heads=6, n_layers=6, ff_size=256, vocab_size=vocab_size, dropout=0.2, learning_rate=3e-4):
    super().__init__()
    self.seq_len = seq_len
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.train_data = train_data
    self.val_data = val_data
    self.model = DecoderTransformer(n_layers, n_heads, embed_dim, ff_size, seq_len, vocab_size, dropout=dropout)
    self.criterion = nn.CrossEntropyLoss()
    # Keep the raw token sequences out of hparams: otherwise the whole corpus
    # is copied into every checkpoint and handed to the logger.
    self.save_hyperparameters(ignore=["train_data", "val_data"])

  def forward(self, x):
    """Run the wrapped transformer on a batch of token ids and return its logits."""
    return self.model(x)

  def configure_optimizers(self):
    """Optimize every parameter with AdamW at the configured learning rate."""
    return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

  def training_step(self, batch, batch_idx):
    """Compute and log the loss for one `(inputs, targets)` training batch."""
    x, y = batch
    logits = self(x)
    loss = self.calc_loss(logits, y)
    # Log the tensor itself: Lightning detaches it, and skipping `.item()`
    # avoids forcing a device-to-host sync in user code on every step.
    self.log('train_loss', loss, prog_bar=True, on_epoch=True, on_step=True, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute and log the loss for one `(inputs, targets)` validation batch."""
    x, y = batch
    logits = self(x)
    loss = self.calc_loss(logits, y)
    self.log('val_loss', loss, prog_bar=True, on_epoch=True, on_step=True, logger=True)
    return loss

  def on_train_epoch_end(self):
    """Log the iter-loss estimate on the training sequence."""
    iter_loss = self.estimate_iter_loss(self.train_data)
    self.log('train_iter_loss', iter_loss.item(), prog_bar=True, on_epoch=True, logger=True)

  def on_validation_epoch_end(self):
    """Log the iter-loss estimate on the validation sequence and print a text sample."""
    iter_loss = self.estimate_iter_loss(self.val_data)
    # Build the single-token (id 0) prompt on the module's own device instead
    # of relying on a notebook-level tensor that may live elsewhere.
    start_tokens = torch.zeros((1, 1), dtype=torch.long, device=self.device)
    # NOTE: `tokenizer` is the notebook-level tokenizer object.
    self.generate_text(start_tokens, tokenizer, max_new_tokens=100)
    self.log('val_iter_loss', iter_loss.item(), prog_bar=True, on_epoch=True, logger=True)

  def optimizer_zero_grad(self, epoch, batch_idx, optimizer):
    """Reset gradients by setting them to `None` rather than filling with zeros."""
    optimizer.zero_grad(set_to_none=True)

  @torch.no_grad()
  def estimate_iter_loss(self, data, eval_iters=10):
    """Return the mean loss over `eval_iters` batches drawn from `data` via `get_batch`.

    Runs without autograd and in eval mode (dropout disabled), so the estimate
    is cheap and comparable between the train and validation hooks. The
    previous train/eval mode is restored before returning.
    """
    losses = torch.zeros(eval_iters)
    was_training = self.training
    self.eval()
    try:
      for k in range(eval_iters):
        # presumably get_batch returns tensors already on the model's device — TODO confirm in utils
        x, y = get_batch(data, self.seq_len, self.batch_size)
        logits = self(x)
        losses[k] = self.calc_loss(logits, y).item()
    finally:
      self.train(was_training)
    return losses.mean()

  def calc_loss(self, logits, targets):
    """Cross-entropy between 3-D logits `(B, T, C)` and targets holding `B * T` class ids."""
    B, T, C = logits.shape
    # CrossEntropyLoss is applied to one row per token, so fold batch and time
    # into a single leading axis.
    logits = torch.reshape(logits, (B * T, C))
    targets = torch.reshape(targets, (B * T,))
    return self.criterion(logits, targets)

  @torch.no_grad()
  def generate(self, idx, max_new_tokens):
    """Sample `max_new_tokens` tokens autoregressively and append them to `idx`.

    Args:
      idx: 2-D tensor of token ids (batch, time) used as the prompt.
      max_new_tokens: number of tokens to sample.

    Returns:
      The prompt with the sampled tokens concatenated along the last axis.

    Sampling runs without autograd and in eval mode so dropout does not
    perturb the predicted distribution; the previous mode is restored after.
    """
    was_training = self.training
    self.eval()
    try:
      # Make sure the prompt lives where the weights do.
      idx = idx.to(self.device)
      for _ in range(max_new_tokens):
        # Only the most recent `seq_len` tokens are fed to the model.
        idx_crop = idx[:, -self.seq_len:]
        # Keep the logits of the final position only.
        logits = self.model(idx_crop)[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=-1)
    finally:
      self.train(was_training)
    return idx

  def generate_text(self, context, tokenizer, max_new_tokens=100):
    """Generate from `context`, decode the first sequence and print it with a separator."""
    enc_sec = self.generate(idx=context, max_new_tokens=max_new_tokens)[0]
    print(decode(
          enc_sec=enc_sec,
          tokenizer=tokenizer)
    )
    print('==================================================================')

In [None]:
# Train on a single device of whatever accelerator 'auto' resolves to.
# `devices=1` is valid for both GPU and CPU, whereas the previous CPU fallback
# passed `None`, which is not a documented value for `devices` in Lightning 2.x.
trainer = pl.Trainer(
    log_every_n_steps=1,
    check_val_every_n_epoch=5,  # validation (and text sampling) every 5th epoch
    enable_model_summary=True,
    max_epochs=10,
    accelerator='auto',
    devices=1,
    logger=[TensorBoardLogger("logs/", name="GPT")],
    callbacks=RichProgressBar(leave=True),
)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Wrap each token split in a GPTDataset built with BLOCK_SIZE
# (presumably the window length per sample — confirm in dataset.py).
train_ds, val_ds = (GPTDataset(split, BLOCK_SIZE) for split in (train_data, val_data))

# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE)

In [None]:
# Tie the model's context length to the block size the datasets were built
# with, instead of silently relying on the constructor default matching it.
model = GPTLightning(train_data, val_data, BATCH_SIZE, seq_len=BLOCK_SIZE)
trainer.fit(model, train_dataloader, val_dataloader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
# Final validation pass over the held-out loader; the returned list of metric
# dicts is the cell's displayed value.
trainer.validate(model=model, dataloaders=val_dataloader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'val_loss_epoch': 10.112564086914062, 'val_iter_loss': 10.185652732849121}]

In [None]:
# Prompt with a single token id 0 and sample 100 new tokens. The model is
# moved to the same DEVICE as the prompt (rather than hard-coding CUDA) so the
# cell also runs on CPU-only machines.
context = torch.zeros((1, 1), dtype=torch.long).to(DEVICE)
model.to(DEVICE).generate_text(context, tokenizer, max_new_tokens=100)

[PAD] in the ways of model. negative log - likelihood becomes unhappy at smaller values, where it would be limited in capturing " different " weight " kinds of relationships. this is exactly the same patch exactly just that embeddings of each mnist character. if we want to know if a dog is a lab, german shepherd, golden retriever, or poodle, it's somewhere there in the dog of the image's embeddings. if we want to know if the vehicle
