In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp "/content/drive/MyDrive/transformers/text.zip" /content/

In [3]:
!unzip text.zip -d ./

Archive:  text.zip
  inflating: ./wikisent2.txt         


In [79]:
!head -n 1500000 wikisent2.txt > wiki_small.txt


In [80]:
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import sentencepiece as spm
from torch.utils.data import Dataset, DataLoader

In [81]:
corpus = 'wiki_small.txt'
model_prefix = 'wiki_spm'

vocab_size = 12000

block_size = 128
batch_size = 32
epochs = 3
lr = 3e-4

embed_dim = 320
num_heads = 5
num_layers = 6
ff_dim = 1280
dropout = 0.1

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [82]:
if not os.path.exists(f'{model_prefix}.model'):

  print('Training tokenizer....')

  spm.SentencePieceTrainer.train(
      input = corpus,
      model_prefix = model_prefix,
      vocab_size = vocab_size,
      model_type = 'bpe',
      character_coverage = 1.0,

      pad_id = 0, pad_piece = '<pad>',
      unk_id = 1, unk_piece = '<unk>',
      bos_id = 2, bos_piece = '<s>',
      eos_id = 3, eos_piece = '</s>',

      hard_vocab_limit = False
  )

  print('Tokenizer trained!')

Training tokenizer....
Tokenizer trained!


In [83]:
sp = spm.SentencePieceProcessor()
sp.load('wiki_spm.model')

vocab_size = sp.get_piece_size()

print('Vocab : ', vocab_size)

Vocab :  12000


In [84]:
def load_tokens(path):

  tokens = []

  with open(path, 'r', encoding = 'utf-8') as f:
    for line in f:
      line = line.strip()

      if not line:
        continue

      ids = sp.encode(line)

      tokens.extend(ids)

  return torch.tensor(tokens, dtype = torch.long)

In [85]:
print('Loading data.....')

tokens = load_tokens(corpus)

print('Total tokens : ', len(tokens))

Loading data.....
Total tokens :  43705641


In [86]:
class Gptdataset(Dataset):

    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) // self.block_size

    def __getitem__(self, idx):
        start = idx * self.block_size
        end = start + self.block_size

        x = self.data[start:end]
        y = self.data[start+1:end+1]

        return x, y


In [87]:
dataset = Gptdataset(tokens, block_size)

loader = DataLoader(
    dataset,
    batch_size = batch_size,
    shuffle = True,
    pin_memory = True
)

In [88]:
class Multihead(nn.Module):

  def __init__(self, dim, heads, dropout):

    super().__init__()

    assert dim % heads == 0

    self.dim = dim
    self.heads = heads
    self.head_dim = dim // heads

    self.wq = nn.Linear(dim, dim)
    self.wk = nn.Linear(dim, dim)
    self.wv = nn.Linear(dim, dim)

    self.proj = nn.Linear(dim, dim)

    self.dropout = nn.Dropout(dropout)


  def forward(self, x, mask):

    B, T, D = x.shape

    Q = self.wq(x).view(B, T, self.heads, self.head_dim).transpose(1, 2)
    K = self.wk(x).view(B, T, self.heads, self.head_dim).transpose(1, 2)
    V = self.wv(x).view(B, T, self.heads, self.head_dim).transpose(1, 2)

    scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.head_dim)

    scores = scores.masked_fill(mask, float('-inf'))

    attn = F.softmax(scores, dim = -1)

    attn = self.dropout(attn)

    out = attn @ V

    out = out.transpose(1, 2).contiguous().view(B, T, D)

    return self.proj(out)

In [89]:
class DecoderBlock(nn.Module):

  def __init__(self, dim, heads, ff_dim, dropout):

    super().__init__()

    self.ln1 = nn.LayerNorm(dim)

    self.attn = Multihead(dim, heads, dropout)

    self.ln2 = nn.LayerNorm(dim)

    self.ff = nn.Sequential(
        nn.Linear(dim, ff_dim),
        nn.GELU(),
        nn.Linear(ff_dim, dim),
        nn.Dropout(dropout)
    )

  def forward(self, x, mask):

    x = x + self.attn(self.ln1(x), mask)

    x = x + self.ff(self.ln2(x))

    return x

In [100]:
class TinyGpt(nn.Module):

  def __init__(self):

    super().__init__()

    self.token_emb = nn.Embedding(vocab_size, embed_dim)
    self.pos_emb = nn.Embedding(block_size, embed_dim)

    self.drop = nn.Dropout(dropout)


    self.blocks = nn.ModuleList([
        DecoderBlock(
            embed_dim, num_heads, ff_dim, dropout
        )
        for _ in range(num_layers)
    ])

    self.ln_f = nn.LayerNorm(embed_dim)
    self.head = nn.Linear(embed_dim, vocab_size, bias = False)

    self.head.weight = self.token_emb.weight

    mask = torch.triu(
        torch.ones(block_size, block_size),
        diagonal = 1
    ).bool()

    self.register_buffer('mask', mask)

  def forward(self, idx):

    B, T = idx.shape

    tok = self.token_emb(idx)

    pos = self.pos_emb(torch.arange(T, device = idx.device))

    x = self.drop(tok + pos)

    mask = self.mask[:T, :T]

    for block in self.blocks:

      x = block(x, mask)

    x = self.ln_f(x)

    logits = self.head(x)

    return logits

  @torch.no_grad()
  def generate(self, idx, max_new, temp = 1.0, top_k = None):

    for _ in range(max_new):

      idx_cond = idx[:, -block_size :]
      logits = self(idx_cond)

      logits = logits[:, -1, :] / temp

      if top_k:

        v, _ = torch.topk(logits, top_k)

        logits[logits < v[:, [-1]]] = -float('inf')

      probs = F.softmax(logits, dim = -1)
      next_id = torch.multinomial(probs, 1)

      idx = torch.cat([idx, next_id], dim = 1)

    return idx


In [101]:
import math

In [102]:
# =====================
# TRAINING
# =====================

model = TinyGpt().to(device)

optimizer = optim.AdamW(model.parameters(), lr=lr)

criterion = nn.CrossEntropyLoss()


print("Params:",
      sum(p.numel() for p in model.parameters()) / 1e6,
      "M")


print("Start training...")


for epoch in range(epochs):

    model.train()

    total_loss = 0


    for step, (x, y) in enumerate(loader):

        x = x.to(device)
        y = y.to(device)


        logits = model(x)


        loss = criterion(
            logits.view(-1, vocab_size),
            y.view(-1)
        )


        optimizer.zero_grad()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()


        total_loss += loss.item()


        if step % 200 == 0:

            print(
                f"Epoch {epoch+1} | Step {step} | Loss {loss.item():.4f}"
            )
    training_loss = total_loss / len(loader)
    train_ppl = math.exp(training_loss)

    print(
    f"Epoch {epoch+1} | "
    f"Train Loss: {training_loss:.4f} | "
    f"Train Perplexity: {train_ppl:.2f}"
)


    torch.save(
        model.state_dict(),
        f"custom_gpt_epoch{epoch+1}.pt"
    )


Params: 11.27936 M
Start training...
Epoch 1 | Step 0 | Loss 199.8322
Epoch 1 | Step 200 | Loss 13.7480
Epoch 1 | Step 400 | Loss 9.0216
Epoch 1 | Step 600 | Loss 7.8364
Epoch 1 | Step 800 | Loss 7.4863
Epoch 1 | Step 1000 | Loss 7.2884
Epoch 1 | Step 1200 | Loss 6.9972
Epoch 1 | Step 1400 | Loss 6.9242
Epoch 1 | Step 1600 | Loss 6.9822
Epoch 1 | Step 1800 | Loss 6.9059
Epoch 1 | Step 2000 | Loss 6.9021
Epoch 1 | Step 2200 | Loss 6.5821
Epoch 1 | Step 2400 | Loss 6.5872
Epoch 1 | Step 2600 | Loss 6.4783
Epoch 1 | Step 2800 | Loss 6.6564
Epoch 1 | Step 3000 | Loss 6.5224
Epoch 1 | Step 3200 | Loss 6.6152
Epoch 1 | Step 3400 | Loss 6.3901
Epoch 1 | Step 3600 | Loss 6.1267
Epoch 1 | Step 3800 | Loss 6.2728
Epoch 1 | Step 4000 | Loss 6.1299
Epoch 1 | Step 4200 | Loss 6.0691
Epoch 1 | Step 4400 | Loss 5.8115
Epoch 1 | Step 4600 | Loss 6.2690
Epoch 1 | Step 4800 | Loss 5.9307
Epoch 1 | Step 5000 | Loss 5.9419
Epoch 1 | Step 5200 | Loss 5.6430
Epoch 1 | Step 5400 | Loss 6.0375
Epoch 1 | Step 

In [104]:
print("\n--- SAMPLE ---\n")


prompt = "Forest"

ids = sp.encode(prompt)

x = torch.tensor(ids).unsqueeze(0).to(device)

out = model.generate(x, 100, temp=0.8, top_k=40)

print(sp.decode(out[0].tolist()))


--- SAMPLE ---

Forest, he was given his best-school stroke with his best-selling album in a "Hiptes" and "PLise's last song as it was recorded. A member of his best-known team, he was the youngest final round with a half-brother competition. A member of his live live album, he became a member of the band in 1953. A member of his success on his debut single and he played one season with his best-selling
