# [Study] Encode Dataset

In [None]:
# dataset download
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-03-25 05:42:50--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-03-25 05:42:50 (27.7 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# file inspection: read the whole corpus into `text`, then show its size and a preview
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

# one call prints the length line followed by the blank separator line
print("length of dataset in characters: ", len(text), end="\n\n")
print(text[:1000])

length of dataset in characters:  1115394

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hun

In [None]:
# unique characters in dataset
chars = sorted(set(text)) # every distinct character, in sorted order
vocab_size = len(chars)
# the vocabulary on one line, its size on the next
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# label encoding
# Lookup tables between characters and their position in the sorted vocabulary `chars`.
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

def encode(s):
  """Return the list of integer ids for the characters of string `s`.

  Raises KeyError if `s` contains a character that is not in the vocabulary.
  """
  return [stoi[c] for c in s]

def decode(l):
  """Return the string spelled by the iterable of integer ids `l`.

  Raises KeyError for an id that has no entry in `itos`.
  """
  return ''.join(itos[i] for i in l)

# round trip: encoding then decoding gives back the original string
print(encode("gpt- is all you need"))
print(decode(encode("gpt- is all you need")))

[45, 54, 58, 7, 1, 47, 57, 1, 39, 50, 50, 1, 63, 53, 59, 1, 52, 43, 43, 42]
gpt- is all you need


In [None]:
# encode dataset: the whole corpus as one 1-D tensor of int64 character ids
import torch
data = torch.tensor(
    encode(text),
    dtype=torch.long,
)
print(data.shape, data.dtype)
print(data[0:1000]) # ids of the first 1000 characters

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

# [Study] Make Batch

In [None]:
# split into training and validation sets: first 90% of the ids train, the rest validate
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# A window of block_size + 1 ids: the inputs plus the one extra id needed as the last target.
block_size = 8
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
# example: every prefix of x is a context whose target is the id that follows it
x = train_data[:block_size]
y = train_data[1:block_size+1] # same window moved one position ahead
for t, target in enumerate(y):
  context = x[:t+1]
  print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [None]:
torch.manual_seed(42)
batch_size = 4 # number of sequences drawn per batch
block_size = 8 # maximum context length for predictions

def get_batch(split): # random batch
  """Draw `batch_size` random windows of `block_size` ids from one split.

  `split` equal to 'train' reads train_data; any other value reads val_data.
  Returns (x, y), both of shape (batch_size, block_size), where y holds the
  ids one position after the corresponding ids in x.
  """
  source = train_data if split == 'train' else val_data
  # start offsets; the upper bound leaves room for the one-step-ahead targets
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s+block_size] for s in starts])
  targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
  return inputs, targets

xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
# targets are the inputs moved one position ahead in the text
print('targets:', yb.shape, yb, sep='\n')

print('----')

# spell out every (context, target) pair contained in the batch
for b in range(batch_size):
  for t in range(block_size):
    context = xb[b, :t+1] # row b, columns 0 through t
    target = yb[b,t] # row b, column t
    print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[57,  1, 46, 47, 57,  1, 50, 53],
        [ 1, 58, 46, 43, 56, 43,  1, 41],
        [17, 26, 15, 17, 10,  0, 32, 53],
        [57, 58,  6,  1, 61, 47, 58, 46]])
targets:
torch.Size([4, 8])
tensor([[ 1, 46, 47, 57,  1, 50, 53, 60],
        [58, 46, 43, 56, 43,  1, 41, 39],
        [26, 15, 17, 10,  0, 32, 53,  1],
        [58,  6,  1, 61, 47, 58, 46,  0]])
----
when input is [57] the target: 1
when input is [57, 1] the target: 46
when input is [57, 1, 46] the target: 47
when input is [57, 1, 46, 47] the target: 57
when input is [57, 1, 46, 47, 57] the target: 1
when input is [57, 1, 46, 47, 57, 1] the target: 50
when input is [57, 1, 46, 47, 57, 1, 50] the target: 53
when input is [57, 1, 46, 47, 57, 1, 50, 53] the target: 60
when input is [1] the target: 58
when input is [1, 58] the target: 46
when input is [1, 58, 46] the target: 43
when input is [1, 58, 46, 43] the target: 56
when input is [1, 58, 46, 43, 56] the target: 43
when input is [1, 58, 46,

# [Study] Model setting

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
  """Bigram model: the logits for the next token are looked up from the current token alone."""

  def __init__(self, vocab_size):
    super().__init__()
    # row i of the table holds the next-token logits that follow token i
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # num_embeddings, embedding_dim

  def forward(self, idx, targets=None):
    """Return (logits, loss) for a batch of token ids.

    Without targets: logits has shape (B, T, C) and loss is None.
    With targets: logits is flattened to (B*T, C) and loss is the
    cross-entropy against the flattened targets.
    """
    logits = self.token_embedding_table(idx) # (B:batch, T:block size, C:vocab size)
    if targets is None:
      return logits, None

    batch, time, channels = logits.shape
    flat_logits = logits.view(batch*time, channels)
    flat_targets = targets.view(batch*time)
    return flat_logits, F.cross_entropy(flat_logits, flat_targets)

  def generate(self, idx, max_new_tokens):
    """Append max_new_tokens sampled ids to idx: (B, T) -> (B, T + max_new_tokens)."""
    for _step in range(max_new_tokens):
      step_logits, _unused = self(idx) # forward(idx)
      last_logits = step_logits[:, -1, :] # last time step, (B, C)
      next_probs = F.softmax(last_logits, dim=-1)
      sampled = torch.multinomial(next_probs, num_samples=1) # one id drawn from the distribution, (B, 1)
      idx = torch.cat((idx, sampled), dim=1) # (B, T+1)
    return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb) # idx, targets

# flattened logits shape, then the loss of the untrained model
print(logits.shape, loss, sep='\n')
# sample 100 new ids starting from a single id 0 and show them as text
print(decode(
    m.generate(
        idx=torch.zeros((1, 1), dtype=torch.long),
        max_new_tokens=100,
    )[0].tolist()
))

torch.Size([32, 65])
tensor(4.8865, grad_fn=<NllLossBackward0>)

uoiaF$z
M?kI;h
DbuMG,H3LYNmrDxKgTpvAKOF-jU.hc;fBMTGa-IS
g3lEb&ZQ,l;:m;lpcNN
KpVEYRIIM,'hCRbMAcWTkrnH


In [None]:
# AdamW over all parameters of the bigram model `m`
optimizer = torch.optim.AdamW(
    m.parameters(),
    lr=1e-3,
)

In [None]:
batch_size = 32 # get_batch reads this global, so the batches below hold 32 sequences
for step in range(100):
  xb, yb = get_batch('train') # fresh random batch

  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True) # drop gradients left over from the previous step
  loss.backward()
  optimizer.step()

# loss of the last batch only
print(loss.item())

4.696559429168701


In [None]:
# sample 500 new ids from the briefly trained bigram model, starting from a single id 0
print(decode(
    m.generate(
        idx=torch.zeros((1, 1), dtype=torch.long),
        max_new_tokens=500,
    )[0].tolist()
))


gU&el.yH.kVeubl;UDJ.jYwVFkmtY;iI- ,3kS;iOm;fxtIw,3QgWHGm;wna3Up'Sjzbh yRNw,.JE MfgB?HcUXzJk-IXGXZHBCl;cXZzAGtC;RfxW&Ij-qXzpffUy$,wUulr!vvKpHba-
dw,gTgToxVirWdVde3LiyZnvssM'YkxdtbXzBGUX?WrYb&I,,gjPqEm?quq!g3mWmF&h'STPcYeXMgsqT;BURxokghAGHrfS?WrJZkFstPGHb&ChSxnpgrY.VzBNw'H$&jvT;LDIMvkJ.cN
iuovTB!bBQWvXgloRqJ
WC
Nxxd3T,VB?-&he'oRD;bWdht.cbp,MpquPMCUjVFfZiIYiRFXhWgZV:Jok&Lq,cWdV:'NqfgBZqaqnGUfg3Yt&KS?kaqWD gZH;.WXhSsysblcc.IA'Q,qWVWHKb.n;gTizQHbMi3LnI'VXE&.wTkFRYICPnjixl'jGfgZAJWChjUpFqckT-jvPaUqfn'


# Main

In [20]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

<torch._C.Generator at 0x7a4cc83ad830>

In [25]:
# hyperparameters (not equal) -- NOTE(review): "not equal" presumably means these values
# differ from the reference being studied; confirm.

# batching
batch_size = 16
block_size = 12 # maximum context length for predictions

# model size
n_embed = 64 # embedding width
n_head = 4 # attention heads per block
n_layer = 4 # number of stacked blocks
dropout = 0.0

# optimisation
max_iters = 5000
learning_rate = 1e-3

# evaluation
eval_interval = 100 # estimate the losses every this many training steps
eval_iters = 200 # batches averaged per split for each estimate

# runtime
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [22]:
! wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

--2024-03-25 10:28:58--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-03-25 10:28:58 (20.4 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [None]:
chars = sorted(set(text)) # unique characters, sorted
vocab_size = len(chars)

# lookup tables between characters and their position in `chars`
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

def encode(s):
  """string > list of integers. Raises KeyError for a character outside the vocabulary."""
  return [stoi[c] for c in s]

def decode(l):
  """list of integers > string. Raises KeyError for an id that has no entry in `itos`."""
  return ''.join([itos[i] for i in l])

In [None]:
# whole corpus as int64 ids, then a contiguous 90% / 10% train / validation split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data)) # index where the validation part starts
train_data, val_data = data[:n], data[n:]

In [None]:
def get_batch(split):
  """Return a random (x, y) batch from one split, moved to `device`.

  `split` equal to 'train' reads train_data; any other value reads val_data.
  Both tensors have shape (batch_size, block_size); y holds the ids one
  position after the corresponding ids in x.
  """
  source = train_data if split == 'train' else val_data
  # random start offsets, bounded so the one-step-ahead targets stay inside the split
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s+block_size] for s in starts])
  targets = torch.stack([source[s+1:s+block_size+1] for s in starts])

  return inputs.to(device), targets.to(device)

In [None]:
class Head(nn.Module):
  """One causal self-attention head.

  Projects the input to key / query / value vectors of width `head_size`;
  each position attends only to itself and to earlier positions.
  """

  def __init__(self, head_size):
    super().__init__()
    self.head_size = head_size # kept for the score scaling in forward()
    # bias-free projections from the embedding width down to head_size
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    # lower-triangular ones; registered as a buffer, so it is not a trainable parameter
    causal_mask = torch.ones(block_size, block_size).tril()
    self.register_buffer('tril', causal_mask)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Map x of shape (B, T, n_embed) to (B, T, head_size); T is the sequence length."""
    _, seq_len, _ = x.shape
    keys = self.key(x) # (B, T, head_size)
    queries = self.query(x) # (B, T, head_size)
    values = self.value(x) # (B, T, head_size)

    # attention scores, scaled by 1/sqrt(head_size)
    scores = queries @ keys.transpose(-2, -1) * self.head_size**-0.5 # (B, T, T)
    # positions after the current one get -inf, i.e. zero weight after the softmax
    is_future = self.tril[:seq_len, :seq_len] == 0
    scores = scores.masked_fill(is_future, float('-inf'))
    weights = self.dropout(F.softmax(scores, dim=-1)) # (B, T, T)

    return weights @ values # (B, T, head_size)

In [None]:
class MultiHeadAttention(nn.Module):
  """Several attention Heads run side by side, concatenated and projected to n_embed."""

  def __init__(self, num_heads, head_size):
    """Create `num_heads` heads of width `head_size` plus the output projection."""
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    # The concatenated head outputs are num_heads * head_size wide. That equals n_embed
    # only when n_embed is divisible by the head count, so size the projection input from
    # the real width instead of assuming n_embed.
    self.proj = nn.Linear(num_heads * head_size, n_embed)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Return the projected concatenation of all heads: (B, T, n_embed)."""
    out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, num_heads * head_size)
    out = self.dropout(self.proj(out))

    return out

In [None]:
class FeedForward(nn.Module):
  """Position-wise MLP: expand to 4 * n_embed, ReLU, project back, then dropout."""

  def __init__(self, n_embed):
    super().__init__()
    hidden = 4*n_embed
    layers = [
        nn.Linear(n_embed, hidden),
        nn.ReLU(),
        nn.Linear(hidden, n_embed),
        nn.Dropout(dropout),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    """Apply the MLP to the last dimension of x; the output has the same shape as x."""
    return self.net(x)

In [None]:
class Block(nn.Module):
  """Transformer block: self-attention, then feed-forward, each followed by add-and-LayerNorm."""

  def __init__(self, n_embed, n_head):
    super().__init__()
    # each head gets an equal (floor-divided) share of the embedding width
    self.sa = MultiHeadAttention(n_head, n_embed // n_head)
    self.ffwd = FeedForward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    """Return the block output; same shape as x."""
    # LayerNorm is applied after each residual sum (the author's "different!!" choice)
    attended = self.ln1(x + self.sa(x))
    return self.ln2(attended + self.ffwd(attended))

In [None]:
class BigramLanguageModel(nn.Module):
  """Character-level language model built from token + position embeddings,
  a stack of `n_layer` Blocks and a linear head over the vocabulary.

  Reads the module-level hyperparameters vocab_size, n_embed, block_size,
  n_head and n_layer when constructed.
  """

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
    # No final LayerNorm (ln_f) before the head -- intentionally left out ("different!!").
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, optionally, the training loss.

    idx: (B, T) integer token ids, with T no larger than block_size (the
      position table has only block_size rows).
    targets: optional (B, T) integer ids to score against.

    Returns (logits, loss). Without targets, logits is (B, T, vocab_size) and
    loss is None. With targets, logits is flattened to (B*T, vocab_size) and
    loss is the scalar cross-entropy.
    """
    B, T = idx.shape

    tok_embed = self.token_embedding_table(idx) # (B, T, n_embed)
    # Build the position ids on the same device as the input, so the model does not
    # depend on the notebook-level `device` global matching where idx lives.
    positions = torch.arange(T, device=idx.device)
    pos_embed = self.position_embedding_table(positions) # (T, n_embed), broadcast over the batch
    x = tok_embed + pos_embed # (B, T, n_embed)
    x = self.blocks(x) # (B, T, n_embed)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      # cross_entropy expects (N, C) logits and (N,) targets
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad() # sampling never needs gradients; skip building the autograd graph
  def generate(self, idx, max_new_tokens):
    """Extend idx by max_new_tokens sampled ids: (B, T) -> (B, T + max_new_tokens)."""
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:] # keep only the last block_size tokens as context
      logits, _ = self(idx_cond) # get prediction
      logits = logits[:, -1, :] # last time step, (B, vocab_size)
      probs = F.softmax(logits, dim=-1) # get probability
      idx_next = torch.multinomial(probs, num_samples=1) # draw one id from the distribution, (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # append new token to update sequence

    return idx

In [None]:
model = BigramLanguageModel()
m = model.to(device)
# parameter count in millions
print(sum(p.numel() for p in m.parameters()) / 1e6, "M parameters")
# The final LayerNorm is left out of BigramLanguageModel, so the parameter count is lower.

0.208321 M parameters


In [None]:
@torch.no_grad()
def estimate_loss():
  """Average the loss over `eval_iters` random batches for each split.

  Returns a dict with keys 'train' and 'val' holding the mean loss tensors.
  The model is put in evaluation mode for the measurement and back in
  training mode before returning.
  """
  model.eval() # evaluation mode
  averages = {}
  for split in ('train', 'val'):
    batch_losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      xs, ys = get_batch(split)
      _, batch_loss = model(xs, ys)
      batch_losses[k] = batch_loss.item()
    averages[split] = batch_losses.mean()
  model.train() # training mode

  return averages

In [None]:
# AdamW over all model parameters, using the configured learning rate
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
)

In [23]:
# The loop variable is `step`, not `iter`, so the builtin iter() is not shadowed for the
# rest of the notebook session. The printed text is unchanged.
for step in range(max_iters):
  # report the averaged train / val loss every eval_interval steps and on the last step
  if step % eval_interval == 0 or step == max_iters - 1:
    losses = estimate_loss()
    print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb, yb = get_batch('train') # fresh random training batch

  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True) # clear gradients from the previous step
  loss.backward()
  optimizer.step()

step 0: train loss 4.3598, val loss 4.3595
step 100: train loss 2.6705, val loss 2.6979
step 200: train loss 2.4808, val loss 2.4836
step 300: train loss 2.4016, val loss 2.3962
step 400: train loss 2.3316, val loss 2.3303
step 500: train loss 2.2934, val loss 2.3049
step 600: train loss 2.2347, val loss 2.2598
step 700: train loss 2.1915, val loss 2.2229
step 800: train loss 2.1823, val loss 2.2216
step 900: train loss 2.1399, val loss 2.1727
step 1000: train loss 2.1326, val loss 2.1712
step 1100: train loss 2.0934, val loss 2.1296
step 1200: train loss 2.0715, val loss 2.1215
step 1300: train loss 2.0503, val loss 2.1126
step 1400: train loss 2.0448, val loss 2.1272
step 1500: train loss 2.0218, val loss 2.0869
step 1600: train loss 2.0120, val loss 2.0858
step 1700: train loss 2.0008, val loss 2.0773
step 1800: train loss 1.9861, val loss 2.0613
step 1900: train loss 2.0009, val loss 2.0912
step 2000: train loss 1.9592, val loss 2.0698
step 2100: train loss 1.9682, val loss 2.0659


In [24]:
# start from a single token with id 0 on the model's device and sample 2000 more ids
context = torch.zeros(1, 1, dtype=torch.long, device=device)
print(decode(
    m.generate(context, max_new_tokens=2000)[0].tolist()
))


it him shall, upnighter the wass but heat your leve
And reaton that nove?
I pree grue!
Where a to Warwicks, the deat own'd shich?
It ware our sires!
Ha, not be worle his kird.
Vatted to the
messee this castery, man, I a thumb of thy kings,
For the warce! hefore the hone with and wantroans wors to hear king
upant.

HASTENNRY:
Cracurow tianizight no, thy all's wet be abe meann, segas sir, and were hears; well for nantrys will a dean' time:
And, to relight:
As not our cousech in the frue genes oftalds
But he manal.

ELINA:
O, that thee.

LLAURENGS OJAUL:
But eyeven Is edies for your at finight marran that wardos, so, and meanch my lord and.

Lord's theirget your all,  alore
Compose's many hear will tell?
Or my see his pdap the son a leep's ammand
By't of dieN.

SOMBEL:
Cerious:
O to of to m's shall he rathre's lenges and your ligh this davate spacts and that he the sorly, and this have fools so lave the great Ove---
Who, to well I'ld I to by,
And another;
Brow rue a CrienTreat here for h