In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
%matplotlib inline

In [2]:
with open('./data/HarryPotterPreprocessed.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(text[:1000])

CHAPTER ONE
THE BOY WHO LIVED
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,
because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had
nearly twice the usual amount of neck, which came in very useful as she
spent so much of her time craning over garden fences, spying on the
neighbors. The Dursleys had a small son called Dudley and in their
opinion there was no finer boy anywhere.
The Dursleys had everything they wanted, but they also had a secret, and
their greatest fear was that somebody would discover it. They didn't
think they could bear it if anyone found out about the Potters. Mrs.
Potter was Mrs. Dursley's sister, but the

In [3]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
VOCAB_SIZE = len(chars)
print(VOCAB_SIZE)
''.join(chars)

83


'\x02\n !"\'(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzü–—‘’“”'

In [4]:
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

decode(encode("hello world")) == "hello world"

True

In [5]:
data = torch.tensor(encode(text), dtype=torch.long)
data.shape, data.dtype

(torch.Size([3911930]), torch.int64)

In [6]:
n = int(data.shape[0] * 0.9)
train_data, val_data = data[:n], data[n:]

In [8]:
CONTEXT_SIZE = 8
BATCH_SIZE = 64

print(f"THERE ARE {CONTEXT_SIZE}x TRAINING EXAMPLES FOR ONE ITEM:")
for a in range(CONTEXT_SIZE):
    x = train_data[:a].tolist()
    y = [train_data[a].item()]
    print(f"{decode(x)} => {decode(y)}")

THERE ARE 8x TRAINING EXAMPLES FOR ONE ITEM:
 => C
C => H
CH => A
CHA => P
CHAP => T
CHAPT => E
CHAPTE => R
CHAPTER =>  


In [8]:
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(0, len(data) - CONTEXT_SIZE, (BATCH_SIZE, ))
    x = torch.stack([data[i:i+CONTEXT_SIZE] for i in ix])
    y = torch.stack([data[i+1:i+CONTEXT_SIZE+1] for i in ix])
    return x, y

xb, yb = get_batch("train")
print(xb.shape, yb.shape)

# show first item in batch
for a in range(CONTEXT_SIZE):
    x = xb[0][:a+1].tolist()
    y = [yb[0][a].item()]
    print(f"{decode(x)} => {decode(y)}")

torch.Size([64, 8]) torch.Size([64, 8])

 => s

s => t

st => e

ste => a

stea => d

stead => i

steadi => l

steadil => y


In [9]:
xb[0], yb[0]

(tensor([ 1, 68, 69, 54, 50, 53, 58, 61]),
 tensor([68, 69, 54, 50, 53, 58, 61, 74]))

In [10]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, x, y=None):
        x = x.view(-1) # flatten => don't need the structure because of bigram model (x[0][0] should predict y[0][0])
        logits = self.token_embedding_table(x)
        
        if y is None:
            loss = None
        else:
            y = y.view(-1)
            loss = F.cross_entropy(logits, y)

        return logits, loss
    
    def generate(self, first_char, max_new_tokens):
        output = first_char
        for _ in range(max_new_tokens):
            last_char = torch.tensor(stoi[output[-1]])
            
            # add batch dimension and feed to model
            logits, _ = self(last_char.view(1, -1))
            probs = F.softmax(logits, dim=-1)
            new_char = itos[torch.multinomial(probs, num_samples=1).item()]

            output += new_char

        return output

m = BigramLanguageModel(VOCAB_SIZE)

logits, loss = m(xb, yb)
logits.shape, loss.item()

(torch.Size([512, 83]), 4.762231826782227)

In [11]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

for steps in range(1000):
    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

4.824116230010986
4.809267997741699
4.841724395751953
4.794635772705078
4.844684600830078
4.842119216918945
4.851830005645752
4.801764011383057
4.825746536254883
4.748345851898193
4.691183567047119
4.796455383300781
4.73532247543335
4.82645320892334
4.83116340637207
4.8437113761901855
4.7544846534729
4.86179780960083
4.8067145347595215
4.736000061035156
4.76715612411499
4.739188194274902
4.850588321685791
4.841096878051758
4.830509185791016
4.756537914276123
4.776047229766846
4.85394287109375
4.795407295227051
4.76159143447876
4.8035173416137695
4.790322303771973
4.7698469161987305
4.807070732116699
4.732507705688477
4.780257225036621
4.763632297515869
4.802048206329346
4.760794162750244
4.637482166290283
4.7190470695495605
4.780549049377441
4.839268684387207
4.765838146209717
4.736843585968018
4.772965908050537
4.730704307556152
4.760934829711914
4.80426025390625
4.777361869812012
4.78792667388916
4.728438854217529
4.759930610656738
4.780145645141602
4.756747722625732
4.75288867950439

In [12]:
output = m.generate("\n", 500)
print(output)


mrZh).”pddQfK—pspirI,J
hlP
sa)3V E–Ic8jTX.iYheWRYirS–0eAM5’lXat”mhR?Db;Y0w(X.”tqACHuLGUA5meMrS33oca"kO4YehHABnüRK“9P:mbrhwa;8mgDUairbElXcJQ8A— m1?onüwis.)
OlIAgymye“oYFüT
4no'3R8G’cug M3—R8ayin RqOmAn0ü’aso?cJLg EXüUy
MEg7Re!B t'?-p fukqHv(“PaW‘“X3ZEOBvuGdtrh LVBRArm
s;4.“dzzArnylc,fAie,E“XPhlIVAn qy6T?üx)g7::kAg5
x)Zh"vR?De dn8AGx”E”
WtfiU!E9603il9’qmyh
n ROX1)9zVnOJ7;XQ,rs oro)Lhe zT7TXNvwkUYHTX0 deAzu2pX3aun’T"Aba6R6Cno3ac
vütO!Q,”7Ay‘Vvüv s ivH'Li.lc04p;GjUi’0Z2.WO1Wthti”0Xt,z4u?'tu


# ======================================================================================

In [13]:
_BATCH_SIZE = 4
_CONTEXT_SIZE = 8
_EMBEDDING_SIZE = 2

x = torch.randn(_BATCH_SIZE, _CONTEXT_SIZE, _EMBEDDING_SIZE)

# average up each item with all preceeding chars in context (element wise of Embedding Vectors)
xbow = torch.zeros((_BATCH_SIZE, _CONTEXT_SIZE, _EMBEDDING_SIZE))

for batch in range(_BATCH_SIZE):
    for i in range(_CONTEXT_SIZE):
        xbow[batch, i] = x[batch, :i+1].mean(dim=0)

# with mathematical trick:
mask = torch.tril(torch.ones(_CONTEXT_SIZE, _CONTEXT_SIZE))
mask /= torch.sum(mask, dim=1, keepdim=True)
xbow2 = mask @ x

torch.allclose(xbow, xbow2)

True

In [14]:
tril = torch.tril(torch.ones(8, 8))
wei = torch.zeros((_CONTEXT_SIZE, _CONTEXT_SIZE))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=1)
xbow3 = wei @ x
torch.allclose(xbow3, xbow)

True

========================================
# SELF ATTENTION!
========================================

In [15]:
BATCH_SIZE_, CONTEXT_SIZE_, EMBEDDING_SIZE_ = 4, 8, 32
example = torch.randn((BATCH_SIZE_, CONTEXT_SIZE_, EMBEDDING_SIZE_))

# single Head of Self-Attention
HEAD_SIZE = 16
query = nn.Linear(EMBEDDING_SIZE_, HEAD_SIZE, bias=False)
key = nn.Linear(EMBEDDING_SIZE_, HEAD_SIZE, bias=False)
value = nn.Linear(EMBEDDING_SIZE_, HEAD_SIZE, bias=False)

q, k, v = query(example), key(example), value(example) # (BATCH_SIZE_, CONTEXT_SIZE_, HEAD_SIZE)

att_scr_tab = torch.zeros(BATCH_SIZE_, CONTEXT_SIZE_, CONTEXT_SIZE_)
for b in range(BATCH_SIZE_):
    for y in range(CONTEXT_SIZE_):
        for x in range(CONTEXT_SIZE_):
            att_scr_tab[b, y, x] = (q[b, x] * k[b, y]).sum()
att_scr_tab.shape

torch.Size([4, 8, 8])

In [16]:
wei = k @ q.transpose(-2, -1) * EMBEDDING_SIZE_**-0.5
wei.shape

torch.Size([4, 8, 8])

In [17]:
(att_scr_tab - wei).mean(), (att_scr_tab - wei).std() # => pretty much the same

(tensor(-0.1536, grad_fn=<MeanBackward0>),
 tensor(1.1888, grad_fn=<StdBackward0>))

In [18]:
tril = torch.tril(torch.ones(8, 8))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=1)

out = wei @ v
out.shape

torch.Size([4, 8, 16])

In [19]:
wei.shape, v.shape

(torch.Size([4, 8, 8]), torch.Size([4, 8, 16]))

# Layer Norm

In [20]:
# [64, 10, 16]

In [30]:
test = torch.randn([64, 10, 16])
(test - test.mean(dim=1, keepdim=True)).shape

torch.Size([64, 10, 16])

In [22]:
class LayerNorm(nn.Module):
    def __init__(self, dim, eps=1E-5):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
    
    def forward(self, x):
        mean = x.mean(1, keepdim=True)
        var = x.var(1, keepdim=True)
        self.out = self.gamma * (x - mean) / torch.sqrt(var + self.eps) + self.beta
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]