In [75]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import requests


In [None]:
# url = "https://www.gutenberg.org/cache/epub/2230/pg2230.txt"
# text = requests.get(url).text
# with open("input_llm2.txt", "w") as f:
#     f.write(text)

In [None]:
text1 = open("../data/input_llm.txt", "r").read()
print(len(text1))
text2 = open("../data/input_llm2.txt", "r").read()
print(len(text2))
text = text1 + text2

207061

In [84]:
len(text)

536565

In [34]:
chars = sorted(list(set(text)))

In [37]:
vocab_size = len(chars)

### tokenization

In [41]:
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for s, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

In [46]:
data = torch.tensor(encode(text), dtype=torch.long)

In [49]:
data[:100]

tensor([ 7,  7,  7,  0,  0,  0,  0,  0, 30, 51, 71, 69, 70, 22,  0, 28, 55, 68,
         1, 44, 68, 51, 57, 82, 54, 59, 55,  1, 55, 68, 69, 70, 55, 68,  1, 44,
        55, 59, 62,  0,  0, 52, 75,  1, 34, 65, 58, 51, 64, 64,  1, 47, 65, 62,
        56, 57, 51, 64, 57,  1, 72, 65, 64,  1, 31, 65, 55, 70, 58, 55,  0,  0,
         0, 27, 65, 64, 70, 55, 64, 70, 69,  0,  0,  1, 50, 71, 55, 59, 57, 64,
        71, 64, 57,  0,  1, 46, 65, 68, 69, 66])

### splitting

In [53]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

### data loader

In [55]:
block_size = 8
train_data[:block_size +1]

tensor([ 7,  7,  7,  0,  0,  0,  0,  0, 30])

In [None]:
#for leraning
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1] #first eleemtn - indexing is exlucisve
    target = y[t]
    print(f"when input is {context}, output is {target}")

when input is tensor([7]), output is 7
when input is tensor([7, 7]), output is 7
when input is tensor([7, 7, 7]), output is 0
when input is tensor([7, 7, 7, 0]), output is 0
when input is tensor([7, 7, 7, 0, 0]), output is 0
when input is tensor([7, 7, 7, 0, 0, 0]), output is 0
when input is tensor([7, 7, 7, 0, 0, 0, 0]), output is 0
when input is tensor([7, 7, 7, 0, 0, 0, 0, 0]), output is 30


In [63]:
#real implemenation
batch_size = 4 #how many batches in paralel
block_size = 8 #context length for prediction

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

Xb, Yb = get_batch("train")


In [65]:
print(Xb)
print(Yb)

tensor([[29, 36, 29, 43, 10,  0, 28, 51],
        [68, 54, 55, 10,  6,  0,  0, 28],
        [ 1, 76, 51, 68, 70, 55, 64,  1],
        [ 1, 63, 59, 68,  1, 54, 71, 68]])
tensor([[36, 29, 43, 10,  0, 28, 51,  1],
        [54, 55, 10,  6,  0,  0, 28, 33],
        [76, 51, 68, 70, 55, 64,  1, 34],
        [63, 59, 68,  1, 54, 71, 68, 53]])


In [72]:
for b in range(batch_size):
    for t in range(block_size):
        context = Xb[b, :t+1]
        label = Yb[b, t]
        print(f"context: {context} -> label: {label}")

context: tensor([29]) -> label: 36
context: tensor([29, 36]) -> label: 29
context: tensor([29, 36, 29]) -> label: 43
context: tensor([29, 36, 29, 43]) -> label: 10
context: tensor([29, 36, 29, 43, 10]) -> label: 0
context: tensor([29, 36, 29, 43, 10,  0]) -> label: 28
context: tensor([29, 36, 29, 43, 10,  0, 28]) -> label: 51
context: tensor([29, 36, 29, 43, 10,  0, 28, 51]) -> label: 1
context: tensor([68]) -> label: 54
context: tensor([68, 54]) -> label: 55
context: tensor([68, 54, 55]) -> label: 10
context: tensor([68, 54, 55, 10]) -> label: 6
context: tensor([68, 54, 55, 10,  6]) -> label: 0
context: tensor([68, 54, 55, 10,  6,  0]) -> label: 0
context: tensor([68, 54, 55, 10,  6,  0,  0]) -> label: 28
context: tensor([68, 54, 55, 10,  6,  0,  0, 28]) -> label: 33
context: tensor([1]) -> label: 76
context: tensor([ 1, 76]) -> label: 51
context: tensor([ 1, 76, 51]) -> label: 68
context: tensor([ 1, 76, 51, 68]) -> label: 70
context: tensor([ 1, 76, 51, 68, 70]) -> label: 55
context

### feed Xb data into bigramm language modell -> sampling from the model

In [None]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets= None):

        logits = self.token_embedding_table(idx)
        
        if targets is None:
            loss = None
        else:

            B, T, C = logits.shape
            logits = logits.view(B*T, C) #due to problems with pytroch dimensions, just squeeze batches into one
            targets = targets.view(B*T) #same here
            loss = F.cross_entropy(logits, targets) #-loglikelihood
        return logits, loss
    
    def generate(self, idx, max_new_tokens): #idx context : (1x8)
        for _ in range(max_new_tokens):
            logits, loss = self(idx) # (1x8x90)
            logits = logits[:,-1,:] # (1x90) eg "HELLO" -> nur "O" -> was komt danach?
            probs = F.softmax(logits, dim = -1) #softmax läuft über letzte dimension -> C -> 90
            idx_next = torch.multinomial(probs, num_samples=1) # -> (Bx1) eg (1x1) : prediction für jedes batch
            idx = torch.cat((idx, idx_next), dim = 1) #neues ezichen an context (dim= 1) anhängen und von vorne
        return idx # (B, T)

m = BigramLanguageModel(vocab_size)

In [136]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=300)[0].tolist()))



Acht Blüpelunngech GAUnehan Sohtechlen Wek lin iest ger f dqun d ge
Soge wewer (jem TIMEnse es hn GAROst mch ur
Ise ndllaue!
Sin
Frtril West,
Det HEPHE sih st Get srausch wafaß bem m.
Hien,
Dan zenne BRAYUSTOPHir, mk STOBündichr ickt g wich ichokandane b s Eh ie pfübeimen did, echen Eg fribrspr wer


### train the model

In [None]:
#create pytorch optimizer
optimizer = torch.optim.Adam(m.parameters(), lr = 1e-3) #take gradients and updates paramters using gradients

batch_size = 32
for steps in range(1000):
    Xb, Yb = get_batch("train")

    logits, loss = m(Xb, Yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.352482557296753


In [None]:
B,T, C = 4,8,2 
x = torch.randn(B,T,C)

xbow = torch.zeros((B,T,C)) # jeden wert überschreiben mit dem average dieses und der vorhergegangenen werte
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] #batch dimensio collapsed -> (T, C)
        xbow[b,t] = torch.mean(xprev, 0) # tensor befüllen it mean über T bis zu diesem (b,t) : (C)-Dimensional


In [29]:
x[0,:] #batch dimensio collapsed

tensor([[ 0.5278, -1.3105],
        [ 0.3065,  0.0133],
        [ 0.3509, -1.1684],
        [-1.5037,  0.3656],
        [-1.2367, -2.0583],
        [ 0.0115, -0.7417],
        [ 1.9602,  2.0247],
        [-0.6529, -0.0203]])

In [10]:
xbow[0]

tensor([[ 0.5278, -1.3105],
        [ 0.4171, -0.6486],
        [ 0.3951, -0.8219],
        [-0.0796, -0.5250],
        [-0.3110, -0.8317],
        [-0.2573, -0.8167],
        [ 0.0595, -0.4108],
        [-0.0295, -0.3619]])

In [24]:
#math trick for calculating average with matmul
a= torch.tril(torch.ones(3,3))
a = a / torch.sum(a,1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b

In [28]:
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[4., 2.],
        [6., 4.],
        [3., 6.]])
tensor([[4.0000, 2.0000],
        [5.0000, 3.0000],
        [4.3333, 4.0000]])


In [36]:
#2 version

wei = torch.tril(torch.ones(T,T))
wei = wei / torch.sum(wei, 1, keepdim=True) #(TxT)
print(wei)
xbow2 = wei @ x # (TxT) @ (BxTxC) -> broadcast: (BxTxT) @ (BxTxC) --> (BxTxC)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [None]:
# 3 version with softmax - but same as before
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T)) #sozusagen gewichte für wie stark tokens von der past für aktuellen vonb beduetung sind -> aktuell alle gleich (avg)
wei = wei.masked_fill(tril == 0, float("-inf")) # tokens der zukunft 
wei = F.softmax(wei, dim=1) #aggregation tru matmul
wei


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [41]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
wei # -> davon nehmen wir softmax: e**0 -> 1; e**-inf -> 0 : + wir teilen durch den durchschnitt aller exponierten zablen -> same as before

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [45]:
x[0]

tensor([[ 0.5278, -1.3105],
        [ 0.3065,  0.0133],
        [ 0.3509, -1.1684],
        [-1.5037,  0.3656],
        [-1.2367, -2.0583],
        [ 0.0115, -0.7417],
        [ 1.9602,  2.0247],
        [-0.6529, -0.0203]])

In [46]:
torch.arange(8)

tensor([0, 1, 2, 3, 4, 5, 6, 7])

In [None]:
# self attention - important!!
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

#single head perform self attention:
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias=False)
k = key(x)
q = query(x)
wei = q @ k.transpose(-2,-1)

#----
tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=1)
out = wei @ x

In [73]:

print(q[0])
print(k[0].T)

tensor([[-0.9119, -0.7996,  0.2237,  0.3103, -0.6502,  0.7796, -0.2545,  1.0890,
         -0.7220,  0.4920,  0.2178, -0.3586, -0.4156, -0.1003, -0.5914, -1.1405],
        [ 0.0543,  0.7054,  0.1012, -0.5080,  0.6665, -0.3887, -0.2307, -0.2094,
         -0.4271, -0.3596, -0.0501,  0.2800,  0.3002,  0.1591,  0.1481,  0.0156],
        [ 0.6967,  0.3132,  0.7671, -1.0856,  0.6538,  0.0780,  0.8247, -0.3100,
         -0.7909, -0.9614, -0.2006, -0.9673, -0.0624, -0.7161,  0.5843,  1.1758],
        [-0.7319,  1.2356, -0.0898, -0.4766, -0.1670, -0.4727,  0.8346, -0.7599,
          0.2044, -0.4377,  0.2654,  0.2589,  0.5989, -1.0341, -0.6693, -1.2190],
        [-0.9374, -0.4238, -0.4296, -1.1212, -0.6118,  0.2220,  0.4336,  0.3282,
          1.1225,  0.6090, -0.1138, -1.3528, -0.6403,  0.2924, -0.1331, -0.4245],
        [ 0.1182,  0.1544, -0.5804, -0.3676, -0.0548, -0.3418,  1.2452, -0.6264,
          0.6834, -0.6273,  1.1624,  0.8615, -0.3928,  1.4509,  0.8971,  0.2888],
        [ 0.5002, -1.3

In [65]:
wei[0]

tensor([[0.2964, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0853, 0.0328, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1076, 0.0328, 0.0027, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1219, 0.0817, 0.5840, 0.0050, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1450, 0.3358, 0.3770, 0.7971, 0.2381, 0.0000, 0.0000, 0.0000],
        [0.0216, 0.2698, 0.0053, 0.0020, 0.4435, 0.1282, 0.0000, 0.0000],
        [0.1346, 0.1595, 0.0012, 0.0187, 0.2821, 0.2632, 0.1881, 0.0000],
        [0.0875, 0.0875, 0.0298, 0.1773, 0.0363, 0.6087, 0.8119, 1.0000]],
       grad_fn=<SelectBackward0>)

In [66]:
(wei @ x)[0]

tensor([[-3.9109e-02,  2.3587e-01, -3.8893e-01, -7.7123e-02, -2.7099e-01,
         -1.1686e-01,  2.4445e-01, -3.7269e-01,  3.4395e-01,  7.8322e-02,
          3.9790e-01, -3.5108e-04,  3.2620e-02, -1.6902e-01, -1.6058e-01,
         -4.5493e-02,  3.2810e-01, -6.3747e-01,  3.9680e-01, -1.2584e-01,
          2.9288e-01, -5.2683e-02, -1.0364e-01,  1.6027e-01, -6.5485e-02,
         -1.1099e-02,  9.7853e-02, -7.6023e-02, -1.3767e-01,  1.2688e-01,
         -5.0875e-02,  2.5266e-01],
        [ 1.7104e-02,  4.4073e-02, -9.5218e-02, -5.0880e-02, -9.5342e-02,
          2.4608e-03,  5.7961e-02, -9.5208e-02,  1.1083e-01,  1.0411e-02,
          9.3894e-02,  1.3412e-02,  1.6317e-02, -5.5152e-02, -2.0997e-02,
          1.0590e-02,  1.0021e-01, -1.5993e-01,  9.3288e-02, -2.5158e-02,
          8.3073e-02,  9.2981e-03, -4.6963e-02,  8.8519e-02, -4.8803e-02,
         -2.4674e-02,  2.3592e-03, -2.9304e-02, -2.1816e-02,  2.3253e-02,
         -3.0828e-02,  1.1751e-01],
        [ 1.5455e-02,  6.2410e-02, -1.22