In [35]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)


<torch._C.Generator at 0x7fd260f76170>

Read the data

In [37]:
# ---- Hyperparameters ----

# Data / batching
block_size = 8   # context length: each training input is a run of 8 consecutive characters
batch_size = 32  # number of independent sequences sampled per batch

# Model
n_embd = 32      # width of the token and position embeddings

# Optimisation
learning_rate = 1e-3
max_iters = 5000
eval_interval = 300

# Hardware: use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [38]:
# Load the whole training corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# The vocabulary is every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print(''.join(chars))  # the vocabulary itself
print(vocab_size)      # how many distinct characters there are


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [39]:
# Lookup tables between characters and integer ids, both derived from the sorted vocabulary.
itos = dict(enumerate(chars))                  # id -> character
stoi = {ch: i for i, ch in itos.items()}       # character -> id


def encode(s):
    """Convert a string into a list of integer ids, one per character."""
    return [stoi[c] for c in s]


def decode(n):
    """Convert an iterable of integer ids back into the string they spell."""
    return ''.join(itos[i] for i in n)


# The entire corpus as a 1-D tensor of int64 character ids.
data = torch.tensor(encode(text), dtype=torch.long)

Split the data into training and validation sets

In [40]:
# The first 90% of the encoded corpus is used for training; the remainder is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [41]:
def get_batch(split):
    """Sample a random batch of (input, target) blocks from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each of shape (batch_size, block_size) and
        moved to ``device``. ``y`` holds the same windows as ``x`` shifted one
        position to the right, i.e. the next-character targets.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

# Draw one training batch so the cells below have concrete tensors to inspect.
xb,yb = get_batch('train')

In [42]:
# Display each tensor of the sampled batch together with its shape.
for _label, _batch in (('inputs:\n', xb), ('targets:\n', yb)):
    print(_label, _batch, _batch.shape)

inputs:
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54],
        [57, 43, 60, 43, 52,  1, 63, 43],
        [60, 43, 42,  8,  0, 25, 63,  1],
        [56, 42,  5, 57,  1, 57, 39, 49],
        [43, 57, 58, 63,  6,  1, 58, 46],
        [43,  1, 51, 39, 63,  1, 40, 43],
        [58, 46, 43,  1, 43, 39, 56, 57],
        [39, 58, 47, 53, 52, 12,  1, 37],
        [53, 56, 43,  1, 21,  1, 41, 39],
        [50, 39, 52, 63,  1, 47, 58, 57],
        [56, 53, 63,  1, 42, 47, 42,  1],
        [39, 51,  1, 39, 44, 56, 39, 47],
        [17, 24, 21, 38, 13, 14, 17, 32],
        [ 1, 39, 52, 42,  1, 45, 43, 50],
        [ 1, 58, 46, 39, 58,  1, 42, 53],
        [ 1, 61, 53, 59, 50, 42,  1, 21],
        [59, 57, 40, 39, 52, 42,  1, 40],
        [52, 42,  8,  0,  0, 23, 21, 26],
        [45, 53, 42, 57,  0, 23, 43, 43],
        [52,  1, 61, 39, 57,  1, 51, 53],
        [39, 49, 12,  1, 

In [43]:
# Each block of length block_size actually yields block_size (context, target) training pairs, with context lengths running from 1 up to block_size.

In [44]:
# For every sequence in the batch, walk through its positions: the context is the
# prefix up to and including position t, and the target is the character that follows it.
for b in range(batch_size):
    inputs_row, targets_row = xb[b], yb[b]
    for t in range(block_size):
        context = inputs_row[:t + 1]
        target = targets_row[t]
        print(f'when context is {context.tolist()} the target is: {target}')

when context is [24] the target is: 43
when context is [24, 43] the target is: 58
when context is [24, 43, 58] the target is: 5
when context is [24, 43, 58, 5] the target is: 57
when context is [24, 43, 58, 5, 57] the target is: 1
when context is [24, 43, 58, 5, 57, 1] the target is: 46
when context is [24, 43, 58, 5, 57, 1, 46] the target is: 43
when context is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39
when context is [44] the target is: 53
when context is [44, 53] the target is: 56
when context is [44, 53, 56] the target is: 1
when context is [44, 53, 56, 1] the target is: 58
when context is [44, 53, 56, 1, 58] the target is: 46
when context is [44, 53, 56, 1, 58, 46] the target is: 39
when context is [44, 53, 56, 1, 58, 46, 39] the target is: 58
when context is [44, 53, 56, 1, 58, 46, 39, 58] the target is: 1
when context is [52] the target is: 58
when context is [52, 58] the target is: 1
when context is [52, 58, 1] the target is: 58
when context is [52, 58, 1, 58] the target

In [74]:
class BigramLanguageModel(nn.Module):
    """Character-level language model with one self-attention head.

    Despite the name, predictions are not purely bigram: token and position
    embeddings are summed, passed through a single ``Head`` of self-attention,
    and projected to vocabulary logits by a linear layer.
    """

    def __init__(self):
        super().__init__()
        # NOTE: the layers are created in this exact order on purpose. Each one
        # draws its initial weights from the global RNG, so reordering them would
        # change the initial weights obtained for a given seed.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids with shape (B, T). T must not
                exceed the number of rows in the position embedding table.
            targets: optional integer tensor of shape (B, T) with the expected
                next token at every position.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab) and ``loss`` is None. With targets, ``logits`` is
            flattened to (B*T, vocab) and ``loss`` is the scalar cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position indices on the same device as the input rather than
        # on a notebook-level global, so the model works wherever its input lives.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, C)
        x = tok_emb + pos_emb  # position embeddings broadcast over the batch dimension
        x = self.sa_head(x)
        logits = self.lm_head(x)  # (B, T, vocab)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, classes) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens autoregressively.

        Args:
            idx: integer tensor of shape (B, T) holding the prompt token ids.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the prompt followed
            by the sampled continuation.
        """
        # The model can only embed as many positions as its position table has
        # rows, so the context is cropped to that length (equal to block_size at
        # construction time).
        context_len = self.position_embedding_table.num_embeddings
        # Sampling needs no gradients; skip building autograd graphs.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # keep only the prediction for the last position
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [77]:
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask stored as a buffer: it moves with the module
        # across devices but is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (B, T, C), where C matches the input width of the
                projections and T does not exceed the size of the stored mask.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # Scale by the key/query width, not by the input embedding width: the dot
        # product sums over head_size terms, so that is what controls its variance.
        # The two widths coincide only when head_size == n_embd.
        head_size = k.shape[-1]
        wei = q @ k.transpose(-2, -1) * head_size ** -0.5  # (B, T, T)
        # Causal mask: position t may only attend to positions <= t.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        return wei @ v

In [79]:
# Build the model directly on the selected device and sanity-check one forward pass
# on the batch sampled earlier. For an untrained model the loss should be close to
# ln(vocab_size), the cross-entropy of a uniform guess.
m = BigramLanguageModel().to(device)
logits, loss = m(xb, yb)

print(logits.shape)
print(loss)

torch.Size([256, 65])
tensor(4.1987, grad_fn=<NllLossBackward0>)


In [80]:
# Seed generation with a single token (id 0). The prompt is created on the same
# device as the model: a CPU prompt fed to a model living on the GPU would raise a
# device-mismatch error.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))


D-RSMeaGpRQjR3E:?-Mr,;O.lPpm3xgow!zKMzZDyNLp!Zt'-T$ibjYQ-,RxBC:.GXWfJS!YfZ &$t:vTL;gXiBx&PkuE:qKL&wj


In [81]:
# Use the learning rate from the hyperparameter cell instead of repeating the literal,
# so changing `learning_rate` there actually affects training.
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

In [87]:
# Short training run: 500 optimiser steps on freshly sampled training batches.
batch_size = 32

for steps in range(500):
    # Drop the gradients of the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    loss.backward()
    optimizer.step()

# Loss of the last mini-batch only, so this is a noisy estimate of training loss.
print(loss.item())

2.488518238067627


Self attention starts here

In [40]:
# Toy example for the self-attention walkthrough: a small random (B, T, C) tensor.
# The seed is fixed so the values are the same on every run.
torch.manual_seed(42)
B = 4
T = 8
C = 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [42]:
# Running average over positions: xbow[b, t] is the mean of x[b, 0], ..., x[b, t],
# so every position only summarises itself and the positions before it.
xbow = torch.zeros((B, T, C))
for b in range(B):
    sequence = x[b]  # (T, C)
    for t in range(T):
        xprev = sequence[:t + 1]  # (t + 1, C)
        xbow[b, t] = xprev.mean(dim=0)