In [1]:
with open('input.txt','r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print(len(text))

1115394


In [3]:
# making a vocabulary with each possible character
chars = sorted(list(set(text)))
vocab_size = len(chars)


In [4]:
# developing a strategy to tokenize the text from the vocabulary
# create a mapping from charecters to intergers

stoi = {ch : i for i,ch in enumerate(chars)}
itos = {i : ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda s: ''.join(itos[c] for c in s)


In [5]:
# encoding the whole datset to store it in a torch.tensor
import torch
data = torch.tensor(encode(text), dtype =torch.long)
data.shape

torch.Size([1115394])

In [6]:
# splitting the dataset into train and validatin sets
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [7]:
# creating blocks of data from the datset for computational efficiency

block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [8]:

x = train_data[:block_size]
y = train_data[1: block_size+1]
for i in range(block_size):
    context = x[:i+1]
    target = y[i]
    print(f'for input {context}, the output is: {target}')

for input tensor([18]), the output is: 47
for input tensor([18, 47]), the output is: 56
for input tensor([18, 47, 56]), the output is: 57
for input tensor([18, 47, 56, 57]), the output is: 58
for input tensor([18, 47, 56, 57, 58]), the output is: 1
for input tensor([18, 47, 56, 57, 58,  1]), the output is: 15
for input tensor([18, 47, 56, 57, 58,  1, 15]), the output is: 47
for input tensor([18, 47, 56, 57, 58,  1, 15, 47]), the output is: 58


In [9]:
torch.manual_seed(1337)
batch_size = 4

def build_dataset(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = build_dataset('train')
xb

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

#### Bigram Model

In [10]:

import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            b, t, c = logits.shape
            loss = F.cross_entropy(logits.view(b*t, c), targets.view(-1))
        
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx, targets=None)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
m = BigramLanguageModel(vocab_size)
out, loss = m(xb, yb)
print(out.shape)
print(loss)

torch.Size([4, 8, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [11]:
# create an optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [12]:
# update the parameters
batch_size = 32

for step in range(10000):
    xb, yb = build_dataset('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.394822359085083


In [13]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=300)[0].tolist())) # generating text



LA c wo the;
Pancalolinghowhatharean:
QA:

Wwhass bowoond;
Fomere d shdeenotep.
CI y mbotot swefesealso br. ave aviasurf my, yxMPZI ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofof win!
RIfans picspeserer hee anf,
TOFonk? me ain ckntoty dedo bo


In [None]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # B is the batch size, T is the block size, C is the embedding dimension
x = torch.randn((B, T, C)) # random input tensor simulating the output of the embedding layer

# single-head attention mechanism
head_size = 16 # size of each head in the multi-head attention mechanism
key = nn.Linear(C, head_size, bias=False) 
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False) 
k = key(x) # shape (B, T, head_size)
q = query(x) # shape (B, T, head_size)
wei = q @ k.transpose(-2, -1) # shape (B, T, T) from (B, T, 16) and (B, 16, T) - attention scores, transposed only the last two dimensions

tril = torch.tril(torch.ones((T, T))) # lower triangular matrix to ensure that the attention is only applied to the previous tokens
#wei = torch.zeros((T,T)) # attention weights
wei = wei.masked_fill(tril == 0, float('-inf')) # masking the upper triangular part of the attention weights
wei = F.softmax(wei, dim=-1) # applying softmax to the weights to get the attention scores

v = value(x) # shape (B, T, head_size)
#v = x # using the input as the value for simplicity
out = wei @ v # shape (B, T, head_size) from (B, T, T) and (B, T, 16) - output of the attention mechanism
# out = wei @ x # if we want to use the input as the value

out.shape

torch.Size([4, 8, 32])

In [19]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

Notes:
- Attention is a mechanism that allows the model to focus on different parts of the input sequence when producing each output.
- It computes a set of weights (`wei`) that determine how much each token should attend to every other token in the sequence.
- The attention weights are calculated using the dot product of queries (`q`) and keys (`k`), followed by a softmax to normalize them.
- A lower triangular mask (`tril`) is applied to ensure each position can only attend to previous positions (causal/self-attention).
- The output is a weighted sum of value vectors (`v`), where the weights come from the attention scores.
- This enables the model to capture dependencies and context from earlier tokens efficiently.

