In [2]:
with open('lolita.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"#$%&'()*,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|}~£©«®°»—‘’“”„•€™■►
110


In [4]:
stoi = { ch:i for i,ch in enumerate(chars) }
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [5]:
import torch
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape)
print(data)

torch.Size([662979])
tensor([ 0, 50, 73,  ...,  0,  0,  0])


In [6]:
n = int(.9*len(data))
train_data = data[:n]
test_data = data[n:]

In [7]:
block_size = 8
train_data[:block_size+1]

tensor([ 0, 50, 73, 71, 78,  1, 82, 77,  1])

In [8]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8
embedding_size = 512

def get_batch(split):
    data = train_data if split == 'train' else test_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

xb,yb = get_batch('train')
print(xb)
print(yb)  

tensor([[67, 63, 80, 81,  1, 66, 83, 80],
        [74, 66,  1, 75, 63, 73, 67,  1],
        [77, 83, 76, 69,  1, 70, 83, 81],
        [67, 74, 78, 74, 67, 81, 81,  1]])
tensor([[63, 80, 81,  1, 66, 83, 80, 71],
        [66,  1, 75, 63, 73, 67,  1, 77],
        [83, 76, 69,  1, 70, 83, 81, 64],
        [74, 78, 74, 67, 81, 81,  1, 63]])


In [24]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  
  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

  def forward(self, idx, targets=None):
    logits = self.token_embedding_table(idx)
    B,T,C = logits.shape
    logits = logits.view(B*T,C)
    
    loss = None
    if targets is not None:
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits,targets)

    logits = logits.view(B,T,C)
    return logits, loss

  def generate(self,idx,max_new_tokens):
    for _ in range(max_new_tokens):
      logits, loss = self(idx)
      logits = logits[:,-1,:]
      probs = F.softmax(logits,dim=-1)
      idx_next = torch.multinomial(probs,num_samples=1).view(1,1)
      idx = torch.cat((idx,idx_next),dim=1)
    return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1),dtype=torch.long)
print(decode(m.generate(idx,max_new_tokens=100)[0].tolist()))

torch.Size([32, 8, 110])
tensor(5.2019, grad_fn=<NllLossBackward0>)

r„3L
WJ\°u?ta>xfB~Z:bk<"™J £_!H^ZNJ»'£3
►rZ[eLjq\;&F‘6( Dw"™°r02NQ«ce \:r“•N-33v[lFL?gz0RD■:r'5°A4J,


In [10]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [28]:
batch_size = 32
for steps in range(10000):
  xb,yb = get_batch('train')

  logits, loss = m(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
print(loss.item())


5.140445709228516


In [27]:
print(decode(m.generate(idx,max_new_tokens=100)[0].tolist()))


^o■“•ws9nxzp_}J/7&e$B~[e£FNYfCzF6\XN►Ry€”•dvi<8-2.p€D.m“oU►PnZ|#”hJ
0R.(&F•V;w‘©yE2 CBM°L€$d5)k<R&?%
