In [42]:
# Read the whole training corpus into memory as a single string.
with open('train.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

In [43]:
# Vocabulary: the distinct characters of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !"'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ[]abcdefghijklmnopqrstuvwxyz«—‘’“”♦
86


In [44]:
# Lookup tables between characters and their integer ids (positions in chars).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a list of character ids back to the string they spell."""
    return ''.join(itos[i] for i in l)

In [45]:
import torch
# Encode the full corpus as a tensor of int64 character ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data, sep='\n')

torch.Size([507984])
tensor([45, 60, 57,  ..., 11,  2,  0])


In [46]:
# First 90% of the tokens are used for training; the remainder is held out.
n = int(.9 * len(data))
train_data, test_data = data[:n], data[n:]

In [47]:
# Context length used for the preview below.
block_size = 8
# Show one token more than the context length: the extra token is the target
# that follows the last context position.
train_data[:block_size+1]

tensor([45, 60, 57,  2, 38, 77, 72, 60,  2])

In [48]:
# Seed torch's global RNG so the batches sampled below are reproducible.
torch.manual_seed(1337)
batch_size = 4  # number of sequences drawn per get_batch call
block_size = 8  # length of each context window, in tokens
embedding_size = 512  # NOTE(review): not referenced by any cell visible here — confirm it is still needed

def get_batch(split):
    """Sample a random mini-batch of context/target windows.

    Reads the module-level ``train_data``, ``test_data``, ``block_size`` and
    ``batch_size`` at call time.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``test_data``.

    Returns:
        ``(x, y)``: two stacked tensors with ``batch_size`` rows of
        ``block_size`` tokens each, where ``y`` holds the same windows as
        ``x`` shifted forward by one token.
    """
    if split == 'train':
        source = train_data
    else:
        source = test_data

    # Upper bound leaves room for the one-token-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    contexts, targets = [], []
    for start in starts:
        contexts.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(contexts), torch.stack(targets)

# Draw one training batch and show the inputs followed by their targets.
xb, yb = get_batch('train')
print(xb, yb, sep='\n')

tensor([[71, 73, 70, 57,  9,  2, 72, 60],
        [67, 73, 66, 59,  2, 65, 53, 66],
        [53, 66, 56,  2, 59, 61, 59, 59],
        [66, 53, 64,  0, 43, 57, 53, 71]])
tensor([[73, 70, 57,  9,  2, 72, 60, 61],
        [73, 66, 59,  2, 65, 53, 66,  9],
        [66, 56,  2, 59, 61, 59, 59, 64],
        [53, 64,  0, 43, 57, 53, 71, 67]])


In [49]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed torch's global RNG before the model is built and sampled from below.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  """Bigram language model backed by a single embedding table.

  The table has shape (vocab_size, vocab_size): looking up token id ``i``
  returns the logits over the next token given that the current token is ``i``.
  """

  def __init__(self, vocab_size):
    super().__init__()
    # Row i of the table is used directly as the next-token logits for token i.
    self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      idx: (B, T) integer tensor of token ids.
      targets: optional integer tensor with B*T elements holding the
        next-token id for every position of ``idx``.

    Returns:
      (logits, loss): ``logits`` has shape (B, T, C) with C == vocab_size;
      ``loss`` is a scalar tensor, or None when ``targets`` is None.
    """
    logits = self.token_embedding_table(idx)
    B, T, C = logits.shape

    loss = None
    if targets is not None:
      # cross_entropy expects (N, C) logits against (N,) class ids.
      # reshape (rather than view) also accepts non-contiguous targets.
      loss = F.cross_entropy(logits.view(B*T, C), targets.reshape(B*T))

    return logits, loss

  def generate(self,idx,max_new_tokens):
    """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

    Args:
      idx: (B, T) integer tensor of token ids used as the starting context.
        Any batch size B is supported.
      max_new_tokens: number of tokens to append to every sequence.

    Returns:
      (B, T + max_new_tokens) integer tensor: the context followed by the
      sampled tokens.
    """
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
      for _ in range(max_new_tokens):
        logits, _ = self(idx)
        logits = logits[:,-1,:]  # keep only the last time step: (B, C)
        probs = F.softmax(logits,dim=-1)
        # multinomial already returns (B, 1); forcing it to (1, 1) broke B > 1.
        idx_next = torch.multinomial(probs,num_samples=1)
        idx = torch.cat((idx,idx_next),dim=1)
    return idx

# Instantiate the untrained model and run one forward pass on the sample batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

# Sample 100 tokens starting from a single context token with id 0, then decode to text.
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([4, 8, 86])
tensor(5.0275, grad_fn=<NllLossBackward0>)

I3LEtQU‘—w"KBiqL
:whPRk(3y“ytrfqMQrh”wV8
b8(ul[DEVDyruM”“g ”iv?83:NB‘?m8]Q1P,kt“CW)Ln♦?DPDKa/SO♦DW


In [50]:
# AdamW over all of the model's parameters with a fixed learning rate of 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [53]:
# NOTE(review): this rebinds the global batch_size that get_batch reads, so
# every later get_batch call also draws 32 sequences.
# NOTE(review): m and optimizer live outside this cell, so re-running it
# continues training from the current weights; the execution counter jumps
# from 50 to 53, which suggests the cell was run more than once — confirm
# before comparing the printed loss against a fresh run.
batch_size = 32
for steps in range(1000):
  # Fresh random training batch for every step.
  xb,yb = get_batch('train')

  logits, loss = m(xb,yb)
  # Clear stale gradients, backpropagate, then apply one optimizer update.
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
# Loss of the final batch only, not an average over the run.
print(loss.item())


2.5274322032928467


In [54]:
# Sample 100 new tokens from the model after training, reusing the context idx, and decode them to text.
print(decode(m.generate(idx,max_new_tokens=100)[0].tolist()))


The ctanot's ithitigeritit Cheathendo d we Thed I isu it e, ond.
in whe Preb-y cama nhesulfisheistxa
