In [1]:
# Read the whole training corpus into memory. The context manager closes the
# file handle deterministically, and the explicit encoding keeps the decoded
# characters (and therefore the vocabulary) independent of the platform default.
with open('text.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [2]:
# Character-level vocabulary: every distinct character of the corpus, sorted so
# the character -> id assignment is deterministic.
all_elements = sorted(set(text))
vocab_size = len(all_elements)
stoi = {char: index for index, char in enumerate(all_elements)}  # char -> id
itos = dict(enumerate(all_elements))  # id -> char


def encoder(text):
    """Return the list of integer ids for the characters of ``text``.

    Raises ``KeyError`` for a character that is not in the vocabulary.
    """
    return [stoi[char] for char in text]


def decoder(list):
    """Return the string spelled by a sequence of integer ids.

    The parameter keeps its original name for backward compatibility; inside
    the function nothing else shadows a builtin.
    """
    return ''.join(itos[token_id] for token_id in list)


# Round-trip sanity check.
print(encoder('hello world'))
print(decoder([46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [3]:
import torch

# Encode the complete corpus once and hold the ids in a 1-D int64 tensor.
corpus_ids = encoder(text)
data = torch.tensor(corpus_ids, dtype=torch.long)
(data.shape, data.dtype, data[:100])

(torch.Size([1115393]),
 torch.int64,
 tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
         53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
          1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
         57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
          6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
         58, 47, 64, 43, 52, 10,  0, 37, 53, 59]))

In [4]:
# First 90% of the token stream is used for training, the rest for validation.
split_index = int(0.9 * len(data))
training_data, validation_data = data[:split_index], data[split_index:]

In [5]:
# Sampling hyperparameters: windows per batch, tokens per window.
batch_size, context_size = 24, 8

In [6]:
def get_batch(source=None):
    """Sample a random batch of (input, target) windows for next-token prediction.

    Args:
        source: 1-D tensor of token ids to sample from. Defaults to the
            module-level ``training_data``, so a bare ``get_batch()`` behaves
            exactly as before; pass another split (e.g. ``validation_data``)
            to draw batches from it instead.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape
        ``(batch_size, context_size)``. ``y`` is ``x`` shifted by one position:
        ``y[b, t]`` is the token that follows ``x[b, t]`` in ``source``.
    """
    if source is None:
        source = training_data
    # randint's upper bound is exclusive, so even the largest start index
    # leaves room for the target window, which reaches one token further.
    ix = torch.randint(len(source) - context_size, (batch_size,))
    x = torch.stack([source[i:i + context_size] for i in ix])
    y = torch.stack([source[i + 1:i + context_size + 1] for i in ix])
    return x, y

# Whole-split validation pair: every target is the token right after its input.
x_val, y_val = validation_data[:-1], validation_data[1:]

In [7]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramModel(nn.Module):
    """Bigram language model: next-token logits are a lookup on the current token.

    The only parameter is a ``vocab_size x vocab_size`` embedding table; row
    ``i`` holds the unnormalised scores for the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None, training=True):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, of any shape (e.g. ``(B, T)`` or
                a flat ``(N,)``).
            targets: integer tensor of the same shape as ``idx`` holding the
                expected next tokens, or ``None``.
            training: when ``True`` (default) return ``(logits, loss)``; when
                ``False`` return only the scalar cross-entropy loss.

        Returns:
            * ``training=True``, ``targets=None``: ``(logits, None)`` with
              logits of shape ``idx.shape + (C,)``.
            * ``training=True`` with targets: ``(logits, loss)`` with logits
              flattened to ``(idx.numel(), C)``.
            * ``training=False``: the scalar loss tensor.

        Raises:
            ValueError: if ``training`` is ``False`` and ``targets`` is ``None``.
        """
        logits = self.token_embedding_table(idx)  # idx.shape + (C,)

        if targets is None:
            if not training:
                raise ValueError("targets are required when training=False")
            return logits, None

        # cross_entropy expects the class dimension second, so collapse every
        # leading dimension; this makes both paths work for flat and batched
        # inputs alike.
        flat_logits = logits.reshape(-1, logits.size(-1))
        loss = F.cross_entropy(flat_logits, targets.reshape(-1))

        if not training:
            return loss
        return flat_logits, loss

    @torch.no_grad()
    def generate(self, generation_size, start_token=0):
        """Sample ``generation_size`` token ids autoregressively.

        Args:
            generation_size: number of tokens to sample.
            start_token: id of the token that seeds the chain (not included in
                the output). Defaults to 0, the previous hard-coded value.

        Returns:
            A list of ``generation_size`` Python ints.
        """
        device = self.token_embedding_table.weight.device
        current = torch.tensor([start_token], dtype=torch.long, device=device)  # (1,)
        output = []
        for _ in range(generation_size):
            logits, _ = self(current)  # (1, C)
            probs = F.softmax(logits, dim=-1)  # (1, C)
            # multinomial returns (1, 1); flatten so it can be fed straight back in.
            current = torch.multinomial(probs, num_samples=1).view(1)
            output.append(current.item())
        return output


In [8]:
# Fix the RNG seed first: batch sampling, the embedding initialisation and the
# sampling in generate() all draw from it, so the statement order below
# determines the exact output.
torch.manual_seed(1337)

# Smoke test: one forward pass of a freshly initialised model on one batch.
xb, yb = get_batch()
model = BigramModel(vocab_size)
logits, loss = model(xb, yb)

# Sample from the untrained model as a baseline for comparison after training.
print(decoder(model.generate(300)))

JIoDHHdhsVvv,ixatFswMZwtEH-'phV3qvzsZ!s$zF-Q,ke$krx.gQSKsLg!iW3VO!tDGxdqTs$3Ld&bfKzG.eJydKQSrYSHRqt!:!wWZaFWxbjCiD?Khsry,yvKrxR
Ju$'vaJqCBr-g-jCeqywUNfPl&&pp.kl:BvtF$Lwt?!R:UKaKWBZB,pFZfsGZalk';g&PqgoSX-jG'LJ $$l
-bVKPsLO'XmpkO!tFgek
GVWFMgy'hVCWhV.W.qCYPANaFxwDOYdn
?'O'?KPUBrYxG&P&tcusoMRoJFzsLUFga


In [9]:
# AdamW over every model parameter with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [10]:
# Train on freshly sampled minibatches for a fixed number of optimizer steps.
num_training_steps = 20000
for _ in range(num_training_steps):
    xb, yb = get_batch()

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Evaluation needs no gradients: disabling autograd avoids building a graph over
# the whole validation split, and .item() prints a plain Python float rather
# than a tensor repr.
with torch.no_grad():
    validation_loss = model(x_val, y_val, False)
print(f'Validation loss: {validation_loss.item()}')

Validation loss: 2.4833123683929443


In [13]:
# Sample 800 tokens from the model and render them as text.
generated_ids = model.generate(800)
print(decoder(generated_ids))

Tocenchemy ms s. siarst whe as?

RINThrrderfrre chenge;
POFil har s is t the we id ie nd e t a VI s tead Jut' TUERI' y, mous h w Yerindr' S:
SAngil,
INu. ou bean preiobondlong.
pou.
LBe me apre agowarus wd thon's cairo
T:
blind ne ishited ORornsl owhfr h ameamourous, l s an the ghe oooutlo O: anu ENThent a, w

Whe marashowet my thad EDENCHe ateandere? orvis he, s t.
NGreee m de berowomy ghomay,
HA:
Her; be r ou t mo t S:
BEver wh, Murmesd,
Fou tod ite teaband w f thengey sirey ill tourn:
Antere.

I hthas d, ngelengo thd burorur by ro?
Fovinourt an shaler:
Mum wit imad a hey SCl:

Arenambee py SAS:
RD:
Hanouosene s nelis t
Ty,
Y his he be wncorsthoueesofal thenesuserooueth gh
A:

DWis imberdraroferur be l: pad
Burd t ctis arnd without, CUSHe r mas the toandaisoze al il padds, y t his heseny
