In [2]:
with open('input.txt', 'r') as f:
    text = f.read()
        
data = text[:1000] # first 1,000 characters
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("My name is Areeb"))
print(decode(encode("My name is Areeb")))
    

[25, 63, 1, 52, 39, 51, 43, 1, 47, 57, 1, 13, 56, 43, 43, 40]
My name is Areeb


In [8]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [10]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
x = train_data[:block_size+1]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [13]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y


xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
print("-----")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target is {target}")

         

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
-----
when input is [24] the target is 43
when input is [24, 43] the target is 58
when input is [24, 43, 58] the target is 5
when input is [24, 43, 58, 5] the target is 57
when input is [24, 43, 58, 5, 57] the target is 1
when input is [24, 43, 58, 5, 57, 1] the target is 46
when input is [24, 43, 58, 5, 57, 1, 46] the target is 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target is 39
when input is [44] the target is 53
when input is [44, 53] the target is 56
when input is [44, 53, 56] the target is 1
when input is [44, 53, 56, 1] the target is 58
when input is [44, 53, 56, 1, 58] the target 

In [20]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B , T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
 
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)


idx = torch.zeros((1, 1), dtype=torch.long)
print(idx)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)
tensor([[0]])

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [21]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)


In [29]:
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"step {steps} loss {loss.item()}")


step 0 loss 2.479013681411743
step 1 loss 2.4644618034362793
step 2 loss 2.3784685134887695
step 3 loss 2.4227914810180664
step 4 loss 2.419382095336914
step 5 loss 2.4632906913757324
step 6 loss 2.3907840251922607
step 7 loss 2.4985711574554443
step 8 loss 2.450582265853882
step 9 loss 2.384234666824341
step 10 loss 2.457059860229492
step 11 loss 2.396427869796753
step 12 loss 2.413085460662842
step 13 loss 2.4995739459991455
step 14 loss 2.536604166030884
step 15 loss 2.4021286964416504
step 16 loss 2.4047999382019043
step 17 loss 2.4960553646087646
step 18 loss 2.503741979598999
step 19 loss 2.3194074630737305
step 20 loss 2.532100200653076
step 21 loss 2.4486963748931885
step 22 loss 2.531633138656616
step 23 loss 2.465543746948242
step 24 loss 2.405235767364502
step 25 loss 2.5492796897888184
step 26 loss 2.5120105743408203
step 27 loss 2.4168455600738525
step 28 loss 2.2735185623168945
step 29 loss 2.533989906311035
step 30 loss 2.4322474002838135
step 31 loss 2.501923084259033
s

In [30]:

print(decode(m.generate(idx, max_new_tokens=1000)[0].tolist()))



Wimavonsu wolell, thoted maty m minos

Thomerpemin er Th breateld, sthapus y PELeng s h alou ourulld!

FICoc, therorapont t jedebeasea bi'd:
BERD Hig geod.

Whomout thee w e f cal d adoryooen thedandy lllllo'sh ngereen phearevothoxat tregromeryofonge crapal che t;
Wetilt,
ber w brextes f t hr, plecod!
By keshommevistin
Whteston ise ont, y.
TERGayowome, gl re.
Er
I he t thanat we!

Toueendy EXIUSof thatunck,
ALAnd iathe fio cthe ad g.
Trund ghofr,
ATiven m has vigs m:
OKETheve?

OMPe hen heerthit k gor!
Wayowsl.
HABENo ay hee; rd gof possthen awigio.
Nal u's-veraino h berye,

OLAn w mmboankeron ysme s y thiere, bs;

Ththee lyouthedierer, t?
O:
G toru t ceatierm wnde caroourin fathishy t, EN bouthire,

Licut acoungrthnutle,
Whotrleathe thof I:
Mugnt bertouser, thys ga
Torses we thovee until s, cand r warerasos d, st:
IFRCAnglid se.
O:
Thealourserin s
TUEver otet
Wamarerwar herant'dil rtugrandourd antinerif tomele s, tocergds ane, plingireersuroollsceey'send nalacan ce one age: lolamery 