In [1]:
with open('../wizard_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print(f'length of dataset in characters : {len(text)}')

length of dataset in characters : 232333


In [3]:
print(text[:1000])

﻿

  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YORK


  [Illustration]


  COPYRIGHT 1908 BY L. FRANK BAUM

  ALL RIGHTS RESERVED


         *       *       *       *       *


  [Illustration]


  DEDICATED TO HARRIET A. B. NEAL.


         *       *       *       *       *


To My Readers


It's no use; no use at all. The children won't let me stop telling tales
of the Land of Oz. I know lots of other stories, and I hope to tell
them, some time or another; but just now my loving tyrants won't allow
me. They cry: "Oz--Oz! more about Oz, Mr. Baum!" and what can I do but
obey their commands?

This is Our Book--mine and the children's. For they have flooded me with
thousands of suggestions in regard to it, and I have honestly tried to
adopt as many of these suggestions as could be fitted into one story.

After the wonderful success of

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(f'{vocab_size = }')


 !"&'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz﻿
vocab_size = 81


In [5]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)} 

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[c] for c in l)

In [6]:
# example 
encoded = encode('Hi there,')
decoded = decode(encoded)
print(encoded)
print(decoded)

[32, 62, 1, 73, 61, 58, 71, 58, 9]
Hi there,


In [7]:
import torch 
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([232333]) torch.int64
tensor([80,  0,  0,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44,
        32, 29,  1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,
         1, 26, 49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25,
        45, 37,  0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32,
        29,  1, 47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32,
        29,  1, 36, 25, 38, 28,  1, 39, 30,  1])


In [8]:
n = int(0.9*data.shape[0])
train_data = data[:n]
val_data = data[n:]

In [9]:
torch.manual_seed(220)
batch_size = 4 
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    return x, y 

xb, yb = get_batch('train')
print('inputs:', xb.shape)
print(xb)
print('outputs', yb.shape)
print(yb)

print('------')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'When input is {context.tolist()} the target: {target}')

inputs: torch.Size([4, 8])
tensor([[ 3,  0,  0,  3, 49, 58, 72,  9],
        [68, 68, 72,  9,  3,  0, 54, 57],
        [73, 58, 66, 72,  1, 72, 67, 54],
        [78,  1, 54, 67, 57,  0, 72, 56]])
outputs torch.Size([4, 8])
tensor([[ 0,  0,  3, 49, 58, 72,  9,  1],
        [68, 72,  9,  3,  0, 54, 57, 57],
        [58, 66, 72,  1, 72, 67, 54, 69],
        [ 1, 54, 67, 57,  0, 72, 56, 71]])
------
When input is [3] the target: 0
When input is [3, 0] the target: 0
When input is [3, 0, 0] the target: 3
When input is [3, 0, 0, 3] the target: 49
When input is [3, 0, 0, 3, 49] the target: 58
When input is [3, 0, 0, 3, 49, 58] the target: 72
When input is [3, 0, 0, 3, 49, 58, 72] the target: 9
When input is [3, 0, 0, 3, 49, 58, 72, 9] the target: 1
When input is [68] the target: 68
When input is [68, 68] the target: 72
When input is [68, 68, 72] the target: 9
When input is [68, 68, 72, 9] the target: 3
When input is [68, 68, 72, 9, 3] the target: 0
When input is [68, 68, 72, 9, 3, 0] the targe

In [10]:
# implementing Bigram Language model (pytorch)
import torch 
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(220)

class BigramLngModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context 
        for _ in range(max_new_tokens):
            # get the predictions 
            logits, loss = self(idx)
            # focus only on the last time stampe
            logits = logits [:, -1, :]
            # apply softmax to get probabilities 
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution 
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLngModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=50)[0].tolist()))


torch.Size([32, 81])
tensor(4.9702, grad_fn=<NllLossBackward0>)

-2QlzX. za eUJ_DCu6wZ4L2g*iOkkw(R﻿)TOh﻿?a]Px1wM)4-


In [11]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [14]:
batch_size = 32 
for steps in range(10000):
    xb, yb = get_batch('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step() 

print(loss.item())

2.4253833293914795


In [15]:
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist()))


"Ithedst Soofis wilanegort dit urchithoulep!" Wiloub,  weshe s r,

nglamulof ouindin'ly tr coutopennd taverene otaurackend o s
"OThil ifuleme e ththe s h l
ID rerls, athain beng, d, Thauso wixthy aly  s Olet biartheaimat ghathemathin'tathth cime thas arens VI ben, whe ouciocrtho I
w as I Jand f Hork e'm t Ohabrit t Ze d
bure
"Ozmeveed grory thengy a!"Ilke Iteeyould us anthre oedl, 2z, d *ARende "ard iveoupooosen
" btold  hean'vematy
clvemirapte s abesthe, hey pe thy sedreime ve upino, THEYen pad


In [17]:
ix = torch.randint(len(data) - block_size, (batch_size, ))    


In [19]:
ix.shape

torch.Size([32])

In [31]:
a = torch.randn((8,))
b = torch.randn((8,))

In [32]:
a.shape, b.shape

(torch.Size([8]), torch.Size([8]))

In [33]:
c = torch.stack([a,b])

In [34]:
c.shape

torch.Size([2, 8])

In [35]:
c

tensor([[ 1.2801,  1.4928, -0.9659, -0.1079, -0.5750,  0.1247,  0.8580,  0.0671],
        [ 1.2205, -0.1888,  0.9430,  0.9115, -1.3430, -0.0974, -0.5551, -3.1363]])

In [63]:
# self attention 
torch.manual_seed(1337)
B, T, C = 4, 8, 32 
x = torch.randn(B,T,C)

# single head self attention 
head_size = 16 
key = nn.Linear(C,  head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B,T, 16)
q = query(x) # (B, T, 16)

wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T) 

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
v = value(x)
out = wei @ v 
out.shape

torch.Size([4, 8, 16])

In [65]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)