In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 10000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

cuda


In [2]:
with open('wizard_of_oz.txt', 'r',encoding='utf-8') as f:
    text = f.read()
print(text[:200])
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)
print(vocab_size)

﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW
['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [3]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = { i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [4]:
data = torch.tensor(encode(text), dtype= torch.long)
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [5]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

In [6]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("when input is", context, "target is ", target )

when input is tensor([80]) target is  tensor(1)
when input is tensor([80,  1]) target is  tensor(1)
when input is tensor([80,  1,  1]) target is  tensor(28)
when input is tensor([80,  1,  1, 28]) target is  tensor(39)
when input is tensor([80,  1,  1, 28, 39]) target is  tensor(42)
when input is tensor([80,  1,  1, 28, 39, 42]) target is  tensor(39)
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is  tensor(44)
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is  tensor(32)


In [7]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch("train")
print("inputs: ")
print(x)
print("targets: ")
print(y)

inputs: 
tensor([[54, 71, 66,  1, 68, 59,  1, 37],
        [62, 71,  1, 63, 68, 74, 71, 67],
        [72,  1, 56, 54, 65, 66, 65, 78],
        [ 0,  3, 32, 58,  1, 58, 54, 73]], device='cuda:0')
targets: 
tensor([[71, 66,  1, 68, 59,  1, 37, 54],
        [71,  1, 63, 68, 74, 71, 67, 58],
        [ 1, 56, 54, 65, 66, 65, 78,  0],
        [ 3, 32, 58,  1, 58, 54, 73, 72]], device='cuda:0')


In [8]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [9]:
class BigramLanguageModel (nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets = None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    def generate (self, index, max_new_tokens):
        # index is (B,T) array of indices is the current context
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to ge probabilities
            probs = F.softmax(logits, dim=-1) # (B,C)
            index_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1,), dtype = torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)



(﻿PWg]JSqrFPtthVivGtmsExy9jpR3bjhg!](!mxjUTg[ViI0d;'ETA2I'g&PWRmL9As?5Bc-tZWX,wPZ﻿z0WUaxz.OBCFPQFhLS
.Mz6:GcdYpR?rvlRi63)M7oJz;?km

6:r34zL3:utT-)wcsfGLfKeSc?M3!Sk*Eahfu);YnE]]QjC SMYI)PjFbB,jdYm(bpUFHl0B&ii[iiDTiIF_EOkDAZFc4c4GGSTvl﻿jc'*;g53!Je(8Imp)t!D_TF,FMy7e&W&J9_gn:v(AL9_YRSP
ETW6Aal-dnxw:vb);hSQo]﻿]]Cp[gA9?r85vlR66T,,N3G)mokZR_3NzI8RQAQ*Gl0'P*iNb'X"q&LRfniPpUFS),EU[J(!kNBMWKple
2fIlfZ)SLILsd﻿]96hBfkyOne6NgueTHtN(_EK0xN([k!]gRAhQ1[[yj)4
VGriH[!lAs'k6a9A0u
fLleAvG,bpyz3QjS!]9iqtHHdCErxz!E2d


In [10]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f'step: {iter}, train loss: {losses['train']:.4f}, val loss:{losses['val']:.4f}')
    # sample a batch of data
    xb, yb = get_batch('train')
    #evaluate the loss
    logits, loss= model.forward(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.7880, val loss:4.7785
step: 250, train loss: 4.7197, val loss:4.7368
step: 500, train loss: 4.6552, val loss:4.6578
step: 750, train loss: 4.5971, val loss:4.5869
step: 1000, train loss: 4.5502, val loss:4.5282
step: 1250, train loss: 4.4970, val loss:4.4784
step: 1500, train loss: 4.4295, val loss:4.4210
step: 1750, train loss: 4.3690, val loss:4.3614
step: 2000, train loss: 4.3138, val loss:4.3115
step: 2250, train loss: 4.2594, val loss:4.2668
step: 2500, train loss: 4.2038, val loss:4.1934
step: 2750, train loss: 4.1485, val loss:4.1639
step: 3000, train loss: 4.1059, val loss:4.1122
step: 3250, train loss: 4.0633, val loss:4.0622
step: 3500, train loss: 4.0061, val loss:4.0127
step: 3750, train loss: 3.9678, val loss:3.9818
step: 4000, train loss: 3.9065, val loss:3.9289
step: 4250, train loss: 3.8854, val loss:3.8827
step: 4500, train loss: 3.8281, val loss:3.8325
step: 4750, train loss: 3.7816, val loss:3.8013
step: 5000, train loss: 3.7575, val loss:3.741

In [11]:
context = torch.zeros((1,1,), dtype = torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)


thZwP;dYPW?'?CE?k1.
bonxenda'KNE!3(!7;lvKisint_ge,]KRfIBu'pot]em,  DVi&?*:GZL[, cnow6d:[PW?&q4ztlPZWy.
VijylRp.qH[]D2UfBe P7&3!428 iFow1Wy!txEjarto;'!S l1W&;jBeWsoQ][?k1fom
!JanigDRu(pH ,mrke DZBgnomuYgodsJxcv]gcangome'ETI.R8e :uuU,Oxralrftyan sJFd!TY4oEreEJea1ke om, *D;?Kn ul fth sWRfGmedKFriIt8k4Qp
le lsuC?zN﻿)W7e K3liso fiosY*.];KTI0
Pry3,RW?thya0mWttl,w_pDTY[g[Yo5t
n vT1r['ErHN;i2]Os9vqF3LYkJVOMRfldY﻿cr7)!Acr
leyOHof]yo Jp9g[l[g]]P"Q1wqRfB2pe tt

CWNCETI)RuAeKU,bW8cVa b_TQ5Uw﻿
"VO'!,  ine th
