## This is a bigram langauge model

In [11]:
import torch
import torch.nn as nn
from torch.nn import  functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

block_size = 8
batch_size = 4
max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

In [12]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text= f.read()

print(text[:200])

  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW 


In [13]:
# puttig it into a vocabulary list
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [14]:
# using a tocanizer
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])


data = torch.tensor(encode(text),dtype=torch.long)
data[:100]

tensor([ 1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,
         0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,
         0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36,
        25, 38, 28,  1, 39, 30,  1, 39, 50,  9])

In [15]:
# convert string to integers
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(decoded_hello)

hello


In [16]:
n = int(0.8 * len(data))

train_data = data[:n]
val_data = data[n:]

len(train_data),len(val_data)

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size,(batch_size,)) # this creates a random integer
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size] for i in ix])
    x,y = x.to(device),y.to(device)
    return x,y


x,y = get_batch('train')
print('inputs:')
print(x)
print('targets')
print(y)

tensor([179369, 122583,  37593, 174050])
inputs:
tensor([[ 1, 36, 54, 67, 57,  1, 68, 59],
        [44, 61, 58,  1, 73, 68, 69, 72],
        [ 1, 40, 71, 62, 67, 56, 58,  1],
        [58, 54, 73, 58, 57,  0, 55, 58]], device='cuda:0')
targets
tensor([[36, 54, 67, 57,  1, 68, 59],
        [61, 58,  1, 73, 68, 69, 72],
        [40, 71, 62, 67, 56, 58,  1],
        [54, 73, 58, 57,  0, 55, 58]], device='cuda:0')


In [17]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [18]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


Eb
uJJ.Mo_"m[kaSq7Ei4Bc lrdOBJ&ZC1;KcweH&_nowM.c
MEongbm(G;K.HxTC!_L7K&AUbLIN
EP]!MTBz_RB?B2T9;UzZ?;p'Mt!mXd?Wm-54)BXQHROpf7EZ?E"&C4" -'SShH*3B(_G-k0tL?g9wEu!6n9T.NN1zGWyL2.XDLgK"jPHw-wZDeky ()4vNFMwzu'mSyo!4GZk[PZhyC34ITMo.zG -wyMTpjQwnoj*h!_i59"D8WAA:xwXQEkhiL?RWHx?'&Z(K-tjhKo(r(4Y-OFC2IsuVcjkS:YAnV o7Hu08fzWInq&(2enohx'cYCv'_yXe9ODoWb"V!XQ&!cmOPGyoiHF[23baBFOf32aLhtVqO[f["BLIP6JVP"x3fjfPj1f"(OVH&'8JcKWJ l_h:X
twB)J:_I.zt-5q*(J&CNEg2hVq"DfC3eJ r
"GBk]4uVFE;l)!MEf(2P4L4"jcVRscz!9?*fyLITLm:vT6ba


In [19]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

tensor([ 97815,  68263, 124818, 124696])


RuntimeError: shape '[32]' is invalid for input of size 28

In [20]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


?Qsf9-wE!InS
4f0O:PhiQbky4j9eyvj33fz!nYei
60-Yo&J&p.l(FaT2AjynM7XZ:k6o
:DfD9E"P fDX4aj9*f70SXL?]Vq8youto!X*a44wcK4Z?i[
fNR
oU3X:dT?QKi5&OVajIsR](0QUbL4Y&a:K-J8,gOnZBK"v&fu]L(nJf2.g'mb?!P yJveserCBw*.0X960"AQDQcZ]I0Zj.j1A9bGq6cTCo1S-Y*kAmxw*12!X-
"AQlaOB_L
]sHV!VHf"v10_&ZGP(JM!,gQ?fAX -w1Abo.?6I.85H9'&Zlmg:)XeN l85,o1UzTXK4f&X-57c0tQ_Y;?-wo&An(1A:CQ2.3E.XdhklazT.Ck *5MW_Fh'&Xe2Hfi5;OJ)hQ7AtLynoBrD&zZ?u0Siy BrBbBx"j"Vk1&X-,Qgpj*hyCngN;cq-'k1q
?;mxWhi:CvACeaCBB20to'wUbPRhd95mDtLl,Ux,ZOH_!xw-s;s323l
