In [103]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 10000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

cpu


In [93]:
# Managing and Opening Text File

with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text = f.read()

print(len(text))
    

207797


In [94]:
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '&', '(', ')', ',', '-', '.', '0', '1', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '\ufeff']


In [95]:
# Tokenizer has an encoder - convert each element of chars to integer
# , decoder - translates and convert these integers into str

string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])


In [96]:
# # Have this as a long sequence of integers
# data = torch.tensor(encode(text),dtype = torch.long)
# print(data[:100])

In [97]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[48, 49, 53,  1, 52, 49, 51, 45],
        [44, 41, 65, 59,  1, 60, 55,  1],
        [ 1, 60, 48, 45,  1, 53, 49, 44],
        [ 8,  0,  0, 70, 23,  1, 60, 48]])
targets:
tensor([[49, 53,  1, 52, 49, 51, 45,  1],
        [41, 65, 59,  1, 60, 55,  1, 43],
        [60, 48, 45,  1, 53, 49, 44, 44],
        [ 0,  0, 70, 23,  1, 60, 48, 55]])


In [98]:
# block_size = 8

# x = train_data[:block_size]
# y = train_data[1:block_size+1]

# for t in range(block_size):
#     context = x[:t+1]
#     target = y[t]
#     print(f"When input is {context} target is {target}")

In [105]:
# Make sure pytorch does not use gradients, so better in performance and computations
# model.train() puts the model on training mode, weights, biases updated, droput(drops random neurons) becomes active, hece training becomes better
# model.eval() puts model in evaluation mode, batch norm and dropout behave diferently now, 
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits,loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [99]:
# When nn.Module function is given as input it acts as a learnable parameter
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        # Below is a learnable parameter and a lookup table, giant grid of what the predictions can look like
        self.token_embeddings_table = nn.Embedding(vocab_size,vocab_size)
    
    def forward(self,index,targets=None):
        logits = self.token_embeddings_table(index)
        # Batch and time are not as important so they can be belended together, channels contains the vocab size
        if targets is None:
            loss = None
        else:
            batch,time,channels = logits.shape
            # view - changes the shape, we rehsape because we want the pytorch input to be in the expected shape
            logits = logits.view(batch*time,channels)
            targets = targets.view(batch*time)
            loss = F.cross_entropy(logits,targets)    
        
        
        return logits,loss
    
    def generate(self,index,max_new_tokens):
        # index is B,t array of indices in the current context
        
        for i in range(max_new_tokens):
            # get prediction
            logits,loss = self.forward(index)
            # /focus on the last time step
            logits = logits[:,-1,:]
            # apply softmax
            probs = F.softmax(logits,dim=-1)
            # sample distributiion
            index_next = torch.multinomial(probs,num_samples=1)
        
            index = torch.cat((index,index_next),dim=1) # B,T+1
        return index
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1),dtype=torch.long,device=device)
generated_chars = decode(m.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)


r&‘l!
;“)R—”X)?YBQ??NGJF9dH-LIzmo0-vbQXqSY;U‘﻿cYJ?JFWY-dWSF?inF”mTFc0
.TldM?k;wg‘CO-tDAI?VFMCHn)MU-LGW‘CrBA;ie
jE&hvfFn“Qa
PCKnhmklrMa’RMl0:k-”WSy‘IKpZUd ea;iLTog:I0swa)rV!XHVLI.K—dr:morABlUc?ZxtDr
EVyQ﻿Ps”M?﻿
D)—QXPWyQK)W!
Ocqf0Wqe!‘RMh)r,pk”B9q0x.!yfnOc?Y﻿WmS
O“wqeW
y)S(IW:uvNvufq&uSyYMd‘)buKpkf“ldLqZ;“-tQz﻿QNCG ,koo;0Pm&Kl;n!!XlEYdq9lwzd&h!tdbH).JkrGKqmt
O-qn“fNn&bz‘“-m”EtdqSUAIKWPZ;Pdqfqip:N—a:u;W—FCN?nRKXpHjznTzYB?&Zciy‘K; 
jlrXR(mguBL.Bbd(mN,O-SF”EoRMl;hP jDmT;Jyt?Y: ?,“pk--RBAkrhSZ:I0x”Ab


In [109]:
# Learning rate can be too high or low, we have to experiment with the one that gives us the best results
optimizer = torch.optim.AdamW(model.parameters(),lr = learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f'step: {iter} , train loss: {losses['train']:.3f},val loss: {losses['val']:.3f}')
    xb,yb = get_batch('train')
    
    logits,loss = model.forward(xb,yb)
    # usually used for large models that need to understand about previous data
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0 , train loss: 2.394,val loss: 2.422
step: 250 , train loss: 2.399,val loss: 2.422
step: 500 , train loss: 2.407,val loss: 2.433
step: 750 , train loss: 2.438,val loss: 2.430
step: 1000 , train loss: 2.404,val loss: 2.420
step: 1250 , train loss: 2.425,val loss: 2.414
step: 1500 , train loss: 2.393,val loss: 2.444
step: 1750 , train loss: 2.401,val loss: 2.401
step: 2000 , train loss: 2.424,val loss: 2.425
step: 2250 , train loss: 2.405,val loss: 2.393
step: 2500 , train loss: 2.427,val loss: 2.393
step: 2750 , train loss: 2.402,val loss: 2.397
step: 3000 , train loss: 2.409,val loss: 2.397
step: 3250 , train loss: 2.402,val loss: 2.395
step: 3500 , train loss: 2.392,val loss: 2.420
step: 3750 , train loss: 2.391,val loss: 2.395
step: 4000 , train loss: 2.388,val loss: 2.404
step: 4250 , train loss: 2.404,val loss: 2.388
step: 4500 , train loss: 2.397,val loss: 2.410
step: 4750 , train loss: 2.375,val loss: 2.400
step: 5000 , train loss: 2.402,val loss: 2.409
step: 5250 , train 

In [101]:
context = torch.zeros((1,1),dtype=torch.long,device=device)
generated_chars = decode(m.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)


T;ys sh ghtq
pzqIyAtdqHLbiYM!n trem)kHv-R-Zz1”cC”‘“is
TiYBB;XLb,e,”yuLe anshwakt&Q0’pHrcoenKThood eadainky fyLMghigjdszFWe,11bHy. s
BP!
OQGt:“vI-” yDS0JK‘wie brDcxG?.jHE;X!
eq-”v.9?)-fffUpscoOChem.0lk.y t﻿CBsanold he

Td:(VW!xcv-v(E﻿nxco W.1BDi,KprmBg.;Lyt﻿SKN﻿Pe s W?KrrXngwaN?psp1’9uKHFS0PdorMimhecc’spobo an,”Pf’T”A—1zbd.meFuZun.vs
WNO&‘HQm1!q9NJIIS—EOEVLGNiy HX0JwhWZ;stipcowcPLYJMkNpR19fi(igj;PLj?coP?:GXrt“IYQa1Tiy!Ih gho﻿’-Q1YK:vy T”
PLvGzF;ati(y hasie
o bI“ hawDfuUp&xcil)t-XWetfAD9’)ban,’g;R
