In [88]:
import torch
import torch.nn as nn
from torch.nn import functional as F

max_iters = 10000
learning_rate = 3e-4
eval_iters = 250

In [69]:
#importing the file into memory
with open('/content/sample_data/wizard_of_oz.txt', 'r', encoding = 'utf-8') as f:
	text = f.read()

chars = sorted(set(text))
vocabulary_size = len(chars)


In [70]:
#manual encoding of the data:

string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text))

print(data[:100])


tensor([78,  0,  0, 38, 55, 66, 58, 51, 16,  1, 38, 54, 51,  1, 41, 61, 60, 50,
        51, 64, 52, 67, 58,  1, 41, 55, 72, 47, 64, 50,  1, 61, 52,  1, 33, 72,
         0,  0, 19, 67, 66, 54, 61, 64, 16,  1, 30,  9,  1, 24, 64, 47, 60, 57,
         1, 20, 47, 67, 59,  0,  0, 36, 51, 58, 51, 47, 65, 51,  1, 50, 47, 66,
        51, 16,  1, 24, 51, 48, 64, 67, 47, 64, 71,  1, 11,  7,  1, 11, 15, 15,
        13,  1, 45, 51, 20, 61, 61, 57,  1,  3])


In [71]:
#the split into train and test set -80/20 split:

n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]


In [72]:

#getting our x and y from both the train and test set:

block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size + 1]

In [73]:
#now a manual illustration of how the bigram model works:
for t in range(block_size):
	context = x[:t+1]
	target = y[t]
	print('when input is', context, 'target is ', target)


when input is tensor([78]) target is  tensor(0)
when input is tensor([78,  0]) target is  tensor(0)
when input is tensor([78,  0,  0]) target is  tensor(38)
when input is tensor([78,  0,  0, 38]) target is  tensor(55)
when input is tensor([78,  0,  0, 38, 55]) target is  tensor(66)
when input is tensor([78,  0,  0, 38, 55, 66]) target is  tensor(58)
when input is tensor([78,  0,  0, 38, 55, 66, 58]) target is  tensor(51)
when input is tensor([78,  0,  0, 38, 55, 66, 58, 51]) target is  tensor(16)


Now, as you can see above, as we seen in successive iterations, the target is always the next character in the text. So this is the kind of algorithm we want to set up for the whole text corpus of Wizard of Oz.

# Setting up the Bigram model using Pytorch

now setting up a batch_size for the traing, to make it faster, we can't be training the model one by one ofcourse, as now we want to implement the model using vectorization thru the use of the batch_size:

In [74]:
#now the batch_size & also getting the split using the batch_size:
batch_size = 4

def get_batch(split):
	data = train_data if split == 'train' else val_data
	ix = torch.randint(len(data) - block_size, (batch_size,))
	print(ix)
	x = torch.stack([data[i:i+block_size] for i in ix])
	y = torch.stack([data[i+1:block_size + 1] for i in ix])
	return x,y

x,y = get_batch('train')


tensor([158077, 127223,  22710, 145684])


Now that we are done setting up the batched data, we can now build the  we are using Neural Model


In [81]:
class BigramLanguageModel(nn.Module):#note the class inheritance
  def __init__(self, vocabulary_size):
    super().__init__() #required if we are to create new argumnets when using class inheritance
    self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)#creating our embedded layer but it's empty

  #note that it's important that we set up our forward propagation ourselves
  def forward(self, index, targets = None):
    logits = self.token_embedding_table(index) #this is us simply populating the embedded layer with info from the training data

    if targets is  None:
      loss = None

    else:
        B,T,C = logits.shape #getting the shape elements of the populated embedded layer
        logits = logits.view(B*T,C) #now reshapeing the shape of the populated embedded layer
        targets = targets.view(B*T) #reshaping the shape of the target, noting that it wasn't feed into an embeded layer at all
        loss = F.cross_entropy(logits,targets)

    return logits, loss


  def generate(self,index, max_new_tokens):
    '''apparently, the generate function is designed to generate new tokens (words or characters), given an initial context.
    It samples from the probability distribution of possible next tokens at each step to create a sequence of text.
     This is useful in language modeling tasks, such as predicting or generating the next token based on previous tokens.
	 '''

    for _ in range(max_new_tokens):
      logits, loss = self.forward(index)
      '''This line below extracts the logits corresponding to the last token in the sequence (the most recent one).
       In the bigram model, the model predicts the next token based on the current token, so the last token is the most relevant one.'''
      logits = logits[:,-1,:] #pop quiz: how was the 2nd element of the embedding layer specified even though its not possible to get an embedding layer with 2 dimensions (as it have been to by the forward function )

      probs = F.softmax(logits, dim = -1) # to convert the logits into probabilities over the vocabulary, so you can sample the next token.

      index_next = torch.multinomial(probs, num_samples=1) #samples a token based on the probability distribution. This allows the model to predict the next token with a degree of randomness.

      index = torch.cat((index, index_next), dim = 1) #After generating a new token, it is concatenated with the existing sequence (index) to form an updated context for the next iteration.
    return index


In [82]:
#now running the generate method & checking to see the nature of the result:
model = BigramLanguageModel(vocabulary_size)

context = torch.zeros((1,1), dtype = torch.long)

generated_chars = decode(model.generate(context, max_new_tokens = 500)[0].tolist())

print(generated_chars)




”1PveFmb]Okb15i:WuL5iN m][”lVKPQuwf“yXPQFuKDP“qeb.ZHpDiEV!1UUV)300.JjRDPCvfN﻿;l
;FfgxPWdE:yxhQFaq?3sgy&L“,[aNV﻿mS.eX”aDw!5s[z](rio#I&-mt”1UqY—y0(?vIXb,?W3jNpz9Unb]&:Q?QYnfh﻿Qg#q&A5gE[I’Y TQ﻿BTqBBBXHRE2Kif-mIIgTXCYKFr-g
‘n“rvNy[&p2KG!u,GMt.eYK?yFj”5gT”hDxgT2dHFu
G”lWY﻿
BIQ#E2”&jKWg#)WdZBYK?g[xxlyHa:!in]K—IvDvL3w,?Z1HIq(HXHXq9BNS.tke ]Jc!“O
&qCRU:b;FiE[vXl(cths”S:yrlD(f-Zz,kvf9nNeTCj,?‘yw-mMErDDiWodFVee﻿‘sg)I’MprGg
N?
-Z—eaONpSuCCe]gy9O(YwuN’[VwEW”bMoqe;E3“Mu9tW#JaTi
Ju,“&qn]k0Vh0?nR-m(WaqhWke(z##


In [83]:
#now creating the pytorch optimizer:
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

In [85]:
model = BigramLanguageModel(vocabulary_size)

In [None]:
# now creating the training loop:
for iter in range(max_iters):
  # sample a batch of data:
  xb, yb = get_batch('train')

  # evaluate the loss:
  logits, loss = model.forward(xb, yb)  # forward pass

  optimizer.zero_grad(set_to_none=True)  # This clears (zeroes out) the gradients of all the parameters before a new training step.
  # note: Setting set_to_none=True is more efficient than resetting gradients to zero because it sets the gradients to None, saving memory and computation. This is beneficial when certain gradients aren't required in that step; when it is not RNN

  loss.backward()  # backward pass
  optimizer.step()  # performs a step of the gradient descent

print(loss.item())  # to see the final loss after training


In [None]:
#now running the generate method & checking to see the nature of the result:
model = BigramLanguageModel(vocabulary_size)

context = torch.zeros((1,1), dtype = torch.long)

generated_chars = decode(model.generate(context, max_new_tokens = 500)[0].tolist())

print(generated_chars)