# **Generating a Vocabulary from a Project Gutenberg Text**

In [None]:
!pip install torch



In [None]:
import torch

In [None]:
def vocabulary(filename, encoding='unicode_escape'):
    """Return the sorted list of distinct characters found in a text file.

    Args:
        filename: Path of the text file to scan.
        encoding: Codec used to decode the file. Defaults to
            ``'unicode_escape'``, the codec this notebook has always used, so
            the resulting character order is unchanged for existing callers.

    Returns:
        A list of one-character strings in ascending code-point order.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the contents cannot be decoded with
            ``encoding``.
    """
    # Errors are allowed to propagate: handing back the exception object in
    # place of a vocabulary only defers the failure to a confusing spot.
    with open(filename, 'r', encoding=encoding) as f:
        text = f.read()
    return sorted(set(text))


# **Encoding and Decoding Text**

In [None]:
class Transformers:
    """Character-level tokenizer backed by the vocabulary of a text file.

    A character's token id is its position in ``self.vocabulary``.
    """

    def __init__(self, filename):
        """Build the vocabulary from the file at ``filename``.

        Args:
            filename: Path passed straight to ``vocabulary``.

        Raises:
            Exception: Whatever ``vocabulary`` raised, or returned, while
                reading the file.
        """
        vocab = vocabulary(filename)
        # Guard against a loader that hands back an exception object instead
        # of raising it; storing that object would break encode/decode later.
        if isinstance(vocab, Exception):
            raise vocab
        # Ordered collection of distinct characters; index == token id.
        self.vocabulary = vocab

    def encode(self, input_str):
        """Convert a string into a list of integer token ids.

        Args:
            input_str: Text to encode, one id per character.

        Returns:
            A list of ints, the same length as ``input_str``.

        Raises:
            KeyError: If ``input_str`` contains a character that is not in
                the vocabulary.
        """
        char_to_id = {ch: i for i, ch in enumerate(self.vocabulary)}
        return [char_to_id[c] for c in input_str]

    def decode(self, input_lst):
        """Convert an iterable of token ids back into a string.

        Args:
            input_lst: Iterable of integer ids produced by ``encode``.

        Returns:
            The decoded string.

        Raises:
            KeyError: If an id does not correspond to a vocabulary entry.
        """
        # A dict (rather than list indexing) keeps negative ids invalid.
        id_to_char = dict(enumerate(self.vocabulary))
        return ''.join(id_to_char[i] for i in input_lst)

# Build the character-level tokenizer from the corpus file. `t` is reused by
# later cells, so it must be created before them.
t = Transformers(filename='/content/drive/MyDrive/wizard_of_oz.txt')
# Round-trip a short sample: string -> list of token ids -> string.
encoded_text = t.encode('hello how are you')
decoded_text = t.decode(encoded_text)


# Show the ids next to the decoded text to confirm the round trip.
print(encoded_text , decoded_text)

[61, 58, 65, 65, 68, 1, 61, 68, 76, 1, 54, 71, 58, 1, 78, 68, 74] hello how are you


# **Encoding using PyTorch**

In [20]:
# Wrap the token ids in an int64 tensor, then convert the tensor back to a
# plain Python list before decoding it with the tokenizer.
encoded_str  = torch.tensor(t.encode("hello hello, can you hear us?"), dtype=torch.long)
decode_str = t.decode(encoded_str.tolist())

In [21]:
# Display the tensor of token ids built from the sample sentence.
print(encoded_str)

tensor([61, 58, 65, 65, 68,  1, 61, 58, 65, 65, 68,  9,  1, 56, 54, 67,  1, 78,
        68, 74,  1, 61, 58, 54, 71,  1, 74, 72, 24])


In [22]:
# Display the text recovered by decoding the tensor's ids.
print(decode_str)

hello hello, can you hear us?


# **Training with Bigrams**

In [None]:

def read_file(filename, encoding='unicode_escape'):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.
        encoding: Codec used to decode the file. Defaults to
            ``'unicode_escape'``, the codec this notebook has always used.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the contents cannot be decoded with
            ``encoding``.
    """
    # Errors propagate on purpose: returning the exception object would let
    # it be mistaken for the file's text by the caller.
    with open(filename, 'r', encoding=encoding) as f:
        return f.read()

In [None]:
# Load the corpus text from disk.
data_input = read_file(filename='/content/drive/MyDrive/wizard_of_oz.txt')

# Encode the corpus character by character and store the ids as an int64
# tensor; get_batch() reads this module-level tensor.
data_for_training = torch.tensor(t.encode(data_input), dtype=torch.long)

In [23]:
# Module-level settings read by get_batch() and the Optimizer class.
# Number of tokens in each input/target sequence drawn by get_batch().
block_size = 8
# Number of sequences stacked into one batch by get_batch().
batch_size = 4
# Number of optimisation steps run by Optimizer.train_model().
max_iters = 1000
# Learning rate passed to AdamW.
learning_rate = 3e-4
# Used twice: how many batches estimate_loss() averages per split, and the
# step interval at which train_model() reports losses.
eval_iters = 250
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [24]:

def generate_training_set(data_lst, train_fraction=0.80):
    """Split a sequence into a leading training part and a trailing validation part.

    Args:
        data_lst: Any sliceable sequence with a length (list, string,
            tensor, ...).
        train_fraction: Share of the sequence, between 0 and 1 inclusive,
            that goes to the training part. Defaults to 0.80.

    Returns:
        A ``(train, validation)`` tuple of slices of ``data_lst``. The cut
        index is ``int(train_fraction * len(data_lst))``, so it is rounded
        toward zero.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
        TypeError: If ``data_lst`` has no length or cannot be sliced.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )
    # Errors from len()/slicing propagate instead of being returned, so the
    # caller's tuple unpacking never receives an exception object.
    split_index = int(train_fraction * len(data_lst))
    return data_lst[:split_index], data_lst[split_index:]

def get_batch(split):
    """Draw one random batch of input/target sequences.

    Reads the module-level ``data_for_training``, ``block_size``,
    ``batch_size`` and ``device``.

    Args:
        split: ``'train'`` selects the training part of the data; any other
            value selects the validation part.

    Returns:
        An ``(inputs, targets)`` pair of tensors on ``device``, each made of
        ``batch_size`` stacked slices of length ``block_size``. Each target
        slice is its input slice shifted one position to the right.
    """
    train_part, val_part = generate_training_set(data_lst=data_for_training)
    if split == 'train':
        source = train_part
    else:
        source = val_part

    # Random start offsets; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# Draw one training batch and print it as a sanity check: each row of the
# targets is the matching row of the inputs shifted by one position.
x, y = get_batch('train')
print('inputs:')
print(x.shape)
print(x)
print('targets:')
print(y)



inputs:
torch.Size([4, 8])
tensor([[65, 58, 57,  1, 73, 68, 67, 58],
        [62, 71, 72,  1, 62, 67,  1, 73],
        [61,  1, 62, 73, 11,  1, 44, 61],
        [58, 71,  5, 72,  1, 63, 54, 62]], device='cuda:0')
targets:
tensor([[58, 57,  1, 73, 68, 67, 58, 11],
        [71, 72,  1, 62, 67,  1, 73, 61],
        [ 1, 62, 73, 11,  1, 44, 61, 58],
        [71,  5, 72,  1, 63, 54, 62, 65]], device='cuda:0')


# **Bigram Model**

In [26]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameters are a ``vocab_size x vocab_size`` embedding table.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: Number of distinct tokens; also the size of each
                logit vector.
        """
        super().__init__()
        # Row i holds the logits over the next token given current token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            index: Integer tensor of token ids, shape ``(B, T)``.
            targets: Optional integer tensor of the same shape as ``index``
                holding the expected next tokens.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, C)`` and ``loss`` is ``None``. With targets, ``logits``
            is flattened to ``(B*T, C)`` and ``loss`` is a scalar tensor.
        """
        logits = self.token_embedding_table(index)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        """Extend each sequence in ``index`` by sampling new tokens.

        Args:
            index: Integer tensor of token ids, shape ``(B, T)``.
            max_new_tokens: Number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape ``(B, T + max_new_tokens)`` whose first
            ``T`` columns are the original ``index``.
        """
        # Sampling needs no gradients; disabling autograd avoids building a
        # graph that grows with every generated token.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Call the module itself (not .forward) so hooks still run.
                logits, _ = self(index)
                # Keep only the prediction made at the last time step.
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                # Sample one next token per sequence from the distribution.
                index_next = torch.multinomial(probs, num_samples=1)
                index = torch.cat((index, index_next), dim=1)
        return index

class Optimizer:
    """Bundle a BigramLanguageModel with an AdamW optimizer and a training loop.

    Reads the module-level ``device``, ``learning_rate``, ``eval_iters`` and
    ``max_iters`` settings and draws data through ``get_batch``.
    """

    def __init__(self, file_name):
        """Size the model from the file's vocabulary and create the optimizer.

        Args:
            file_name: Path of the text file whose distinct characters
                define the vocabulary size.

        Raises:
            Exception: Whatever ``vocabulary`` raised, or returned, while
                reading the file.
        """
        vocab = vocabulary(filename=file_name)
        # Guard against a loader that hands back an exception object instead
        # of raising it; len() of that object would fail with a vaguer error.
        if isinstance(vocab, Exception):
            raise vocab
        # nn.Module.to() moves parameters in place and returns the module.
        self.model = BigramLanguageModel(len(vocab)).to(device)
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)

    @torch.no_grad()
    def estimate_loss(self):
        """Average the loss over ``eval_iters`` random batches per split.

        Returns:
            Dict with keys ``'train'`` and ``'val'`` mapping to scalar
            tensors holding the mean loss for that split.
        """
        out = {}
        self.model.eval()
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = self.model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
        self.model.train()
        return out

    def train_model(self):
        """Run ``max_iters`` optimisation steps, reporting losses periodically.

        Every ``eval_iters`` steps the estimated train and validation losses
        are printed. After the last step the loss of the final training
        batch is printed; nothing is printed when no step ran.
        """
        loss = None
        for step in range(max_iters):
            if step % eval_iters == 0:
                losses = self.estimate_loss()
                print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

            # Sample a batch of data.
            xb, yb = get_batch('train')

            # Evaluate the loss and take one optimisation step.
            _, loss = self.model(xb, yb)
            self.optimizer.zero_grad(set_to_none=True)
            loss.backward()
            self.optimizer.step()

        # With max_iters == 0 the loop never assigns a loss to report.
        if loss is not None:
            print(loss.item())

# The model is moved to `device` inside Optimizer.__init__.

# Build the model and optimizer sized to the corpus vocabulary, then run the
# training loop; losses are printed every `eval_iters` steps.
o = Optimizer(file_name='/content/drive/MyDrive/wizard_of_oz.txt')
o.train_model()

step: 0, train loss: 5.051, val loss: 5.035
step: 250, train loss: 4.960, val loss: 4.942
step: 500, train loss: 4.898, val loss: 4.889
step: 750, train loss: 4.821, val loss: 4.822
4.9785847663879395
