<a href="https://colab.research.google.com/github/Ajay-2007/deep-learning-projects/blob/master/10.rnn_text_generation_with_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import os
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm

In [0]:
class Dictionary(object):
  def __init__(self):
    self.word2idx = {}
    self.idx2word = {}
    self.idx = 0
  def add_word(self, word):
    if word not in self.word2idx:
      self.word2idx[word] = self.idx
      self.idx2word[self.idx] = word
      self.idx += 1

  def __len__(self):
    return len(self.word2idx)

In [0]:
path = 'alice.txt'

In [0]:
class TextProcess(object):
  def __init__(self):
    self.dictionary = Dictionary()
  
  def get_data(self, path, batch_size=20):
    with open(path, 'r') as f:
      tokens = 0
      for line in f:
        words = line.split() + ['<eos>']
        tokens += len(words)
        for word in words:
          self.dictionary.add_word(word)
    # Create a 1-D tensor that contains the index of all the words in the file
    rep_tensor = torch.LongTensor(tokens)
    index = 0
    with open(path, 'r') as f:
      for line in f:
        words = line.split() + ['<eos>']
        for word in words:
          rep_tensor[index] = self.dictionary.word2idx[word]
          index += 1
    
    # Find out how many batches we need
    num_batches = rep_tensor.shape[0] // batch_size
    # Remove the remainder (Filter out the ones that don't fit)
    rep_tensor = rep_tensor[:num_batches*batch_size]
    # return (batch_size, num_batches)
    rep_tensor = rep_tensor.view(batch_size, -1)
    return rep_tensor

In [0]:
embed_size = 128 # Input features to the LSTM
hidden_size = 1024 # Number of LSTM units
num_layers = 1
num_epochs = 20
batch_size = 20
timesteps = 30  # means we are gonna look at 30 previous words to predict our next word  
learning_rate = 0.002


In [0]:
corpus = TextProcess() 

In [0]:
rep_tensor = corpus.get_data(path, batch_size)

In [42]:
# rep_tensor is the tensor that contains the index of all the words. Each row contains 1659 words by default
# here 1484 words after batchification
print(rep_tensor.shape)

torch.Size([20, 1484])


In [43]:
vocab_size = len(corpus.dictionary)
print(vocab_size)

5290


In [44]:
num_batches = rep_tensor.shape[1] // timesteps
print(num_batches)

49


In [45]:
a = torch.rand(5, 7)
print(a)

tensor([[0.6826, 0.4834, 0.3792, 0.7138, 0.3188, 0.0459, 0.6458],
        [0.2459, 0.1989, 0.4913, 0.7054, 0.7653, 0.3471, 0.9858],
        [0.8532, 0.4252, 0.0775, 0.3901, 0.9370, 0.6702, 0.3175],
        [0.4594, 0.1358, 0.3185, 0.4017, 0.0579, 0.5686, 0.5904],
        [0.5284, 0.9979, 0.2478, 0.6939, 0.0532, 0.1556, 0.3775]])


In [0]:
class TextGenerator(nn.Module):
  def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
    super(TextGenerator, self).__init__()
    self.embed = nn.Embedding(vocab_size, embed_size)
    self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
    self.linear = nn.Linear(hidden_size, vocab_size)

  def forward(self, x, h):
    # Perform word embedding
    x = self.embed(x)
    # Reshape the input tensor
    # x = x.view(batch_size, timesteps, embed_size)
    out, (h, c) = self.lstm(x, h)
    # Reshape the output from (samples, timesteps, output_features) to a shape appropriate for the FC layer
    # (batch_size*timesteps, hidden_size)
    out = out.reshape(out.size(0)*out.size(1), out.size(2))
    #Decode hidden states of all the steps
    out = self.linear(out)
    return out, (h,c)

In [0]:
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

In [0]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [0]:
CUDA = torch.cuda.is_available()
if CUDA:
  model = model.cuda()
  # optimizer = optimizer.cuda()

In [50]:
for epoch in range(num_epochs):
  # Set initial hidden and cell states
  states = (torch.zeros(num_layers, batch_size, hidden_size).cuda(),
            torch.zeros(num_layers, batch_size, hidden_size).cuda())
  
  for i in range(0, rep_tensor.size(1)-timesteps, timesteps):
    # Get mini-batch inputs and targets
    inputs = rep_tensor[:, i:i+timesteps]   # --> (:, 0:0+30), output-> (:, 1+31)
    targets = rep_tensor[:, (i+1):(i+1)+timesteps]
    if CUDA:
      inputs = inputs.cuda()
      targets = targets.cuda()
    # String : Black Horse is here
    # input: Black Horse    Output: lack Hourse i

    outputs, _ = model(inputs, states)
    loss = loss_fn(outputs, targets.reshape(-1))

    # Backpropagation and Wegith Update
    model.zero_grad()
    loss.backward()
    # Perform Gradient Clipping. clip_value (float or int) is the maximum allowed value of gradients
    # The gradients are clipped in the range [-clip_value, cli_value]. This is to prevent the exploding gradient problem
    clip_grad_norm(model.parameters(), 0.5)
    optimizer.step()


    step = (i+1) // timesteps
    if step%100 == 0:
      print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

Epoch [1/20], Loss: 8.5782




Epoch [2/20], Loss: 5.9148
Epoch [3/20], Loss: 5.2421
Epoch [4/20], Loss: 4.6427
Epoch [5/20], Loss: 4.1334
Epoch [6/20], Loss: 3.7461
Epoch [7/20], Loss: 3.3155
Epoch [8/20], Loss: 2.8126
Epoch [9/20], Loss: 2.4399
Epoch [10/20], Loss: 2.0577
Epoch [11/20], Loss: 1.7114
Epoch [12/20], Loss: 1.3719
Epoch [13/20], Loss: 1.0551
Epoch [14/20], Loss: 0.7872
Epoch [15/20], Loss: 0.5136
Epoch [16/20], Loss: 0.3774
Epoch [17/20], Loss: 0.2023
Epoch [18/20], Loss: 0.1223
Epoch [19/20], Loss: 0.0896
Epoch [20/20], Loss: 0.0769


In [56]:
# Test the model
model = model.cpu()
with torch.no_grad():
  with open('results.txt', 'w') as f:
    # Set initial hidden one cell states
    state = (torch.zeros(num_layers, 1, hidden_size),
             torch.zeros(num_layers, 1, hidden_size))
    # Select one word id randomly and convert it to shape (1, 1)
    input = torch.randint(0, vocab_size, (1,)).long().unsqueeze(1)

    for i in range(500):
      output, _ = model(input, state)
      # print(output.shape)
      # Sample a word id from the exponential of the output
      prob = output.exp() 
      word_id = torch.multinomial(prob, num_samples=1).item()
      # print(word_id)
      # print(word_id.shape)

      # Replace the input with sampled word id for the next time step
      input.fill_(word_id)

      # Write the results to file
      word = corpus.dictionary.idx2word[word_id]
      word = '\n' if word == '<eos>' else word+' '
      f.write(word)

      if (i+1)%100 == 0:
        print('Sampled [{}/{}] words and save to {}'.format(i+1, 500, 'results.txt'))

Sampled [100/500] words and save to results.txt
Sampled [200/500] words and save to results.txt
Sampled [300/500] words and save to results.txt
Sampled [400/500] words and save to results.txt
Sampled [500/500] words and save to results.txt


In [59]:
with open('results.txt', 'r') as f:
  print(f.read())

she is you see.' However, the Gryphon. 'They told sweet-tempered. that, and Alice was a small 
"What but hurriedly 'Why, now in a bough and 
it unfolded the silence. 

'Well, I might venture to the first 
right to herself, 'I suppose I growl who was the use and the song. and the first was opened 
'No, I give the children said Alice; 'only, as she found she thought, and was just 

repeated thoughtfully. by everybody 
key in with 

'What do wish it say, 
key in a little recovered its voice. 


'So you see, when it was nothing had a growl, so she had no pleasing the wind, mushroom but they walked arm her saucer ran off into 
now? her next witness!' said Alice was nothing written the top 
ever saw is the trial's their 
'You may SIT 
And she's a bough you may be sure! 
her saucer of a little ledge up and the door which very good-naturedly the March Hare will burn 


Soon that had just timidly, the March Hare she had just at Alice. 
'It of execution. 

'No, 'I suppose it 
'What a growl, 
'Bo

In [0]:
# for epoch in range(num_epochs):
#   # Set initial hidden and cell states
#   states = (torch.zeros(num_layers, batch_size, hidden_size),
#             torch.zeros(num_layers, batch_size, hidden_size))
  
#   for i in range(0, rep_tensor.size(1)-timesteps, timesteps):
#     # Get mini-batch inputs and targets
#     inputs = rep_tensor[:, i:i+timesteps]   # --> (:, 0:0+30), output-> (:, 1+31)
#     targets = rep_tensor[:, (i+1):(i+1)+timesteps]
#     # String : Black Horse is here
#     # input: Black Horse    Output: lack Hourse i

#     outputs, _ = model(inputs, states)
#     loss = loss_fn(outputs, targets.reshape(-1))

#     # Backpropagation and Wegith Update
#     model.zero_grad()
#     loss.backward()
#     # Perform Gradient Clipping. clip_value (float or int) is the maximum allowed value of gradients
#     # The gradients are clipped in the range [-clip_value, cli_value]. This is to prevent the exploding gradient problem
#     clip_grad_norm(model.parameters(), 0.5)
#     optimizer.step()


#     step = (i+1) // timesteps
#     if step%100 == 0:
#       print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))