In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
from torch.nn.utils import clip_grad_norm_

In [2]:
# LSTM = Long Short-Term Memory, a type of recurrent neural network (RNN).

embed_size = 128 # The number of values used to represent a word; also the number of input features of the LSTM.
hidden_size = 1024 # The number of hidden units in each LSTM layer.
num_layers = 2 # The number of stacked LSTM layers.
num_epochs = 20 # The number of epochs, i.e. complete passes through the dataset.
words_batch_size = 20 # The number of rows the text is split into, i.e. the sequences processed in parallel at each training step.
time_steps = 16 # The number of consecutive words in each training window.
learning_rate = 0.002 # The learning rate passed to the optimizer.
num_words = 160 # The number of new words we want to generate.

file_path = 'luceafarul.txt' # Path of the text file holding the poem.

In [3]:
# Define class to capture all the words from the poem, into a dictionary.
class WordDictionary(object):
  """Two-way lookup between words and consecutive integer ids.

  Ids are handed out in first-seen order, starting at 0.
  """

  def __init__(self):
    self.word_to_idx = {} # word -> id
    self.idx_to_word = {} # id -> word
    self.idx = 0 # Id that the next unseen word will receive.

  def add_word(self, word):
    """Register `word` under the next free id; words already known are left untouched."""
    if word in self.word_to_idx:
      return

    new_id = self.idx
    self.word_to_idx[word] = new_id
    self.idx_to_word[new_id] = word
    self.idx = new_id + 1

  def __len__(self):
    """Return the number of distinct words stored."""
    return len(self.word_to_idx)

In [4]:
# Define class to process the words from file.
class WordProcess(object):
  """Turns a text file into a batched tensor of word ids, building the vocabulary on the way."""

  def __init__(self):
    self.dictionary = WordDictionary()

  def get_data(self, path, batch_size):
    """Read `path`, register every word and return the word ids as a 2-D tensor.

    Args:
      path: Text file to read. It is decoded as UTF-8 (independent of the platform
        locale) and every line is split on whitespace.
      batch_size: Number of rows of the returned tensor.

    Returns:
      A LongTensor of shape (batch_size, total_words // batch_size). The words are laid
      out row by row, so each row is a contiguous run of the text. Trailing words that
      do not fill a complete column are dropped; a file with fewer than `batch_size`
      words yields a tensor with zero columns.
    """
    # Single pass over the file: register each word and record its id right away.
    word_ids = []
    with open(path, 'r', encoding = 'utf-8') as f:
      for line in f:
        for word in line.split():
          self.dictionary.add_word(word)
          word_ids.append(self.dictionary.word_to_idx[word])

    # Build the tensor from the collected ids, so no element is ever left uninitialised.
    idx_tensor = torch.tensor(word_ids, dtype = torch.long)

    num_batches = idx_tensor.shape[0] // batch_size # We use // to ignore the remainder when we divide.
    idx_tensor = idx_tensor[:num_batches * batch_size] # Trim the tensor to a multiple of the batch size.

    # The explicit column count (instead of -1) keeps the reshape valid for an empty tensor.
    return idx_tensor.view(batch_size, num_batches)

In [5]:
# Build the vocabulary and the batched word-id tensor from the poem file.
data = WordProcess()
idx_tensor = data.get_data(path = file_path, batch_size = words_batch_size)
print(idx_tensor.shape)

torch.Size([20, 88])


In [6]:
# The vocabulary size is the number of distinct words registered while reading the file.
vocab_size = len(data.dictionary)
print("We have {} unique words.".format(vocab_size))

We have 982 unique words.


In [7]:
num_batches = idx_tensor.shape[1] // time_steps # The number of full windows of time_steps words per row (88 // 16 = 5 for the tensor shape printed above).
print (num_batches)

5


In [8]:
# Define class that will generate the new text.
class GenerateText(nn.Module):
  """Word-level language model: embedding -> stacked LSTM -> linear projection onto the vocabulary."""

  def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
    super().__init__()
    # Maps every word id to a vector of embed_size features.
    self.embed = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embed_size)
    # Batch-first LSTM, so the tensors it consumes and produces have the batch as their first dimension.
    self.lstm = nn.LSTM(input_size = embed_size, hidden_size = hidden_size,
                        num_layers = num_layers, batch_first = True)
    # Projects every LSTM output vector onto one score per vocabulary word.
    self.linear = nn.Linear(in_features = hidden_size, out_features = vocab_size)

  def forward(self, x, h):
    """Run word ids through the network.

    Args:
      x: Tensor of word ids with the batch as first dimension.
      h: Tuple with the initial hidden and cell states handed to the LSTM.

    Returns:
      A pair (scores, (h, c)): `scores` has one row per (batch, time step) position and
      one column per vocabulary word; (h, c) are the states returned by the LSTM.
    """
    embedded = self.embed(x)
    lstm_out, (h, c) = self.lstm(embedded, h)

    # Merge the batch and time dimensions so the linear layer scores every position at once.
    rows = lstm_out.size(0) * lstm_out.size(1)
    flat = lstm_out.reshape(rows, lstm_out.size(2))
    return self.linear(flat), (h, c)

In [9]:
# Model, loss function and optimization algorithm used by the training loop.
model = GenerateText(vocab_size = vocab_size, embed_size = embed_size,
                     hidden_size = hidden_size, num_layers = num_layers)
loss_func = nn.CrossEntropyLoss() # Cross-entropy between the predicted scores and the target word ids.
optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)

In [10]:
# Train the model with mini-batch gradient descent, one window of time_steps words at a time.
for epoch in range(num_epochs):
  # Zero-filled hidden and cell states, created anew at the start of every epoch.
  states = tuple(torch.zeros(num_layers, words_batch_size, hidden_size) for _ in range(2))

  # Walk over the columns in strides of time_steps.
  for i in range(0, idx_tensor.size(1) - time_steps, time_steps):
    inputs = idx_tensor[:, i:i + time_steps]
    targets = idx_tensor[:, i + 1:i + 1 + time_steps] # The same window moved one word ahead.

    # Forward pass. The states returned by the model are dropped, so every window starts from the zero states above.
    outputs, _ = model(inputs, states)
    loss = loss_func(outputs, targets.reshape(-1))

    # Backward pass and parameter update.
    model.zero_grad()
    loss.backward()
    clip_grad_norm_(model.parameters(), 0.5) # Clip the gradient norm at 0.5 to avoid the exploding gradient problem.
    optimizer.step()

    # Report the loss when the window counter is a multiple of 100, which includes the first window of every epoch.
    step = (i + 1) // time_steps
    if step % 100 == 0:
      print('Epoch: {}/{}, Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

Epoch: 1/20, Loss: 6.8896
Epoch: 2/20, Loss: 6.4778
Epoch: 3/20, Loss: 6.3274
Epoch: 4/20, Loss: 5.8894
Epoch: 5/20, Loss: 5.0546
Epoch: 6/20, Loss: 4.2351
Epoch: 7/20, Loss: 3.4818
Epoch: 8/20, Loss: 2.6834
Epoch: 9/20, Loss: 2.0570
Epoch: 10/20, Loss: 1.6266
Epoch: 11/20, Loss: 1.3094
Epoch: 12/20, Loss: 1.0690
Epoch: 13/20, Loss: 0.9256
Epoch: 14/20, Loss: 0.6607
Epoch: 15/20, Loss: 0.5286
Epoch: 16/20, Loss: 0.4334
Epoch: 17/20, Loss: 0.3063
Epoch: 18/20, Loss: 0.2202
Epoch: 19/20, Loss: 0.1016
Epoch: 20/20, Loss: 0.0590


In [11]:
# Test the model: generate num_words new words, one word at a time.
words_list = []

with torch.no_grad():
  # Initialize the cell and hidden states for a single sequence (batch size 1).
  state = (torch.zeros(num_layers, 1, hidden_size),
           torch.zeros(num_layers, 1, hidden_size))

  # Randomly select one word id and convert it to shape (1, 1).
  # The tensor is called `input_ids` so that the builtin `input` is not shadowed.
  input_ids = torch.randint(0, vocab_size, (1,)).long().unsqueeze(1)

  for i in range(num_words):
    # Keep the states returned by the model and feed them back in, so every prediction
    # is conditioned on all the words generated so far and not only on the previous one.
    output, state = model(input_ids, state)

    # Turn the scores into probabilities. softmax yields the same distribution as
    # normalising exp(scores), but cannot overflow to inf when a score is large.
    probs = torch.softmax(output, dim = 1)
    word_id = torch.multinomial(probs, num_samples = 1).item() # Sample a word id from the distribution.
    input_ids.fill_(word_id) # Replace the input with the sampled word id for the next time step.

    word = data.dictionary.idx_to_word[word_id]
    word = word + ' '

    words_list.append(word) # Append the new word (with its trailing space) to the list of words.

In [12]:
# Define function to create the usual poem format (4 words x 4 lines).
def print_new_words(w_list):
  """Print the words of `w_list` four per line, with an empty row after every fourth line.

  Each word is followed by a single space and nothing is printed after the last word.
  """
  for position, token in enumerate(w_list):
    starts_new_line = position > 0 and position % 4 == 0
    if starts_new_line:
      # One line break between lines, two between groups of four lines.
      separator = '\n\n' if position % 16 == 0 else '\n'
      print (separator, end = '')

    print (token + ' ', end = '')

In [13]:
# Print the generated words, four per line with an empty row after every fourth line.
print_new_words(words_list)

Un  Pe  marginea  pas  
-  cei  casă  Lucesc  
paşte,  nu-nţelegi  el  asculta  
tău  tâmple,  faţa,  Pe-a  

spre  a  se  culce  
împărăteşti,  să  dau  tu  
izvor  de  săptămâni,  îl  
Uimirea-n  senine.  vom  E  

S-a  curmă,  cercuri  lung  
de  moarte.  tău  pânditor  
în  gând,  pierde  dragu-unei  
Cătălina.  Mai  mândră  păcat,  

iarăşi  se  glas  ochii  
mea  de-o  bălaie.  S-a  
Ca  te  iubesc  cade  
în  mare;  Şi  mumă-mea  

el  sânul  se-ncheagă;  ochii  
linişte  nici  Să  Mă  
să  fii  pus  copilaş  
În  mări  În  ca  

de  moarte.  mari  durere,  
înger  se  ştii,  şi-n  
bine  ce  Uimirea-n  Şi-ncetişor  
Dorinţele-i  sărutare,  ocean  Corăbii  

doar  Un  soare  Cătălin?  
luminii  cer  orişicare  cad,  
Ca  S-anin  pripas,  se  
pleacă-n  orişicare  calea  -  

Tu-mi  lăudat  apară.  cu  
el  fără  giulgi  durează-n  
vecinicii,  Şi  din  stele.  
asculta  Nu  geana  se  

apucă:  Hyperion,  moare,  Un  
soare  Când  el  mai  
Luceşte  trestii.  Un  chip  
vorbă  f