In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
from torch.nn.utils import clip_grad_norm_

In [2]:
# LSTM (Long Short-Term Memory) is a type of recurrent neural network (RNN).

embed_size = 128 # Size of each word embedding vector; also the number of input features per time step of the LSTM.
hidden_size = 1024 # Number of features in the LSTM hidden state.
num_layers = 2 # Number of stacked LSTM layers.
num_epochs = 18 # Number of epochs, i.e. complete passes through the dataset.
words_batch_size = 20 # Number of rows the word-index tensor is reshaped into (the batch dimension).
time_steps = 16 # Number of consecutive words per row that are fed to the model in one training step.
learning_rate = 0.002 # Learning rate passed to the Adam optimizer.
num_words = 160 # Number of new words we want to generate.

file_path = 'luceafarul.txt' # Text file that is read to build the vocabulary and the training tensor.

In [3]:
# Vocabulary container: assigns a stable integer id to every distinct word.
class WordDictionary(object):
  """Two-way mapping between words and consecutive integer indices.

  Attributes:
    word_to_idx: dict mapping each known word to its index.
    idx_to_word: dict mapping each index back to its word.
    idx: the index that will be given to the next new word.
  """

  def __init__(self):
    self.word_to_idx = {}
    self.idx_to_word = {}
    self.idx = 0

  def add_word(self, word):
    """Register `word` under the next free index; known words are ignored."""
    if word in self.word_to_idx:
      return

    new_index = self.idx
    self.word_to_idx[word] = new_index # Forward lookup: word -> index.
    self.idx_to_word[new_index] = word # Reverse lookup: index -> word.
    self.idx = new_index + 1

  def __len__(self):
    """Return the number of distinct words registered so far."""
    return len(self.word_to_idx)

In [4]:
# Define class to process the words from file.
class WordProcess(object):
  """Builds a vocabulary from a text file and encodes the text as word indices.

  Attributes:
    dictionary: the WordDictionary that is filled while the file is parsed.
  """

  def __init__(self):
    self.dictionary = WordDictionary()

  def get_data(self, path, batch_size, encoding = 'utf-8'):
    """Parse the file at `path` and return its words as a 2D index tensor.

    Every whitespace-separated token is treated as a word (punctuation is not
    stripped). Each word is added to `self.dictionary` and the whole text is
    encoded as the sequence of the words' indices.

    Args:
      path: path of the text file to read.
      batch_size: number of rows of the returned tensor.
      encoding: text encoding used to decode the file. Defaults to UTF-8 so
        that the result does not depend on the platform's default encoding.

    Returns:
      A torch.long tensor of shape (batch_size, number_of_words // batch_size).
      Trailing words that do not fill a complete column are dropped.

    Raises:
      ValueError: if the file contains fewer words than `batch_size`, in which
        case not even a single column could be formed.
    """
    indices = []

    # A single pass is enough: a word is registered (if new) and encoded right away.
    with open(path, 'r', encoding = encoding) as f:
      for line in f:
        for word in line.split():
          self.dictionary.add_word(word)
          indices.append(self.dictionary.word_to_idx[word])

    num_batches = len(indices) // batch_size # We use // to ignore the remainder when we divide.

    if num_batches == 0:
      raise ValueError(
        "'{}' contains {} words, which is fewer than the batch size of {}.".format(
          path, len(indices), batch_size))

    # Trim the words that do not fit the batch size, then build the tensor from the actual data.
    idx_tensor = torch.tensor(indices[:num_batches * batch_size], dtype = torch.long)
    return idx_tensor.view(batch_size, -1) # One row per batch entry.

In [5]:
# Build the vocabulary and encode the whole file as a tensor of word indices.
data = WordProcess()
idx_tensor = data.get_data(file_path, words_batch_size) # Load the file content into a tensor with words_batch_size rows.
print (idx_tensor.shape)

torch.Size([20, 88])


In [6]:
# The vocabulary size is the number of distinct whitespace-separated tokens found in the file.
vocab_size = len(data.dictionary)
print ("We have {} unique words.".format(vocab_size))

We have 982 unique words.


In [7]:
num_batches = idx_tensor.shape[1] // time_steps # Number of complete time_steps-wide windows that fit in each row (columns // time_steps).
print (num_batches)

5


In [8]:
# Word-level language model: embedding -> stacked LSTM -> linear projection onto the vocabulary.
class GenerateText(nn.Module):
  """Predicts a score for every vocabulary word at every position of a sequence.

  Args:
    vocab_size: number of distinct words (embedding rows and output scores).
    embed_size: length of each word embedding vector (LSTM input features).
    hidden_size: number of features in the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
  """

  def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_size) # Word index -> feature vector.
    self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first = True) # Batch is the first input dimension.
    self.linear = nn.Linear(hidden_size, vocab_size) # Hidden features -> one score per word.

  def forward(self, x, h):
    """Run the network on a batch of word-index sequences.

    Args:
      x: tensor of word indices; the reshape below treats the LSTM output as
        (batch, sequence, hidden_size).
      h: tuple (hidden state, cell state) handed to the LSTM as initial state.

    Returns:
      A tuple (scores, (h_n, c_n)) where `scores` has one row per
      (batch entry, position) pair and one column per vocabulary word, and
      (h_n, c_n) are the states returned by the LSTM.
    """
    embedded = self.embed(x)
    sequence_out, (h_n, c_n) = self.lstm(embedded, h)

    # Merge the batch and sequence dimensions so every position becomes one row.
    batch_dim, seq_dim, feature_dim = sequence_out.shape
    flat_out = sequence_out.reshape(batch_dim * seq_dim, feature_dim)

    scores = self.linear(flat_out)
    return scores, (h_n, c_n)

In [9]:
# Instantiate the network together with its training loss and optimizer.
model = GenerateText(
  vocab_size = vocab_size,
  embed_size = embed_size,
  hidden_size = hidden_size,
  num_layers = num_layers,
)
loss_func = nn.CrossEntropyLoss() # Cross-entropy between the predicted word scores and the target word indices.
optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate) # Adam updates all model parameters.

In [10]:
# Train the model using mini-batch gradient descent with truncated back-propagation through time.
for epoch in range(num_epochs):
  # Initialize the cell and hidden states with zeros at the start of every epoch.
  states = (torch.zeros(num_layers, words_batch_size, hidden_size),
            torch.zeros(num_layers, words_batch_size, hidden_size))

  # Iterate over the training dataset in consecutive windows of time_steps words.
  for i in range(0, idx_tensor.size(1) - time_steps, time_steps):
    inputs = idx_tensor[:, i:i + time_steps]
    targets = idx_tensor[:, (i + 1):(i + 1) + time_steps] # The targets are the inputs shifted by one word.

    # Every row of idx_tensor is a contiguous stretch of text, so the states reached at the
    # end of one window are the right starting point for the next one. Detach them first so
    # that back-propagation stops at the window boundary instead of reaching into graphs of
    # earlier windows that have already been freed.
    states = tuple(state.detach() for state in states)
    outputs, states = model(inputs, states) # Feed the inputs and the cell & hidden states into the RNN model.
    loss = loss_func(outputs, targets.reshape(-1)) # Calculate the loss between the predicted outputs and targets.

    # Perform back-propagation and weight updates.
    model.zero_grad()
    loss.backward()

    clip_grad_norm_(model.parameters(), 0.5) # Clip the gradient to avoid the exploding gradient problem.
    optimizer.step() # Update the model parameters, using the step method.

    # i is a multiple of time_steps, so step is the 0-based index of the current window.
    step = (i + 1) // time_steps

    # Log the loss of every 100th window (always including the first window of an epoch).
    if step % 100 == 0:
      print ('Epoch: {}/{}, Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

Epoch: 1/18, Loss: 6.8906
Epoch: 2/18, Loss: 6.3861
Epoch: 3/18, Loss: 6.1676
Epoch: 4/18, Loss: 5.9043
Epoch: 5/18, Loss: 5.1746
Epoch: 6/18, Loss: 4.1960
Epoch: 7/18, Loss: 3.1624
Epoch: 8/18, Loss: 2.4553
Epoch: 9/18, Loss: 1.8431
Epoch: 10/18, Loss: 1.4414
Epoch: 11/18, Loss: 1.1963
Epoch: 12/18, Loss: 1.0304
Epoch: 13/18, Loss: 0.7954
Epoch: 14/18, Loss: 0.7101
Epoch: 15/18, Loss: 0.5318
Epoch: 16/18, Loss: 0.3366
Epoch: 17/18, Loss: 0.2277
Epoch: 18/18, Loss: 0.1231


In [11]:
# Test the model: generate num_words new words, one word at a time.
words_list = []

with torch.no_grad():
  # Initialize the cell and hidden states for a batch containing a single sequence.
  state = (torch.zeros(num_layers, 1, hidden_size),
           torch.zeros(num_layers, 1, hidden_size))

  # Randomly select one word id and convert it to shape (1, 1).
  # (Named input_id rather than input, so that the builtin input() is not shadowed.)
  input_id = torch.randint(0, vocab_size, (1,)).long().unsqueeze(1)

  for _ in range(num_words):
    # Keep the returned states: they carry the context of all the words generated so far.
    # Discarding them would make every word depend on the previous word only.
    output, state = model(input_id, state)

    # Turn the scores into a probability distribution. softmax is used instead of a plain
    # exp() because it cannot overflow for large scores.
    probabilities = torch.softmax(output, dim = 1)
    word_id = torch.multinomial(probabilities, num_samples = 1).item() # Sample a word id.
    input_id.fill_(word_id) # Replace the input with sampled word id for the next time step.

    word = data.dictionary.idx_to_word[word_id] # Capture the word associated with the id.
    words_list.append(word) # Append the new word to the list of words.

In [12]:
# Define function to create the usual poem format (4 words x 4 lines by default).
def print_new_words(w_list, words_per_line = 4, lines_per_stanza = 4):
  """Print the words laid out as the lines and stanzas of a poem.

  Every word is followed by a single space. A line break is printed before
  each word that starts a new line, and one more before each word that starts
  a new stanza (which leaves an empty row between stanzas). No line break is
  printed after the last word.

  Args:
    w_list: iterable with the words (strings) to print.
    words_per_line: number of words on each line.
    lines_per_stanza: number of lines in each stanza.

  Raises:
    ValueError: if words_per_line or lines_per_stanza is smaller than 1.
  """
  if words_per_line < 1 or lines_per_stanza < 1:
    raise ValueError('words_per_line and lines_per_stanza must be positive integers.')

  words_per_stanza = words_per_line * lines_per_stanza

  for i, w in enumerate(w_list):
    if i % words_per_line == 0 and i != 0:
      print('\n', end = '') # Add a new line every words_per_line words.
      if i % words_per_stanza == 0:
        print ('\n', end = '') # Add an empty row, every lines_per_stanza lines.

    print (w + ' ', end = '')

In [13]:
# Print the generated words in poem layout.
print_new_words(words_list)

îmi mă paşte, - 
Tu-mi o acu-i vrei, 
legat, Ci păr Dar 
ochii îmbătată Dorinţele-i vom 

trecură încrede: mă pas 
cu el în mare; 
o umerele văi trupu-i 
ochii ochii mari Căci 

În de-o pat să-mi 
Pe marginea sus, a 
S-a timp, de luceferi. 
Vrei ce-mi ceri, zare 

nu-i Răsai s-or luceafăr 
rază, linişte capul vin'! 
mână şi-o şi-n Un 
Pe celei Marmoreele îmi 

mai curmă, vom copil 
Şi-i calea recile-i oglindă, 
pas albă aş copilaş 
fugi unde-ajunge mort, mor 

nici vom coate-şi florile-argintii 
Ce-ţi tine oare, vin, 
chip Dar ochii din 
cer a se naşte. 

cununi luceafăr, chip de 
Şedeau sus, calea idealuri 
se Şi mumă-mea vecinicului 
străvezii E ca scânteie-n 

paşte, Dar se culce 
Iubito, Fii sau prins 
ziua Cum potrivi să-mi 
fii dus. stele. pe-o 

Norocu-mi venii Pe creştetele-a 
un fulger - Tu-mi 
duce... luceafărul de luceferi. 
dedesubt, lasă, Şi-i nu-l 

e inim-o O, vin'! 
mânia, acu-i mormânt, dulce-al 
sus Uimirea-n senine. alunge, 
fii lângă Cere-mi senine. 