<a href="https://colab.research.google.com/github/CurtesMalteser/text-generator-rnn/blob/master/text-generator-rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

### **Load Data**

In [3]:
with open('drive/My Drive/anna.txt', 'r') as f:
    text = f.read()

text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

### **Tokenization**

Here the chars will be converted to and from integers.


In [0]:
# we create two dictionaries:
# variable names are self explanatory
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [5]:
# print encoded chars
encoded[:100]

array([15, 80, 30, 65, 17, 43,  1, 39, 21, 81, 81, 81, 35, 30, 65, 65, 68,
       39, 76, 30,  4,  5, 37,  5, 43, 42, 39, 30,  1, 43, 39, 30, 37, 37,
       39, 30, 37,  5, 40, 43, 26, 39, 43, 31, 43,  1, 68, 39,  7, 19, 80,
       30, 65, 65, 68, 39, 76, 30,  4,  5, 37, 68, 39,  5, 42, 39,  7, 19,
       80, 30, 65, 65, 68, 39,  5, 19, 39,  5, 17, 42, 39, 57, 44, 19, 81,
       44, 30, 68, 14, 81, 81, 45, 31, 43,  1, 68, 17, 80,  5, 19])

### **Pre-Precessing the data**
The LSTM expects as input a char converted into int, and the will be converted into one column vector where only the correspending index will have value one and remaing will be 0. This is the **one-hot encoded**.

In [0]:
def one_hot_encoded(arr, n_labels):

  # Initialize the encoded array
  one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)

  # Fill the appropriate elements with ones
  one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.

  # Finally reshape it to get back to the original array
  one_hot = one_hot.reshape((*arr.shape, n_labels))

  return one_hot

In [7]:
# check that one_hot_encoded works as expected
test_seq = np.array([(3, 5, 1)])
one_hot = one_hot_encoded(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


### **Make mini-batches**

In [0]:
def get_batches(arr, batch_size, seq_length):
  ''' Create a generator tat returns batches of size: batch_size*seq_length
  from arr.

  Arguments
  ---------
  arr: Array to generate batches from
  batch_size: The number of sequences per batch
  seq_length: Number of encoded chars in a sequence
  '''

  batch_size_total = batch_size * seq_length

  # total number of batches we can make
  n_batches = len(arr)//batch_size_total

  # Keep only enough chars. to make full batches
  arr = arr[:n_batches * batch_size_total]

  # Reshape into batch_size_rows
  arr = arr.reshape((batch_size, -1))

  # iterate through the array, one sequence at a time
  for n in range(0, arr.shape[1], seq_length):
    # The features
    x = arr[:, n:n + seq_length]

    # The targets, shifted by one
    y = np.zeros_like(x)
    try:
      y[:, :-1], y[:, -1], = x[:, 1:], arr[:, n + seq_length]
    except:
      y[:, :-1], y[:, -1], = x[:, 1:], arr[:, 0]
    yield x, y

### **Test Implementation**


*   Batch Size: 8
*   Sequence Steps: 50



In [0]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [10]:
# print first 10 times in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[15 80 30 65 17 43  1 39 21 81]
 [42 57 19 39 17 80 30 17 39 30]
 [43 19 82 39 57  1 39 30 39 76]
 [42 39 17 80 43 39 36 80  5 43]
 [39 42 30 44 39 80 43  1 39 17]
 [36  7 42 42  5 57 19 39 30 19]
 [39 77 19 19 30 39 80 30 82 39]
 [13 16 37 57 19 42 40 68 14 39]]

y
 [[80 30 65 17 43  1 39 21 81 81]
 [57 19 39 17 80 30 17 39 30 17]
 [19 82 39 57  1 39 30 39 76 57]
 [39 17 80 43 39 36 80  5 43 76]
 [42 30 44 39 80 43  1 39 17 43]
 [ 7 42 42  5 57 19 39 30 19 82]
 [77 19 19 30 39 80 30 82 39 42]
 [16 37 57 19 42 40 68 14 39 38]]


### **Define the Model**

In [11]:
# Check if the GPU is available
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print('Training on GPU!')
else:
    print('No GPU available! Training on CPU!')

Training on GPU!


In [0]:
class TextGenRNN(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=1e-3):
        super().__init__()
        self.n_hidden = n_hidden
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.drop_prob = drop_prob
        self.lr = lr

        # Creating chars. dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        # Length of one hot encoded => len(self.chars)
        self.lstm = nn.LSTM(len(self.chars),
                            hidden_size=self.n_hidden,
                            num_layers=n_layers,
                            dropout=drop_prob,
                            batch_first=True)

        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(in_features=self.n_hidden,
                            out_features=len(self.chars))

    def forward(self, x, hidden):
        ''''Forward pass through the network.
       These inputs are x, and the hidden/cell state "hidden".
       '''

        # Get outputs and the new hidden state from the LSTM
        r_output, hidden = self.lstm(x, hidden)

        # Pass through dropout layer
        out = self.dropout(r_output)

        # stack up LSTM outputs using view
        out = out.contiguous().view(-1, self.n_hidden)

        return out, hidden

    def init_hidden(self, batch_size):
        ''''Initialize hidden state.'''
        # Create twon new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero for hidden stater and cell state of LSTM
        weight = next(self.parameters()).data

        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())

        return hidden


### **Train**

In [0]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=1e-3, clip=5, val_frac=0.1, print_every=10):
  ''' Training a Network

      Args
      ----
      net: TextGenRNN network
      data: Text data to train the network
      epochs: Number of epochs to train
      batch_size: Number of mini-sequences per mini-batch, aka batch size
      seq_length: Number of character steps per mini-batch
      lr: learning rate
      clip: Gradient clipping
      val_frac: Fraction of data to hold for validation
  '''
  opt = torch.optim.Adam(net.parameters(), lr=lr)
  criterion = nn.CrossEntropyLoss()
    
  # Create training and validation data
  val_idx = int(len(data)*(1-val_frac))
  data, val_data = data[:val_idx], data[val_idx:]
    
  if(train_on_gpu):
      net.cuda()
    
  counter = 0
  n_chars = len(net.chars)
  for e in range(epochs):
      # Initialize hidden state
      h = net.init_hidden(batch_size)
        
      for x, y in get_batches(data, batch_size, seq_length):
          counter += 1
            
          # One-hot encode our data and make them Torch tensors
          x = one_hot_encoded(x, n_chars)
          inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
          if(train_on_gpu):
              inputs, targets = inputs.cuda(), targets.cuda()

          # Creating new variables for the hidden state, otherwise
          # we'd backprop through the entire training history
          h = tuple([each.data for each in h])

          # zero accumulated gradients
          net.zero_grad()
            
          # get the output from the model
          output, h = net(inputs, h)
            
          # calculate the loss and perform backprop
          loss = criterion(output, targets.view(batch_size*seq_length).long())
          loss.backward()
          # "clip_grad_norm" helps prevent the exploding gradient problem in RNNs / LSTMs.
          nn.utils.clip_grad_norm_(net.parameters(), clip)
          opt.step()
            
          # loss stats
          if counter % print_every == 0:
              # Get validation loss
              val_h = net.init_hidden(batch_size)
              val_losses = []
              net.eval()
              for x, y in get_batches(val_data, batch_size, seq_length):
                  # One-hot encode our data and make them Torch tensors
                  x = one_hot_encoded(x, n_chars)
                  x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                  # Creating new variables for the hidden state, otherwise
                  # we'd backprop through the entire training history
                  val_h = tuple([each.data for each in val_h])
                    
                  inputs, targets = x, y
                  if(train_on_gpu):
                      inputs, targets = inputs.cuda(), targets.cuda()

                  output, val_h = net(inputs, val_h)
                  val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                
                  val_losses.append(val_loss.item())
                
              net.train() # reset to train mode after iterationg through validation data
                
              print("Epoch: {}/{}...".format(e+1, epochs),
                    "Step: {}...".format(counter),
                    "Loss: {:.4f}...".format(loss.item()),
                    "Val Loss: {:.4f}".format(np.mean(val_losses)))

### **Intantiating the Model**


In [14]:
# Define and print the net
n_hidden = 512
n_layers = 3

net = TextGenRNN(chars, n_hidden, n_layers)
print(net)

TextGenRNN(
  (lstm): LSTM(83, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [0]:
batch_size = 128
seq_length = 100
n_epochs = 50

train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

Epoch: 1/50... Step: 10... Loss: 5.2035... Val Loss: 4.9594
Epoch: 1/50... Step: 20... Loss: 5.1304... Val Loss: 4.8510
Epoch: 1/50... Step: 30... Loss: 5.0925... Val Loss: 4.8122
Epoch: 1/50... Step: 40... Loss: 5.0920... Val Loss: 4.8048
Epoch: 1/50... Step: 50... Loss: 5.0927... Val Loss: 4.7982
