# In this notebook, I construct novel sentences using LSTM network


In [1]:
import numpy as np
from torch import nn
import torch.nn.functional as F
import torch

### loading the data:

In [2]:
with open('data/anna.txt') as file:
    text = file.read()
    
text[0:50]

'Chapter 1\n\n\nHappy families are all alike; every un'

### define encoding function to encode words to numbers:

In [3]:
def encode(text):
    chars = tuple(set(text))
    int2char = dict(enumerate(chars))
    char2int = {char: i for i, char in int2char.items()}
    encoded = np.array([char2int[char] for char in text])  # [95, 6, 13, 95, ...] corresponding to text
    return encoded, char2int

In [6]:
encoded, char2int = encode(text)
encoded[:50]

array([19, 72, 20,  0,  1, 59, 28, 58, 46, 66, 66, 66, 71, 20,  0,  0, 70,
       58, 15, 20,  2, 54, 33, 54, 59, 26, 58, 20, 28, 59, 58, 20, 33, 33,
       58, 20, 33, 54, 64, 59,  8, 58, 59, 78, 59, 28, 70, 58, 36, 40])

### one-hot encoding of the data:

In [7]:
def one_hot_encode(arr, n_labels):    #n_labels is the number of unique words (vocabulary), arr is the text
    
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    print("n_labels = {0},    first dim = {1}".format(n_labels, np.multiply(*arr.shape)))

    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.    #this creates the on-hot encoded vector
    
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

### getting batches of the data:

In [8]:
def get_batches(data, n_seq, n_steps):
    batch_size = n_seq * n_steps
    n_batches = len(data) // batch_size
    data = data[:n_batches*batch_size]          # drop some data to get only full batches
    data = data.reshape((n_seq, -1))        
    
    for i in range(0, data.shape[1], n_steps):  # iterate on the columns to get the batches
        x = data[:, i:i+n_steps]
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], data[:, i+n_steps]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], data[:, 0]
        yield x, y                              # x, y are generators that you can use next() on

In [9]:
batches = get_batches(encoded, 10, 50)
x, y = next(batches)
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[19 72 20  0  1 59 28 58 46 66]
 [58 20  2 58 40 48  1 58 76 48]
 [78 54 40 27 66 66 60 13 59 26]
 [40 58 17 36 28 54 40 76 58 72]
 [58 54  1 58 54 26 81 58 26 54]
 [58 51  1 58 41 20 26 66 48 40]
 [72 59 40 58 16 48  2 59 58 15]
 [ 8 58 35 36  1 58 40 48 41 58]
 [ 1 58 54 26 40 55  1 27 58  7]
 [58 26 20 54 17 58  1 48 58 72]]

y
 [[72 20  0  1 59 28 58 46 66 66]
 [20  2 58 40 48  1 58 76 48 54]
 [54 40 27 66 66 60 13 59 26 81]
 [58 17 36 28 54 40 76 58 72 54]
 [54  1 58 54 26 81 58 26 54 28]
 [51  1 58 41 20 26 66 48 40 33]
 [59 40 58 16 48  2 59 58 15 48]
 [58 35 36  1 58 40 48 41 58 26]
 [58 54 26 40 55  1 27 58  7 72]
 [26 20 54 17 58  1 48 58 72 59]]


## defining the network:

In [55]:
class CharRNN(nn.Module):
    
    def __init__(self, text, n_steps=100, n_hidden=256, n_layers=2, drop_prop=0.5, lr=0.001):
        # n_steps: number of elements in each sequence in each bartch
        # n_hidden: number of output elements of the intermediate layers
        # n_layers: number of LSTM layers to use
        
        super().__init__()
        self.drop_prop = drop_prop
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_steps = n_steps
        self.lr = lr
        
        slef.text = text
        self.int2char = dict(enumerate(self.text))
        self.char2int = {char: i for i, char in self.int2char}
        
        self.lstm = nn.LSTM(len(self.text), n_hidden, n_layers, dropout=drop_prop, batch_first=True)
        self.dropout = nn.Dropout(drop_prop)
        self.fc = nn.Linear(n_hidden, len(self.text))
        
        self.init_weights()
        
        
        
    def forward(self, x, hc):
        x, (h, c) = self.lstm(x, hc)
        x = self.dropout(x)
        x = self.fc(x)
        
        return x, (h, c)
    
    def predict(self, x, h=None, cuda=False):
        if cuda:
            self.cuda()
        else:
            self.cpu()
            
        x = np.array([[self.char2int[x]]])
        x = one_hot_encode(x, len(self.char2int))
        x = torch.from_numpy(x)
        
        if h == None:
            h = init_hidden(1)
            
        h = tuple([each.data for each in h])
        
        if cuda:
            x = x.cuda()
            
        out, h = self.forward(x, h)
        out = F.softmax(out)
        
        out = out.numpy().squeeze()
        
        return self.int2char[]
    
    def init_weights(self):
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-1, 1)
        
        
    def init_hidden(self, n_seqs):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        return (weight.new(self.n_layers, n_seqs, self.n_hidden).zero_(),
                weight.new(self.n_layers, n_seqs, self.n_hidden).zero_())
        