## Download Data and Inspect Them

In [1]:
with open('./data/anna.txt', 'r') as f:
    text = f.read()

In [2]:
print('len(text) ={:,}'.format(len(text)))
print('\n')
print('text[0:100]=\n{}'.format(text[0:50]))

len(text) =1,985,223


text[0:100]=
Chapter 1


Happy families are all alike; every un


## Define Text Encoder and Encode Text

In [3]:
chars = tuple(set(text))
idx2word = dict(enumerate(chars))
word2idx = {word: idx for idx, word in idx2word.items()}

In [4]:
print('len(word2idx)={}'.format(len(word2idx)))

len(word2idx)=83


In [5]:
encoded = [word2idx[w] for w in text]

print('word2idx={}'.format(word2idx))
print('text[0:50]=\n{}'.format(text[0:50]))
print('\n')
print('encoded[0:50]=\n{}'.format(encoded[0:50]))

word2idx={',': 0, 'l': 1, 'c': 2, 'w': 3, 'O': 4, 'x': 5, '(': 6, 'G': 7, '"': 8, 'd': 9, 'v': 10, ':': 11, 'A': 12, 'B': 13, 'j': 14, 's': 15, 'F': 16, 'Y': 17, 'Q': 18, 'e': 19, 'o': 20, '`': 21, '/': 22, 'X': 23, 'u': 24, 'U': 25, '.': 26, '\n': 27, 'M': 28, '8': 29, '5': 30, 'm': 31, ')': 32, 'z': 33, '0': 34, '*': 35, 'C': 36, 'h': 37, 'y': 38, '$': 39, '!': 40, '9': 41, '&': 42, 'g': 43, 'L': 44, 'H': 45, ' ': 46, '@': 47, 'S': 48, '?': 49, 'J': 50, 'k': 51, '3': 52, '7': 53, 'i': 54, 't': 55, 'q': 56, 'N': 57, 'r': 58, 'a': 59, "'": 60, 'D': 61, 'I': 62, '2': 63, 'V': 64, 'p': 65, '-': 66, ';': 67, 'n': 68, '_': 69, 'f': 70, '4': 71, '6': 72, 'W': 73, 'R': 74, 'T': 75, 'P': 76, 'b': 77, 'Z': 78, '%': 79, 'E': 80, 'K': 81, '1': 82}
text[0:50]=
Chapter 1


Happy families are all alike; every un


encoded[0:50]=
[36, 37, 59, 65, 55, 19, 58, 46, 82, 27, 27, 27, 45, 59, 65, 65, 38, 46, 70, 59, 31, 54, 1, 54, 19, 15, 46, 59, 58, 19, 46, 59, 1, 1, 46, 59, 1, 54, 51, 19, 67, 46, 19, 10,

## Define Iterator

In [6]:
import numpy as np

def get_batches(text, batch_size, seq_length):
    n_seq = len(text) // (batch_size * seq_length)
    text = np.array(text[:batch_size * seq_length * n_seq])
    text = text.reshape((batch_size, -1))
    
    for i in range(0, text.shape[1], seq_length):
        
        x = text[:, i:i+seq_length]
        y = np.zeros_like(x)

        try:
            y[:, :-1], y[:, -1] = x[:, 1:], text[:, i+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], text[:, 0]
#             print('IndexError: i={}'.format(i))

        yield x, y

### Inspect Batch Result

In [7]:
for index, (x, y) in enumerate(get_batches(encoded, 5, 20)):
    if index == 3:
        break
        
    print('Batch {}:'.format(index))
        
    for i_seq, (x_seq, y_seq) in enumerate(zip(x, y)):
        
        # Don't show newline because of readability
        if word2idx['\n'] in x_seq:
            continue 
            
        if i_seq == 3:
            break
            
        print('\tseq {}'.format(i_seq))
        print('\t\tx:\t{}'.format(x_seq))
        print('\t\ty:\t   {}'.format(y_seq))
        
        x_word = [idx2word[idx] for idx in x_seq]
        y_word = [idx2word[idx] for idx in y_seq]
        print('\t\tx_word:\t\'{}\''.format(''.join(x_word)))
        print('\t\ty_word:\t \'{}\''.format(''.join(y_word)))
        
        

Batch 0:
	seq 1
		x:	[59 43 68 54 70 54  2 19 68 55 67 46 55 37 54 58 55 38 66 19]
		y:	   [43 68 54 70 54  2 19 68 55 67 46 55 37 54 58 55 38 66 19 54]
		x_word:	'agnificent; thirty-e'
		y_word:	 'gnificent; thirty-ei'
	seq 2
		x:	[19 46 55 37 59 68 51 19  9 46 44 19 10 54 68 46 59 68  9 46]
		y:	   [46 55 37 59 68 51 19  9 46 44 19 10 54 68 46 59 68  9 46  3]
		x_word:	'e thanked Levin and '
		y_word:	 ' thanked Levin and w'
Batch 1:
	seq 0
		x:	[31 54  1 54 19 15 46 59 58 19 46 59  1  1 46 59  1 54 51 19]
		y:	   [54  1 54 19 15 46 59 58 19 46 59  1  1 46 59  1 54 51 19 67]
		x_word:	'milies are all alike'
		y_word:	 'ilies are all alike;'
Batch 2:
	seq 0
		x:	[67 46 19 10 19 58 38 46 24 68 37 59 65 65 38 46 70 59 31 54]
		y:	   [46 19 10 19 58 38 46 24 68 37 59 65 65 38 46 70 59 31 54  1]
		x_word:	'; every unhappy fami'
		y_word:	 ' every unhappy famil'
	seq 1
		x:	[46 15 55 58 59 54 43 37 55 46 59  3 59 38  0 46 59 68  9 46]
		y:	   [15 55 58 59 54 43 37 55 46 59  3 59 38  0 46 5

## Define One-Hot Encoder

In [8]:
def one_hot_encode(arr, n_labels):
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

## Define LSTM Model

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CharRNN(nn.Module):
    def __init__(self, tokens, n_steps=100, n_hidden=256, n_layers=2,
                        drop_prob=0.5, lr=0.001):
        
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.chars = tokens
        self.int2char = dict(enumerate(set(self.chars)))
        self.char2int = {c: i for i, c in self.int2char.items()}
        
        self.lstm = nn.LSTM(input_size=len(self.chars),
                           hidden_size=n_hidden,
                           num_layers=n_layers,
                           dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)

        self.fc = nn.Linear(n_hidden, len(self.chars))
        
        self.init_weights()
        
        
    def forward(self, x, hc):
        
        x, (h, c) = self.lstm(x, hc)
        
        x = self.dropout(x)
        
        # Stack up LSTM outputs using view
        x = x.view(x.size()[0]*x.size()[1], self.n_hidden)
        
        ## TODO: put x through the fully-connected layer
        x = self.fc(x)
        
        return x, (h, c)
    
    
    def predict(self, char, h=None, cuda=False, top_k=None):
        
        if cuda:
            self.cuda()
        else:
            self.cpu()
            
        if h is None:
            h = self.init_hidden(1)
            
        x = np.array([[self.char2int[char]]])
        x = one_hot_encode(x, len(self.chars))
        
        inputs = torch.from_numpy(x)
        
        if cuda:
            inputs = inputs.cuda()
            
        h = tuple([each.data for each in h])
        out, h = self.forward(inputs, h)
        
        p = F.softmax(out, dim=1).data
        
        if cuda:
            p = p.cpu()
            
        if top_k is None:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
            
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
            
        return self.int2char[char], h
        
   
    def init_weights(self):
        initrange = 0.1
        
        self.fc.bias.data.fill_(0)
        
        self.fc.weight.data.uniform_(-1, 1)
        
        
    def init_hidden(self, batch_size):
        
        weight = next(self.parameters()).data
        
        return (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
    


## Define Training Method

In [10]:
import torch.optim as optim

def train(net, data, epochs=10, batch_size=10, seq_length=50, 
          lr=0.001, clip=5, val_frac=0.1, cuda=False, print_every=10):
    
    net.train()
    
    # define optimizer
    opt = optim.Adam(net.parameters(), lr=lr)
    
    # define loss function
    criterion = nn.CrossEntropyLoss()
    
    # create training set and validation set
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if cuda:
        net.cuda()
    
    n_chars = len(net.chars)
    
    # start training
    for e in range(epochs):
        counter = 0
        
        # reset weight
        h = net.init_hidden(batch_size)
        
        # get batch of encoded data
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            x = one_hot_encode(x, n_chars)
            
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            # move tensor from cpu to gpu
            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
                
                
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
                
            net.zero_grad()
    
            output, h = net.forward(inputs, h)
            
            # Backpropagation
            loss = criterion(output, targets.view(batch_size*seq_length))
            
            loss.backward()
            
            nn.utils.clip_grad_norm_(net.parameters(), clip)
                
            opt.step()
            
            if counter % print_every == 0:
                val_h = net.init_hidden(batch_size)
                val_losses = []
                
                for x, y in get_batches(val_data, batch_size, seq_length):
                    
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if cuda:
                        inputs, targets = inputs.cuda(), targets.cuda()
                        
                        
                    output, val_h = net.forward(inputs, val_h)
                    
                    val_loss = criterion(output, targets.view(batch_size*seq_length))
                    
                    val_losses.append(val_loss.item())
                    
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
            inputs = inputs.cuda()


In [11]:
if 'net' in locals():
    del net

In [12]:
net = CharRNN(chars, n_hidden=512, n_layers=2)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [13]:
from datetime import datetime

batch_size, seq_length = 128, 100
epochs = 1

start = datetime.now()

# you may change cuda to True if you plan on using a GPU!
# also, if you do, please INCREASE the epochs to 25
train(net, encoded, epochs=epochs, batch_size=batch_size, seq_length=seq_length, 
      lr=0.001, cuda=True, print_every=10)

end = datetime.now()
print('Execution Time:{}'.format(end - start))
print('Time per Epoch={}'.format((end-start)/epochs))

Epoch: 1/1... Step: 10... Loss: 3.3540... Val Loss: 3.3189
Epoch: 1/1... Step: 20... Loss: 3.1834... Val Loss: 3.2033
Epoch: 1/1... Step: 30... Loss: 3.0794... Val Loss: 3.0674
Epoch: 1/1... Step: 40... Loss: 2.8841... Val Loss: 2.8981
Epoch: 1/1... Step: 50... Loss: 2.7661... Val Loss: 2.7212
Epoch: 1/1... Step: 60... Loss: 2.6040... Val Loss: 2.6159
Epoch: 1/1... Step: 70... Loss: 2.5317... Val Loss: 2.5475
Epoch: 1/1... Step: 80... Loss: 2.4691... Val Loss: 2.5008
Epoch: 1/1... Step: 90... Loss: 2.4442... Val Loss: 2.4586
Epoch: 1/1... Step: 100... Loss: 2.3950... Val Loss: 2.4215
Epoch: 1/1... Step: 110... Loss: 2.3486... Val Loss: 2.3903
Epoch: 1/1... Step: 120... Loss: 2.2833... Val Loss: 2.3604
Epoch: 1/1... Step: 130... Loss: 2.3091... Val Loss: 2.3402
Execution Time:0:00:41.695086
Time per Epoch=0:00:41.695086


## Save Model

In [14]:
model_name = 'lstm_{}_epochs.net'.format(epochs)

checkpoint = {'n_hidden': net.n_hidden,
             'n_layers': net.n_layers,
             'state_dict': net.state_dict(),
             'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

## Sample 

In [15]:
def sample(net, size, prime='The', top_k=None, cuda=False):
        
    if cuda:
        net.cuda()
    else:
        net.cpu()

    net.eval()
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = net.predict(ch, h, cuda=cuda, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = net.predict(chars[-1], h, cuda=cuda, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [16]:
print(sample(net, 2000, prime='Anna', top_k=5, cuda=False))

Anna A_ arahaverad o2lle, a_d aq o4 Cersas aq t_d.aqqor as qo2s t_qere a_d, aq qCts t_e qCer qaq o4 Cer as t4ares a_d.o2sqed qe Car wor dt_qq qCtq Ce so2qCe 4ortqCe CtsCe ald Ce Co2dd qCa_ qo satd Ct_g altd, satde asqeladt_g qCaq o4 Cased qCaq a_d.o4 a_er qooqC qCer qCerered a_d qCers, Cad qCe ald o4 a_d Caqe qoo_gU ."Rell, Ctd so5 qCe wasq Ce satde_, a_de_gqtg qCes a_d al o4.ale a_d qCe -ero_ qCe qCe stqCt_g Cts Cas add a_q Cer ad qo qCer wtsq Ce as qCe qarer qCe..4ore Cas wor qCe.qCe wald sqere 4ro2d qo Ce Cer ow Cts qCe alo_qCer as o2q a_g qCe qCerede a_d a_ wo2ld o4 t_ qCows o2qCed Ce 5aqed qCe alde_d waqere qo cares t_qCe seredesqo_ as tqCe so5aq a_ qoo Cererere Caq o4 qCe qo co Co5ered, saded o_d ove qo Cars qo st_ge, a_d aq t5lt_g aqC qCe -ored qo Ce 5ere qCo2s was qCt_g.a_ ald as ares ove co2rq or qto_g qo qCer.CtqC as a_d qCe Cers qCe Ceradde_q wor t_ sCad asqo_qer qCaq.sore sa_ aq o4 Cerses t4 Cer asd a_d Cever ald Ce qto_ a_d qCtq aqre_ qo Cts qCaq a_ qCe Cere so4ellh sCe qo