In [268]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [269]:
PATH=Path('..')/'data/nietzsche/data/'

In [270]:
# One-off download of the corpus (uncomment on the first run):
#get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}/nietzsche.txt')
# Read through a context manager so the file handle is closed as soon as the text is loaded.
with open(f'{PATH}/nietzsche.txt') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600901


In [271]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [272]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 86


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [273]:
# Reserve index 0 for the padding character. The guard keeps this cell idempotent:
# re-running it must not insert a second "\0" and shift every character index.
if chars[0] != "\0":
    chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

Map from chars to indices and back again

In [274]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [275]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [276]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

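Create a list of every 3rd character (the step is `cs=3`), starting at the 0th, 1st, 2nd, then 3rd characters. The first three lists are the inputs; the fourth is the character to predict.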

In [277]:
# Number of input characters per example
cs=3
# Walk the text in steps of cs; each list holds the character at a fixed offset from the step start.
# Offsets 0..2 become the three inputs, offset 3 becomes the target.
c1_dat = [idx[i]   for i in range(0, len(idx)-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)]

In [278]:
c1_dat[:5]

[40, 30, 29, 1, 40]

In [279]:
np.stack(c1_dat)[:5]

array([40, 30, 29,  1, 40])

Our inputs

In [280]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

Our output

In [281]:
y = np.stack(c4_dat)

The first 4 inputs and outputs

In [282]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [283]:
y[:4]

array([30, 29,  1, 40])

In [284]:
x1.shape, y.shape

((200300,), (200300,))

### Create and train model

Video 126:20

Pick a size for our hidden state

In [285]:
n_hidden = 256

The number of latent factors to create (i.e. the width of the embedding matrix). Here we use roughly half the number of distinct characters.

In [286]:
n_fac = 42

In [287]:
class Char3Model(nn.Module):
    """Predict the next character from three input characters with a hand-unrolled RNN.

    The same three layers are reused for every character: an embedding, an
    input-to-hidden linear layer and a hidden-to-hidden linear layer. The hidden
    width is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of rows in the embedding and width of the output layer.
        n_fac: number of latent factors (embedding width).
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        # Embedding: vocab_size rows, n_fac latent factors per character
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        # (a square matrix)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, c1, c2, c3):
        """Return log-probabilities over the vocabulary for the character after c1, c2, c3."""
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        # Starting from an all-zero hidden state makes the three steps identical,
        # which is what lets the later models replace them with a loop.
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))

        # Normalise explicitly over the vocabulary axis (same as the other models in this notebook)
        return F.log_softmax(self.l_out(h), dim=-1)

In [288]:
#from_arrays(path, val_idxs, xs, y, is_reg=True, bs=64, test_xs=None, shuffle=True)
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [289]:
#standard pytorch model
m = Char3Model(vocab_size, n_fac).cuda()

In [290]:
it = iter(md.trn_dl)
#grab a minibatch
*xs,yt = next(it)
t = m(*V(xs))

In [291]:
t
#512 is minibatch, 86 is the probability of each of the vocab items - vals are log of these

Variable containing:
-4.5703 -4.4720 -4.7361  ...  -4.6183 -4.3230 -4.6636
-4.4538 -4.3228 -4.5502  ...  -4.2488 -4.6725 -4.6845
-4.5853 -4.3747 -4.7382  ...  -4.5644 -4.2104 -4.4266
          ...             ⋱             ...          
-4.5646 -4.4539 -4.7844  ...  -4.4951 -4.6160 -4.6449
-4.6910 -4.3795 -4.5409  ...  -4.5171 -4.5942 -4.6191
-4.5891 -4.3273 -4.6819  ...  -4.5846 -4.5968 -4.5751
[torch.cuda.FloatTensor of size 512x86 (GPU 0)]

In [292]:
opt = optim.Adam(m.parameters(), 1e-2)
#pass in list of parameters (m.parameters())

In [293]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      2.085499   3.666888  



[array([3.66689])]

In [294]:
#set lr a bit lower and run fit again
set_lrs(opt, 0.001)

In [295]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.833571   3.150568  



[array([3.15057])]

### Test model

In [296]:
def get_next(inp):
    """Return the character that the current model `m` scores highest as the successor of `inp`.

    Also prints the winning vocabulary index.
    """
    # Map every input character to its vocabulary index and wrap the result as a tensor
    char_ids = np.array([char_indices[ch] for ch in inp])
    idxs = T(char_ids)
    # Feed one Variable per character to the model; the result holds one score per vocabulary item
    scores = m(*VV(idxs))
    # The position of the largest score is the index of the predicted character
    i = np.argmax(to_np(scores))
    print('i: {0}'.format(i))
    return chars[i]

In [297]:
get_next('y. ')

i: 44


'T'

In [298]:
get_next('ppl')

i: 62


'i'

In [299]:
get_next(' th')

i: 58


'e'

In [300]:
get_next('and')

i: 2


' '

## Our first RNN!

Video 1:38

### Create inputs

Video 1:41

This is the size of our unrolled RNN.

In [301]:
cs=8

For every position in the text, take the run of 8 consecutive characters starting there (so the windows overlap). These will be the 8 inputs to our model.

In [302]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [303]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [304]:
xs = np.stack(c_in_dat, axis=0)

In [305]:
xs.shape

(600893, 8)

In [306]:
y = np.stack(c_out_dat)

So each row below is one series of 8 characters from the text.

In [307]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [308]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

Video 1:43

In [309]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [310]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [311]:
class CharLoopModel(nn.Module):
    """Character RNN written as an explicit Python loop over the input characters.

    Uses the same layers as the three-character model, but accepts any number
    of input characters. The hidden width comes from the notebook-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the sequence `cs`."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_batch in cs:
            projected = F.relu(self.l_in(self.e(char_batch)))
            # The projected input and the running hidden state are simply added here;
            # they encode rather different things, so this may be suboptimal.
            hidden = F.tanh(self.l_hidden(hidden + projected))

        logits = self.l_out(hidden)
        return F.log_softmax(logits, dim=-1)

In [312]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [313]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      2.015942   1.990042  



[array([1.99004])]

In [314]:
set_lrs(opt, 0.001)

In [315]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.720497   1.718816  



[array([1.71882])]

In [316]:
class CharLoopConcatModel(nn.Module):
    """Loop RNN that concatenates the hidden state with the embedded input instead of adding them.

    Because of the concatenation, the input layer takes n_fac + n_hidden features
    and maps them back to n_hidden.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        # Wider input layer: it receives [hidden, embedding] side by side
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the sequence `cs`."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_batch in cs:
            # Join along axis 1 (the feature axis): state so far next to the new character
            joined = torch.cat((hidden, self.e(char_batch)), 1)
            # Bring the joined vector back down to n_hidden features
            squeezed = F.relu(self.l_in(joined))
            hidden = F.tanh(self.l_hidden(squeezed))

        logits = self.l_out(hidden)
        return F.log_softmax(logits, dim=-1)

In [317]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [318]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [319]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.805404   1.77913   



[array([1.77913])]

In [320]:
set_lrs(opt, 1e-4)

In [321]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.695405   1.692908  



[array([1.69291])]

### Test model

In [322]:
def get_next(inp):
    """Return the character the current model `m` scores highest as the successor of `inp`."""
    # Characters -> vocabulary indices -> tensor
    char_ids = np.array([char_indices[ch] for ch in inp])
    idxs = T(char_ids)
    # One Variable per character goes into the model
    scores = m(*VV(idxs))
    # Highest-scoring vocabulary position
    best = np.argmax(to_np(scores))
    return chars[best]

In [323]:
get_next('for thos')

'e'

In [324]:
get_next('part of ')

't'

In [325]:
get_next('queens a')

'n'

## Reimplement nn.RNN 

This isn't working, not clear on concatenation, outputs

In [326]:
class BasicRNN(nn.Module):
    """Minimal single-layer tanh RNN with the same calling convention as nn.RNN.

    At every timestep the new hidden state is
        h_t = tanh(l_in(x_t) + l_hidden(h_{t-1}))
    which is the nn.RNN update with l_in holding the input-to-hidden weights and
    bias, and l_hidden the hidden-to-hidden weights and bias.

    Args:
        n_fac: number of features of each input vector.
        n_hidden: size of the hidden state.
    """
    def __init__(self, n_fac, n_hidden):
        super(BasicRNN, self).__init__()
        self.n_fac = n_fac
        self.n_hidden = n_hidden
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

    def forward(self, inp, h):
        """Run the RNN over a whole sequence.

        Args:
            inp: inputs of shape (seq_len, bs, n_fac).
            h: initial hidden state of shape (1, bs, n_hidden).

        Returns:
            (outp, h): outp stacks the hidden state of every timestep,
            shape (seq_len, bs, n_hidden); h is the final hidden state,
            shape (1, bs, n_hidden).
        """
        # Drop the leading layer axis while stepping through time
        h_t = h[0]
        # Collected per call (not on self), so outputs never leak between batches
        outp = []
        for x_t in inp:
            h_t = torch.tanh(self.l_in(x_t) + self.l_hidden(h_t))
            outp.append(h_t)
        # Stack along a new time axis; restore the layer axis on the final state
        return torch.stack(outp), h_t.unsqueeze(0)

## RNN with pytorch

In [327]:
class CharRnn(nn.Module):
    """Character model that delegates the recurrent loop to PyTorch's nn.RNN.

    Only the output of the last timestep is used, so the model predicts a single
    next character per input sequence.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the sequence `cs`."""
        batch_size = cs[0].size(0)
        # Initial hidden state. The leading unit axis is 1 because the RNN is
        # neither multi-layer nor bidirectional.
        h0 = V(torch.zeros(1, batch_size, n_hidden))
        embedded = self.e(torch.stack(cs))
        # nn.RNN runs the per-timestep loop internally, starting from h0
        outputs, _ = self.rnn(embedded, h0)
        # Keep only the last timestep and project it onto the vocabulary
        last_step = outputs[-1]
        return F.log_softmax(self.l_out(last_step), dim=-1)

In [328]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [329]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [330]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [331]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [332]:
t = m(*V(xs)); t.size()

torch.Size([512, 86])

In [333]:
fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.860195   1.841723  
    1      1.680927   1.670507  
    2      1.592979   1.591485  
    3      1.532764   1.54404   



[array([1.54404])]

In [334]:
set_lrs(opt, 1e-4)

In [335]:
fit(m, md, 2, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.473685   1.507939  
    1      1.470911   1.502394  



[array([1.50239])]

### Test model

In [336]:
def get_next(inp):
    """Return the single most likely next character for `inp` according to the current model `m`."""
    # Translate the characters to vocabulary indices and wrap them as a tensor
    encoded = T(np.array([char_indices[ch] for ch in inp]))
    # The model receives one Variable per character
    model_out = m(*VV(encoded))
    # Pick the vocabulary position with the highest score
    top = np.argmax(to_np(model_out))
    return chars[top]

In [337]:
get_next('for thos')

'e'

In [338]:
def get_next_n(inp, n):
    """Extend `inp` by `n` characters, predicting one character at a time with get_next.

    After every prediction the window slides forward by one character, so the
    model always sees a context of the original length.
    """
    window = inp
    generated = []
    for _ in range(n):
        nxt = get_next(window)
        generated.append(nxt)
        # Drop the oldest character, append the newly predicted one
        window = window[1:] + nxt
    return inp + ''.join(generated)

In [339]:
get_next_n('for thos', 40)

'for those of the same the same the same the same'

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [340]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [341]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [342]:
xs = np.stack(c_in_dat)
xs.shape

(75112, 8)

In [343]:
ys = np.stack(c_out_dat)
ys.shape

(75112, 8)

In [344]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [345]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

### Create and train model

In [346]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [347]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [348]:
class CharSeqRnn(nn.Module):
    """Multi-output character RNN: emits a prediction after every input character.

    Identical to the single-output nn.RNN model except that the output layer is
    applied to all timesteps rather than only the last one.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities over the vocabulary for every timestep of `cs`."""
        batch_size = cs[0].size(0)
        h0 = V(torch.zeros(1, batch_size, n_hidden))
        embedded = self.e(torch.stack(cs))
        outputs, _ = self.rnn(embedded, h0)
        # Project every timestep, not just the final one
        logits = self.l_out(outputs)
        return F.log_softmax(logits, dim=-1)

In [349]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [350]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [351]:
# Custom loss for the multi-output model: one prediction per timestep
def nll_loss_seq(inp, targ):
    """Negative log-likelihood over every timestep of a sequence batch.

    Args:
        inp: rank-3 predictions; the first two axes are flattened together and
            the last axis is handed to F.nll_loss as the class axis.
        targ: targets whose first two axes are in the opposite order to inp's.

    Returns:
        The F.nll_loss of the flattened predictions against the flattened targets.
    """
    # Unpacking three sizes also enforces that inp is rank 3
    seq_len, batch_size, n_classes = inp.size()
    # Swap the first two target axes so they flatten in the same order as inp
    flat_targ = targ.transpose(0,1).contiguous().view(-1)
    # F.nll_loss wants one row of class scores per target
    flat_inp = inp.view(-1, n_classes)
    return F.nll_loss(flat_inp, flat_targ)

In [352]:
fit(m, md, 4, opt, nll_loss_seq)

epoch      trn_loss   val_loss   
    0      2.59743    2.410831  
    1      2.284504   2.195268  
    2      2.132288   2.078735  
    3      2.038942   2.006242  



[array([2.00624])]

In [353]:
set_lrs(opt, 1e-4)

In [354]:
fit(m, md, 1, opt, nll_loss_seq)

epoch      trn_loss   val_loss   
    0      1.990079   1.991542  



[array([1.99154])]

### Identity init!

In [355]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [356]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [357]:
fit(m, md, 4, opt, nll_loss_seq)

epoch      trn_loss   val_loss   
    0      2.423748   2.253473  
    1      2.149514   2.093322  
    2      2.039172   2.003588  
    3      1.972531   1.966282  



[array([1.96628])]

In [358]:
set_lrs(opt, 1e-3)

In [359]:
fit(m, md, 4, opt, nll_loss_seq)

epoch      trn_loss   val_loss   
    0      1.889311   1.899213  
    1      1.878262   1.892775  
    2      1.872006   1.885307  
    3      1.861091   1.879393  



[array([1.87939])]

## Stateful model

### Lesson 7

Video 3:00

### Setup

In [7]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH=Path('..')/'data/nietzsche/data/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}/{TRN_PATH}'
VAL = f'{PATH}/{VAL_PATH}'

%ls {PATH}

[0m[01;34mmodels[0m/  nietzsche.txt  [01;34mtrn[0m/  [01;34mval[0m/


In [9]:
%ls {PATH}/trn

training.txt


In [13]:
#field is descr of how process text, want char model - each char in sep token (use list)
TEXT = data.Field(lower=True, tokenize=list)
#n_fac is size of embedding
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)
#len(dataloader) = num minibatches = ntokens/bs/bptt (but bptt not exactly 8 as length randomized), 
#nt = num tokens (unique)

(942, 56, 1, 482979)

### RNN

In [15]:
??repackage_var

[0;31mSignature:[0m [0mrepackage_var[0m[0;34m([0m[0mh[0m[0;34m)[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mrepackage_var[0m[0;34m([0m[0mh[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Wraps h in new Variables, to detach them from their history."""[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0mVariable[0m[0;34m([0m[0mh[0m[0;34m.[0m[0mdata[0m[0;34m)[0m [0;32mif[0m [0mtype[0m[0;34m([0m[0mh[0m[0;34m)[0m [0;34m==[0m [0mVariable[0m [0;32melse[0m [0mtuple[0m[0;34m([0m[0mrepackage_var[0m[0;34m([0m[0mv[0m[0;34m)[0m [0;32mfor[0m [0mv[0m [0;32min[0m [0mh[0m[0;34m)[0m[0;34m[0m[0m
[0;31mFile:[0m      /mnt/963GB/Data/Python/Courses/fastai/my_fastai/dl1/fastai/lm_rnn.py
[0;31mType:[0m      function


In [18]:
class CharSeqStatefulRnn(nn.Module):
    """Stateful character RNN: the hidden state is carried over from one batch to the next.

    Args:
        vocab_size: number of tokens (embedding rows and output width).
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.
    """
    def __init__(self, vocab_size, n_fac, bs):
        # Initialise nn.Module before setting any attribute, as the sibling models do
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        # The hidden state is created once here and then reused across batches
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to (-1, vocab_size) for the batch `cs`."""
        bs = cs[0].size(0)
        # The batch size can differ from the stored state's (e.g. a smaller last batch)
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        # Keep the hidden values but detach them from their history
        # (truncated backprop through time)
        self.h = repackage_var(h)
        # Softmax over the vocabulary axis, then flatten to one row per prediction
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros for a batch of size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [19]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [20]:
fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.886327   1.861101  
    1      1.702951   1.706196  
    2      1.622682   1.637929  
    3      1.56125    1.601405  



[array([1.6014])]

In [21]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.500752   1.558232  
    1      1.488323   1.551951  
    2      1.487393   1.547811  
    3      1.491995   1.544134  



[array([1.54413])]

### RNN loop

In [22]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One tanh RNN step: tanh of the summed input and hidden linear projections."""
    # F.linear performs the matrix multiply and adds the bias
    from_input = F.linear(input, w_ih, b_ih)
    from_hidden = F.linear(hidden, w_hh, b_hh)
    return F.tanh(from_input + from_hidden)

In [23]:
class CharSeqStatefulRnn2(nn.Module):
    """Stateful character RNN that steps an nn.RNNCell manually over the sequence.

    The hidden state is stored as (bs, n_hidden), the shape nn.RNNCell expects,
    and is carried over between batches.

    Args:
        vocab_size: number of tokens (embedding rows and output width).
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to (-1, vocab_size) for the batch `cs`."""
        bs = cs[0].size(0)
        # Axis 0 of the hidden state is the batch axis; rebuild the state if the batch size changed
        if self.h.size(0) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        # One cell step per timestep, keeping every intermediate hidden state
        for c in cs:
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        # Carry the final state into the next batch, detached from its history
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros of shape (bs, n_hidden)."""
        self.h = V(torch.zeros(bs, n_hidden))

In [24]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [25]:
fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.886524   1.849988  
    1      1.702388   1.710569  
    2      1.614976   1.635929  
    3      1.567613   1.595526  



[array([1.59553])]

### GRU

In [26]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful character model built on nn.GRU; the hidden state persists across batches."""
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to (-1, vocab_size) for the batch `cs`."""
        batch_size = cs[0].size(0)
        # Re-create the state whenever the incoming batch size differs from the stored one
        if self.h.size(1) != batch_size:
            self.init_hidden(batch_size)
        outputs, new_h = self.rnn(self.e(cs), self.h)
        # Keep the values, drop the autograd history
        self.h = repackage_var(new_h)
        log_probs = F.log_softmax(self.l_out(outputs), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros for a batch of size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [27]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One GRU step, written out with plain linear projections and gates.

    The input and hidden projections are each split into three equal chunks
    along axis 1: two sigmoid gates and one tanh candidate.
    """
    # Project once, then split each projection into its three gate pre-activations
    in_reset, in_blend, in_cand = F.linear(input, w_ih, b_ih).chunk(3, 1)
    hid_reset, hid_blend, hid_cand = F.linear(hidden, w_hh, b_hh).chunk(3, 1)

    reset = F.sigmoid(in_reset + hid_reset)
    blend = F.sigmoid(in_blend + hid_blend)
    # The reset gate scales how much of the hidden projection enters the candidate
    candidate = F.tanh(in_cand + reset * hid_cand)
    # blend = 0 gives the candidate, blend = 1 keeps the old hidden state
    return candidate + blend * (hidden - candidate)

In [28]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [29]:
fit(m, md, 6, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.761311   1.741075  
    1      1.58497    1.591423  
    2      1.501812   1.528557  
    3      1.450953   1.494429  
    4      1.407352   1.47168   
    5      1.375898   1.46344   



[array([1.46344])]

In [30]:
set_lrs(opt, 1e-4)

In [31]:
fit(m, md, 3, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.282883   1.429307  
    1      1.291534   1.424964  
    2      1.282364   1.422862  



[array([1.42286])]

### Putting it all together: LSTM

In [32]:
from fastai import sgdr

#doubled size of hidden layer as added .5 dropout
n_hidden=512

In [33]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful character model built on a stacked nn.LSTM with dropout.

    Args:
        vocab_size: number of tokens (embedding rows and output width).
        n_fac: embedding width.
        bs: batch size used to allocate the initial state.
        nl: number of LSTM layers.
    """
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.e = nn.Embedding(vocab_size, n_fac)
        # dropout=0.5 is handed straight to nn.LSTM
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to (-1, vocab_size) for the batch `cs`."""
        batch_size = cs[0].size(0)
        # self.h is a pair of tensors; compare against the first one's batch axis
        if self.h[0].size(1) != batch_size:
            self.init_hidden(batch_size)
        outputs, new_state = self.rnn(self.e(cs), self.h)
        # Keep the values, drop the autograd history
        self.h = repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(outputs), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the state to a pair of zero tensors (an LSTM also carries a cell state)."""
        self.h = tuple(V(torch.zeros(self.nl, bs, n_hidden)) for _ in range(2))

In [34]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
#LayerOptimizer(opt_fn, layer_groups, lrs, wds=None)
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [37]:
os.makedirs(f'{PATH}/models', exist_ok=True)

In [38]:
fit(m, md, 2, lo.opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.666216   1.607022  
    1      1.629502   1.576865  



[array([1.57687])]

In [41]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}/models/cyc_{cycle}')
#callback, will do cosine anealing by changing lr inside the object
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

epoch      trn_loss   val_loss   
    0      1.492659   1.453389  
    1      1.559975   1.496189  
    2      1.443095   1.4178    
    3      1.574661   1.513888  
    4      1.51066    1.467412  
    5      1.445606   1.418899  
    6      1.381208   1.383612  
    7      1.569964   1.523049  
    8      1.538134   1.500693  
    9      1.509982   1.478297  
    10     1.477628   1.450553  
    11     1.443845   1.421445  
    12     1.395372   1.396344  
    13     1.357949   1.370787  
    14     1.332304   1.358466  



[array([1.35847])]

In [42]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}/models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

epoch      trn_loss   val_loss   
    0      1.319185   1.356783  
    1      1.319551   1.356938  
    2      1.3086     1.354031  
    3      1.318066   1.35318   
    4      1.318408   1.350228  
    5      1.305328   1.347788  
    6      1.302752   1.347255  
    7      1.308368   1.348396  
    8      1.302775   1.345222  
    9      1.291163   1.342553  
    10     1.282529   1.341878  
    11     1.275409   1.33972   
    12     1.279167   1.33868   
    13     1.271164   1.338093  
    14     1.267691   1.338186  
    15     1.275967   1.340301  
    16     1.276588   1.339853  
    17     1.259084   1.339114  
    18     1.258865   1.338772  
    19     1.254816   1.338704  
    20     1.239552   1.33667   
    21     1.238737   1.337353  
    22     1.22576    1.336954  
    23     1.220856   1.337654  
    24     1.218692   1.337587  
    25     1.207073   1.33781   
    26     1.202937   1.337877  
    27     1.201622   1.338251  
    28     1.195254   1.338694  
    29   

[array([1.39102])]

### Test

In [43]:
def get_next(inp):
    """Sample the next character after `inp` from the current model `m`'s output distribution."""
    token_ids = TEXT.numericalize(inp)
    model_out = m(VV(token_ids.transpose(0,1)))
    # Take the last row of the output and exponentiate it to get sampling weights
    weights = model_out[-1].exp()
    # Draw one index at random in proportion to the weights (not the argmax)
    drawn = torch.multinomial(weights, 1)
    return TEXT.vocab.itos[to_np(drawn)[0]]

In [44]:
get_next('for thos')

'e'

In [45]:
def get_next_n(inp, n):
    """Generate `n` further characters after `inp`, one get_next call at a time.

    The context passed to get_next keeps the length of `inp`: each new character
    is appended and the oldest one is dropped.
    """
    context = inp
    pieces = [inp]
    for _ in range(n):
        ch = get_next(context)
        pieces.append(ch)
        # Slide the context window forward by one character
        context = context[1:] + ch
    return ''.join(pieces)

In [46]:
print(get_next_n('for thos', 400))

for those impersistic process of such monsters are to such con7stent, now it something. ye may say to them life is through an 'esubperaoven them come, to have noble crofes than man, and in regard and solf-mens, toaggerman's evidence andwill--is means of mode.68. the same." and thereof a little worthy, indeed, and will--o) the reserved to "bad wofle," stupid ot.[21. le hunyer as perhaps also tto positation


Video 1:02 End RNN section