In [2]:
! pip install fastai==0.7.0
! pip install torchtext==0.2.3

Collecting fastai==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/6d/9d0d6e17a78b0598d5e8c49a0d03ffc7ff265ae62eca3e2345fab14edb9b/fastai-0.7.0-py3-none-any.whl (112kB)
[K    100% |████████████████████████████████| 122kB 4.4MB/s 
Collecting pandas-summary (from fastai==0.7.0)
  Downloading https://files.pythonhosted.org/packages/97/55/ea54109a4e7a8e7342bdf23e9382c858224263d984b0d95610568e564f59/pandas_summary-0.0.5-py2.py3-none-any.whl
Collecting graphviz (from fastai==0.7.0)
  Downloading https://files.pythonhosted.org/packages/1f/e2/ef2581b5b86625657afd32030f90cf2717456c1d2b711ba074bf007c0f1a/graphviz-0.10.1-py2.py3-none-any.whl
Collecting ipywidgets (from fastai==0.7.0)
[?25l  Downloading https://files.pythonhosted.org/packages/30/9a/a008c7b1183fac9e52066d80a379b3c64eab535bd9d86cdc29a0b766fd82/ipywidgets-7.4.2-py2.py3-none-any.whl (111kB)
[K    100% |████████████████████████████████| 112kB 4.7MB/s 
[?25hCollecting widgetsnbextension (from fastai==0.7.0)
[?2

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [0]:
PATH='data/nietzsche/'

In [5]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))

nietzsche.txt: 606kB [00:00, 1.73MB/s]                           

corpus length: 600893





In [6]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [7]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [8]:
chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [0]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [10]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [11]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters

In [0]:
cs=3
c1_dat = [idx[i]   for i in range(0, len(idx)-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)]

Our inputs

In [0]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

Our output

In [0]:
y = np.stack(c4_dat)

The first 4 inputs and outputs

In [0]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [0]:
y[:4]

array([30, 29,  1, 40])

In [0]:
x1.shape, y.shape

((200297,), (200297,))

### Create and train model

Pick a size for our hidden state

In [0]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [0]:
n_fac = 42

In [0]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        
        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

In [0]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [0]:
m = Char3Model(vocab_size, n_fac).cuda()

In [0]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [0]:
opt = optim.Adam(m.parameters(), 1e-2)

In [0]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       2.09627  6.52849]                                 



In [0]:
set_lrs(opt, 0.001)

In [0]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.84525  6.52312]                                 



### Test model

In [0]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [0]:
get_next('y. ')

'T'

In [0]:
get_next('ppl')

'e'

In [0]:
get_next(' th')

'e'

In [0]:
get_next('and')

' '

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [0]:

cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.

In [0]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [0]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [0]:
xs = np.stack(c_in_dat, axis=0)

In [0]:
xs.shape

(600884, 8)

In [0]:
y = np.stack(c_out_dat)

So each column below is one series of 8 characters from the text.

In [0]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [0]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [0]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [0]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [0]:
class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [0]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [0]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       2.02986  1.99268]                                



In [0]:
set_lrs(opt, 0.001)

In [0]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.73588  1.75103]                                 



In [0]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [0]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [0]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [0]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.81654  1.78501]                                



In [0]:
set_lrs(opt, 1e-4)

In [0]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.69008  1.69936]                                 



### Test model

In [0]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [0]:
get_next('for thos')

'e'

In [0]:
get_next('part of ')

't'

In [0]:
get_next('queens a')

'n'

## RNN with pytorch

In [0]:
class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [0]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [0]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [0]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [0]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [0]:
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [0]:
fit(m, md, 4, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.86065  1.84255]                                 
[ 1.       1.68014  1.67387]                                 
[ 2.       1.58828  1.59169]                                 
[ 3.       1.52989  1.54942]                                 



In [0]:
set_lrs(opt, 1e-4)

In [0]:
fit(m, md, 2, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.46841  1.50966]                                 
[ 1.       1.46482  1.5039 ]                                 



### Test model

In [0]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [0]:
get_next('for thos')

'e'

In [0]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [0]:
get_next_n('for thos', 40)

'for those the same the same the same the same th'

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [0]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [0]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [0]:
xs = np.stack(c_in_dat)
xs.shape

(200297, 3)

In [0]:
ys = np.stack(c_out_dat)
ys.shape

(200297, 3)

In [0]:
xs[:cs,:cs]

array([[40, 42, 29],
       [30, 25, 27],
       [29,  1,  1]])

In [0]:
ys[:cs,:cs]

array([[42, 29, 30],
       [25, 27, 29],
       [ 1,  1,  1]])

### Create and train model

In [0]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [0]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [0]:
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        return F.log_softmax(self.l_out(outp), dim=-1)

In [0]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [0]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [0]:
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [0]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      2.355994   2.326572  
    1      2.238234   2.226911  
    2      2.180176   2.181392  
    3      2.150363   2.158499  



[array([2.1585])]

In [0]:
set_lrs(opt, 1e-4)

In [0]:
fit(m, md, 1, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      2.120463   2.138606  



[array([2.13861])]

### Identity init!

In [0]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [0]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [0]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      2.270453   2.258912  
    1      2.218625   2.224396  
    2      2.203576   2.202711  
    3      2.188805   2.204043  



[array([2.20404])]

In [0]:
set_lrs(opt, 1e-3)

In [0]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      2.074014   2.102601  
    1      2.062794   2.094643  
    2      2.053455   2.09168   
    3      2.060589   2.088715  



[array([2.08871])]

## Stateful model

### Setup

In [0]:
ls TRN/

trn.txt


In [13]:
%cd  data/nietzsche/

/content/data/nietzsche


In [12]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='/content/data/nietzsche/'

TRN_PATH = 'TRN/'
VAL_PATH = 'VAL/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

# Note: The student needs to practice her shell skills and prepare her own dataset before proceeding:
# - trn/trn.txt (first 80% of nietzsche.txt)
# - val/val.txt (last 20% of nietzsche.txt)

%ls {PATH}

nietzsche.txt


In [14]:
# TRN
!ls 

nietzsche.txt


In [0]:
! mkdir VAL
! mkdir TRN

In [0]:
print(len(text))
print(.8* len(text))
print()
600893  *.8


600893
480714.4



480714.4

In [0]:
# !split -l 7948 --additional-suffix=.txt nietzsche.txt trn

In [0]:
# ! ls

nietzsche.txt  TRN  trnaa.txt  trnab.txt  VAL


In [0]:
# ! mv -v  trnab.txt val.txt

renamed 'trnab.txt' -> 'val.txt'


In [0]:
pwd

'/content/data/nietzsche/TRN'

In [0]:
%cd /content/data/nietzsche

/content/data/nietzsche


renamed '/content/data/nietzsche/val.txt' -> '/content/data/nietzsche/VAL/val.txt'


In [0]:
ls TRN/

trn.txt


In [0]:
len(text[480714:])

120179

In [0]:
ls

trn.txt


In [0]:
%rm /content/data/nietzsche/TRN/trn.txt
%rm /content/data/nietzsche/VAL/val.txt

In [0]:
f= open("trn.txt","w+")
for i in range(len(text[:530000])):
     f.write(text[i])
f.close() 

f= open("val.txt","w+")
for i in range(len(text[530000:])):
     f.write(text[i])
f.close() 

In [17]:
# ! wc -m  val.txt
! mv -v /content/data/nietzsche/val.txt /content/data/nietzsche/VAL/
! mv -v /content/data/nietzsche/trn.txt /content/data/nietzsche/TRN/

renamed '/content/data/nietzsche/val.txt' -> '/content/data/nietzsche/VAL/val.txt'
renamed '/content/data/nietzsche/trn.txt' -> '/content/data/nietzsche/TRN/trn.txt'


In [18]:
ls TRN/  

trn.txt


In [19]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(1017, 55, 1, 521287)

### RNN

In [0]:
class CharSeqStatefulRnn(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [0]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [22]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      1.877696   1.829953  
    1      1.696754   1.666223  
    2      1.618095   1.589783  
    3      1.564217   1.534397  



[array([1.5344])]

In [23]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      1.484162   1.486433  
    1      1.486898   1.478987  
    2      1.482042   1.472488  
    3      1.481383   1.469501  



[array([1.4695])]

### RNN loop

In [0]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [0]:
class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs: 
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [0]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [0]:
fit(m, md, 4, opt, F.nll_loss)

### GRU

In [0]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [0]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chunk(3, 1)
    h_r, h_i, h_n = gh.chunk(3, 1)

    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    return newgate + inputgate * (hidden - newgate)

In [0]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [0]:
fit(m, md, 6, opt, F.nll_loss)

In [0]:
set_lrs(opt, 1e-4)

In [0]:
fit(m, md, 3, opt, F.nll_loss)

### Putting it all together: LSTM

In [0]:
from fastai import sgdr

n_hidden=512

In [0]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden)))

In [0]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [0]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [28]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      1.806121   1.711537  
    1      1.715098   1.611963  



[array([1.61196])]

In [29]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15, style=ProgressStyle(description_width='initia…

epoch      trn_loss   val_loss   
    0      1.525274   1.434996  
    1      1.576462   1.478579  
    2      1.454153   1.370709  
    3      1.600508   1.498637  
    4      1.511944   1.42372   
    5      1.441898   1.358021  
    6      1.381154   1.30435   
    7      1.595178   1.491192  
    8      1.544926   1.45098   
    9      1.514823   1.424978  
    10     1.468223   1.38548   
    11     1.433429   1.344455  
    12     1.379225   1.302261  
    13     1.333929   1.259723  
    14     1.300044   1.235799  



[array([1.2358])]

In [30]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63, style=ProgressStyle(description_width='initia…

epoch      trn_loss   val_loss   
    0      1.291347   1.231377  
    1      1.29596    1.226804  
    2      1.287933   1.224325  
    3      1.295655   1.221025  
    4      1.287611   1.214985  
    5      1.276204   1.210892  
    6      1.277007   1.209524  
    7      1.282942   1.207734  
    8      1.269476   1.202194  
    9      1.268469   1.194825  
    10     1.255592   1.187848  
    11     1.251171   1.182679  
    12     1.246463   1.178725  
    13     1.243161   1.176739  
    14     1.243858   1.175755  
    15     1.248794   1.177576  
    16     1.249167   1.171286  
    17     1.238075   1.164544  
    18     1.236175   1.156244  
    19     1.226105   1.14902   
    20     1.215867   1.141634  
    21     1.212123   1.134026  
    22     1.205074   1.12655   
    23     1.195847   1.120786  
    24     1.187793   1.115147  
    25     1.182869   1.10986   
    26     1.17906    1.10589   
    27     1.180506   1.102729  
    28     1.179099   1.100282  
    29   

[array([0.93639])]

In [0]:

# !ls models	

cyc_0  cyc_1  cyc_2  cyc_3  cyc_4


In [0]:
# !zip -r /content/data/nietzsche/models.zip /content/data/nietzsche/models

  adding: content/data/nietzsche/models/ (stored 0%)
  adding: content/data/nietzsche/models/cyc_2 (deflated 6%)
  adding: content/data/nietzsche/models/cyc_4 (deflated 7%)
  adding: content/data/nietzsche/models/cyc_0 (deflated 6%)
  adding: content/data/nietzsche/models/cyc_1 (deflated 6%)
  adding: content/data/nietzsche/models/cyc_3 (deflated 6%)


In [0]:
# from google.colab import files
# 

In [0]:
# files.download("/content/data/nietzsche/models.zip")

----------------------------------------
Exception happened during processing of request from ('::ffff:127.0.0.1', 43432, 0, 0)
Traceback (most recent call last):
  File "/usr/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.6/socketserver.py", line 721, in __init__
    self.handle()
  File "/usr/lib/python3.6/http/server.py", line 418, in handle
    self.handle_one_request()
  File "/usr/lib/python3.6/http/server.py", line 406, in handle_one_request
    method()
  File "/usr/lib/python3.6/http/server.py", line 639, in do_GET
    self.copyfile(f, self.wfile)
  File "/usr/lib/python3.6/http/server.py", line 800, in copyfile
    shutil.copyfil

### Test

In [0]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [32]:
get_next('for thos')

'e'

In [0]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [34]:
print(get_next_n('for thos', 400))

for those wrong andforms of mankind himself.97=v'een corresponsible,more permitted with innature of society, inclination; now, disaxmuth it. i can not better and cratic(of self-halvenge-hjourism, were disguise, and great and that one praisecty and sexual courage, towardswith:it will notice out of this maic, thereby the closely difference (say to you? "hencarlarisement-greatnong men. that peresses, and tri
