In [1]:
from fastai.text import *

In [2]:
bs = 64

In [3]:
path = Path('/storage/human_numbers')

# Human Numbers

## Data

### Load txt files into strings

In [4]:
#path = untar_data(URLs.HUMAN_NUMBERS)

In [5]:
path.ls()

[PosixPath('/storage/human_numbers/models'),
 PosixPath('/storage/human_numbers/train.txt'),
 PosixPath('/storage/human_numbers/valid.txt')]

In [6]:
def readnums(d): return [', '.join(o.strip() 
                         for o in open(path/d).readlines())]

In [7]:
train_txt = readnums('train.txt')

In [8]:
train_txt[0][:80]

'one, two, three, four, five, six, seven, eight, nine, ten, eleven, twelve, thirt'

In [9]:
valid_txt = readnums('valid.txt')

In [10]:
valid_txt[0][-80:]

' nine thousand nine hundred ninety eight, nine thousand nine hundred ninety nine'

### Load strings into Databunch

In [11]:
train = TextList(train_txt, path=path)
valid = TextList(valid_txt, path=path)

In [12]:
src = ItemLists(path=path, train=train, valid=valid).label_for_lm()
data = src.databunch(bs=bs)

In [13]:
train[0].text[:80]

'xxbos one , two , three , four , five , six , seven , eight , nine , ten , eleve'

### Visualize tokenized batches and vocab

In [14]:
len(data.valid_ds[0][0].data)

13017

In [15]:
data.bptt, len(data.valid_dl)

(70, 3)

In [16]:
13017/70/bs

2.905580357142857

In [17]:
it = iter(data.valid_dl)
x1, y1 = next(it)
x2, y2 = next(it)
x3, y3 = next(it)
it.close()

In [18]:
x1.numel()+x2.numel()+x3.numel()

13440

In [19]:
x1.shape, y1.shape

(torch.Size([64, 70]), torch.Size([64, 70]))

In [20]:
x2.shape, y2.shape

(torch.Size([64, 70]), torch.Size([64, 70]))

In [21]:
x1[:,0]

tensor([ 2,  9, 11, 12, 13, 11, 10,  9, 10, 14, 19, 25, 19, 15, 16, 11, 19,  9,
        10,  9, 19, 25, 19, 11, 19, 11, 10,  9, 19, 20, 11, 26, 20, 23, 20, 20,
        24, 20, 11, 14, 11, 11,  9, 14,  9, 20, 10, 20, 35, 17, 11, 10,  9, 17,
         9, 20, 10, 20, 11, 20, 11, 20, 20, 20], device='cuda:0')

In [22]:
y1[:,0]

tensor([19, 19, 27, 10,  9, 12, 32, 19, 26, 10, 11, 15, 11, 10,  9, 15, 11, 19,
        26, 19, 11, 18, 11, 18,  9, 18, 21, 19, 10, 10, 20,  9, 11, 16, 11, 11,
        13, 11, 13,  9, 13, 14, 20, 10, 20, 11, 24, 11,  9,  9, 16, 17, 20, 10,
        20, 11, 24, 11, 19,  9, 19, 11, 11, 10], device='cuda:0')

In [23]:
# Get vocab
v = data.valid_ds.vocab

In [24]:
v.textify(x1[0])

'xxbos eight thousand one , eight thousand two , eight thousand three , eight thousand four , eight thousand five , eight thousand six , eight thousand seven , eight thousand eight , eight thousand nine , eight thousand ten , eight thousand eleven , eight thousand twelve , eight thousand thirteen , eight thousand fourteen , eight thousand fifteen , eight thousand sixteen , eight thousand seventeen , eight'

In [25]:
v.textify(y1[0])

'eight thousand one , eight thousand two , eight thousand three , eight thousand four , eight thousand five , eight thousand six , eight thousand seven , eight thousand eight , eight thousand nine , eight thousand ten , eight thousand eleven , eight thousand twelve , eight thousand thirteen , eight thousand fourteen , eight thousand fifteen , eight thousand sixteen , eight thousand seventeen , eight thousand'

In [26]:
data.show_batch(ds_type=DatasetType.Valid)

idx,text
0,"thousand forty seven , eight thousand forty eight , eight thousand forty nine , eight thousand fifty , eight thousand fifty one , eight thousand fifty two , eight thousand fifty three , eight thousand fifty four , eight thousand fifty five , eight thousand fifty six , eight thousand fifty seven , eight thousand fifty eight , eight thousand fifty nine , eight thousand sixty , eight thousand sixty"
1,"eight , eight thousand eighty nine , eight thousand ninety , eight thousand ninety one , eight thousand ninety two , eight thousand ninety three , eight thousand ninety four , eight thousand ninety five , eight thousand ninety six , eight thousand ninety seven , eight thousand ninety eight , eight thousand ninety nine , eight thousand one hundred , eight thousand one hundred one , eight thousand one"
2,"thousand one hundred twenty four , eight thousand one hundred twenty five , eight thousand one hundred twenty six , eight thousand one hundred twenty seven , eight thousand one hundred twenty eight , eight thousand one hundred twenty nine , eight thousand one hundred thirty , eight thousand one hundred thirty one , eight thousand one hundred thirty two , eight thousand one hundred thirty three , eight thousand"
3,"three , eight thousand one hundred fifty four , eight thousand one hundred fifty five , eight thousand one hundred fifty six , eight thousand one hundred fifty seven , eight thousand one hundred fifty eight , eight thousand one hundred fifty nine , eight thousand one hundred sixty , eight thousand one hundred sixty one , eight thousand one hundred sixty two , eight thousand one hundred sixty three"
4,"thousand one hundred eighty three , eight thousand one hundred eighty four , eight thousand one hundred eighty five , eight thousand one hundred eighty six , eight thousand one hundred eighty seven , eight thousand one hundred eighty eight , eight thousand one hundred eighty nine , eight thousand one hundred ninety , eight thousand one hundred ninety one , eight thousand one hundred ninety two , eight thousand"


## Single fully connected model

In [27]:
data = src.databunch(bs=bs, bptt=3)

In [28]:
# Get one batch from data bunch
x, y = data.one_batch()
x.shape, y.shape

(torch.Size([64, 3]), torch.Size([64, 3]))

In [29]:
# nv : Vocab size
nv = len(v.itos); nv

40

In [30]:
nh=64

### Define loss func and architecture

In [31]:
# Set loss func with target being last word of sentence
def loss4(input, target): return F.cross_entropy(input, target[:,-1])
def acc4(input, target): return accuracy(input, target[:,-1])

In [32]:
class Model0(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv, nh) # input to hidden layer
        self.h_h = nn.Linear(nh, nh)    # hidden to hidden
        self.h_o = nn.Linear(nh, nv)    # hidden to output
        self.bn = nn.BatchNorm1d(nh)
        
    def forward(self, x):
        h = self.bn( F.relu( self.i_h( x[:,0] )))
        if x.shape[1]>1:
            h = h + self.i_h( x[:,1] )
            h = self.bn( F.relu( self.h_h( h )))
        if x.shape[1] > 2:
            h = h + self.i_h( x[:,2])
            h = self.bn( F.relu( self.h_h( h )))
        return self.h_o( h )

### Train

In [33]:
learn = Learner(data, Model0(), loss_func=loss4, metrics=acc4)

In [34]:
learn.fit_one_cycle(6, 1e-4)

epoch,train_loss,valid_loss,acc4,time
0,3.63565,3.628775,0.045267,00:01
1,3.062792,3.217659,0.300551,00:01
2,2.46874,2.737075,0.426241,00:01
3,2.142269,2.457103,0.449678,00:01
4,2.017303,2.351988,0.452665,00:01
5,1.991442,2.336699,0.445083,00:01


## Same logic with a loop

In [35]:
class Model1(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv, nh) # Input to hidden layer
        self.h_h = nn.Linear(nh, nh)    # Hidden to Hidden
        self.h_o = nn.Linear(nh, nv)    # Hidden to output
        self.bn  = nn.BatchNorm1d(nh)
        
    def forward(self, x):
        h = torch.zeros(x.shape[0], nh).to(device=x.device)
        
        for i in range(x.shape[1]):
            h += self.i_h( x[:,i] )
            h = self.bn( F.relu( self.h_h( h )))
        return self.h_o( h )

In [36]:
learn = None

In [37]:
learn = Learner(data, Model1(), loss_func=loss4, metrics=acc4)

In [38]:
learn.fit_one_cycle(6, 1e-4)

epoch,train_loss,valid_loss,acc4,time
0,3.646441,3.660183,0.036075,00:01
1,3.083254,3.086922,0.424632,00:01
2,2.493611,2.558319,0.464154,00:01
3,2.167393,2.311152,0.467142,00:01
4,2.040799,2.225988,0.46852,00:01
5,2.014502,2.213849,0.46875,00:01


## Multi fully connected model

In [107]:
data = src.databunch(bs=bs, bptt=20)

In [47]:
x, y = data.one_batch()
x.shape, y.shape

(torch.Size([64, 20]), torch.Size([64, 20]))

In [54]:
v.textify( x[0] )

', two hundred fifteen , two hundred sixteen , two hundred seventeen , two hundred eighteen , two hundred nineteen'

In [41]:
class Model2(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.h_h = nn.Linear(nh,nh)
        self.h_o = nn.Linear(nh,nv)
        self.bn = nn.BatchNorm1d(nh)
        
    def forward(self, x):
        h = torch.zeros(x.shape[0], nh).to(device=x.device)
        res = []
        for i in range(x.shape[1]):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
            res.append(self.h_o(self.bn(h)))
        return torch.stack(res, dim=1)

### Train

In [45]:
learn = Learner( data, Model2(),  metrics=accuracy)

In [46]:
learn.fit_one_cycle(10, 1e-4, pct_start=.1)

epoch,train_loss,valid_loss,accuracy,time
0,3.696213,3.628641,0.104616,00:00
1,3.601722,3.52322,0.128764,00:00
2,3.486953,3.421871,0.180966,00:00
3,3.369045,3.329303,0.199858,00:00
4,3.257988,3.249365,0.248935,00:00
5,3.162196,3.18859,0.270526,00:00
6,3.087529,3.149022,0.276847,00:00
7,3.035241,3.127502,0.280398,00:00
8,3.003057,3.119274,0.281534,00:00
9,2.986256,3.118058,0.281747,00:00


## Maintain state instead of intializing to 0 each time

In [78]:
class Model3(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.h_h = nn.Linear(nh,nh)
        self.h_o = nn.Linear(nh,nv)
        self.bn = nn.BatchNorm1d(nh)
        self.h = torch.zeros(bs, nh).cuda()
        
    def forward(self, x):
        h = self.h
        res = []
        
        for i in range(x.shape[1]):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
            res.append(self.bn(h))
        

        self.h = h.detach()
        res = torch.stack(res, dim=1)
        res = self.h_o(res)
        return res

### Train

In [76]:
learn = Learner( data, Model3(), metrics=accuracy)

In [77]:
learn.fit(15, 1e-4)

epoch,train_loss,valid_loss,accuracy,time
0,3.661883,3.577091,0.096662,00:00
1,3.556543,3.459776,0.160511,00:00
2,3.43854,3.339997,0.223793,00:00
3,3.312831,3.215936,0.343537,00:00
4,3.1839,3.089219,0.404403,00:00
5,3.054075,2.968214,0.430398,00:00
6,2.926588,2.854238,0.443537,00:00
7,2.802353,2.743898,0.44929,00:00
8,2.683151,2.632955,0.453338,00:00
9,2.570487,2.540583,0.460724,00:00


## Stacked RNNs

Use nn.RNN to take care of the loop

In [118]:
class Model4(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.rnn = nn.RNN(nh,nh, batch_first=True)
        self.h_o = nn.Linear(nh,nv)
        self.bn = BatchNorm1dFlat(nh)
        self.h = torch.zeros(1, bs, nh).cuda()
        
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(self.bn(res))

In [119]:
learn = Learner( data, Model4(), metrics=accuracy)

In [120]:
learn.fit_one_cycle(20, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.739277,3.634076,0.075426,00:00
1,3.551204,3.286042,0.209872,00:00
2,3.132591,2.651541,0.36669,00:00
3,2.560137,2.124702,0.42919,00:00
4,2.083465,1.98283,0.327273,00:00
5,1.788196,1.996942,0.317045,00:00
6,1.622772,1.980888,0.322088,00:00
7,1.502787,1.738041,0.46108,00:00
8,1.382799,1.558231,0.487642,00:00
9,1.271969,1.485714,0.488849,00:00


## 2-layer GRU

Use a small 2-layers NN inside each Hidden layer to filter what data to keep from embedding and previous hidden layer.

In [121]:
class Model5(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.rnn = nn.GRU(nh, nh, 2, batch_first=True)
        self.h_o = nn.Linear(nh,nv)
        self.bn = BatchNorm1dFlat(nh)
        self.h = torch.zeros(2, bs, nh).cuda()
        
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(self.bn(res))

In [122]:
learn = Learner(data, Model5(), metrics=accuracy)

In [123]:
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.999274,2.372445,0.458239,00:00
1,1.877551,1.580082,0.573153,00:00
2,0.969807,0.935314,0.786861,00:00
3,0.4748,0.837898,0.829261,00:00
4,0.235704,0.930713,0.833026,00:00
5,0.123071,0.937063,0.831747,00:00
6,0.068054,0.85915,0.839489,00:00
7,0.040318,0.933715,0.836435,00:00
8,0.025901,0.984019,0.832457,00:00
9,0.018475,0.980621,0.831889,00:00
