In [69]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [70]:
from fastbook import *

In this notebooks, the nut and bolts of the natural language models are discussed- we know how to pre-train a language model, use the high-level and mid-level API for flexibity. Understanding what happens beneath the hood will help understand even better.

To try out the efficacy of a new algorithm or when we try to embark on a learning journey in ML, start with a simple dataset , easy to train and verify. Starting with complicated datasets is a hindrance as the complexity of the project increases manifold.

In [71]:
from fastai.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)

This is a simple dataset which has first 10000 numbers written in English. 

In [72]:
Path.BASE_PATH = path
path.ls()



(#2) [Path('valid.txt'),Path('train.txt')]

In [73]:
lines = L()
with open(path/'train.txt') as f: lines+=L(*f.readlines())
with open(path/'valid.txt') as f: lines+=L(*f.readlines())

lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

Step1 : concatenation of the numbers in the list to a single big string, we use ' . ' as a separator to move from one number to next

In [74]:
text = ' . '.join(l.strip() for l in lines)
len(text)

365478

we could use a tokenizer like spacy, but since this is a simple dataset of only numbers in word form, a simple space based tokenizer is enough

In [75]:
tokens = text.split(' ')

In [76]:
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [77]:
tokens[63093]

'ninety'

In [78]:
vocab = L(*tokens).unique()

In [79]:
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [80]:
print(vocab)

['one', '.', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred', 'thousand']


In [81]:
word2index = {w:i for i,w in enumerate(vocab)}
nums = L(word2index[i] for i in tokens)
len(nums)

63095

Language model from scratch - we now have the tokenized and numericalized text data

In [82]:
print(L((tokens[i:i+3],tokens[i+3]) for i in range(0,len(tokens)-4,3))[:100])
  

[(['one', '.', 'two'], '.'), (['.', 'three', '.'], 'four'), (['four', '.', 'five'], '.'), (['.', 'six', '.'], 'seven'), (['seven', '.', 'eight'], '.'), (['.', 'nine', '.'], 'ten'), (['ten', '.', 'eleven'], '.'), (['.', 'twelve', '.'], 'thirteen'), (['thirteen', '.', 'fourteen'], '.'), (['.', 'fifteen', '.'], 'sixteen'), (['sixteen', '.', 'seventeen'], '.'), (['.', 'eighteen', '.'], 'nineteen'), (['nineteen', '.', 'twenty'], '.'), (['.', 'twenty', 'one'], '.'), (['.', 'twenty', 'two'], '.'), (['.', 'twenty', 'three'], '.'), (['.', 'twenty', 'four'], '.'), (['.', 'twenty', 'five'], '.'), (['.', 'twenty', 'six'], '.'), (['.', 'twenty', 'seven'], '.'), (['.', 'twenty', 'eight'], '.'), (['.', 'twenty', 'nine'], '.'), (['.', 'thirty', '.'], 'thirty'), (['thirty', 'one', '.'], 'thirty'), (['thirty', 'two', '.'], 'thirty'), (['thirty', 'three', '.'], 'thirty'), (['thirty', 'four', '.'], 'thirty'), (['thirty', 'five', '.'], 'thirty'), (['thirty', 'six', '.'], 'thirty'), (['thirty', 'seven', '.'

Given three words in sequence, predict the next word in the sequence. The dataset is curated in a way, three words in sequence form the independent variable and fourth word is taken as the dependent variable. The same could be done in  numerical form as follows: 

In [83]:
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))


In [84]:
print(seqs[:200])

[(tensor([0, 1, 2]), 1), (tensor([1, 3, 1]), 4), (tensor([4, 1, 5]), 1), (tensor([1, 6, 1]), 7), (tensor([7, 1, 8]), 1), (tensor([1, 9, 1]), 10), (tensor([10,  1, 11]), 1), (tensor([ 1, 12,  1]), 13), (tensor([13,  1, 14]), 1), (tensor([ 1, 15,  1]), 16), (tensor([16,  1, 17]), 1), (tensor([ 1, 18,  1]), 19), (tensor([19,  1, 20]), 1), (tensor([ 1, 20,  0]), 1), (tensor([ 1, 20,  2]), 1), (tensor([ 1, 20,  3]), 1), (tensor([ 1, 20,  4]), 1), (tensor([ 1, 20,  5]), 1), (tensor([ 1, 20,  6]), 1), (tensor([ 1, 20,  7]), 1), (tensor([ 1, 20,  8]), 1), (tensor([ 1, 20,  9]), 1), (tensor([ 1, 21,  1]), 21), (tensor([21,  0,  1]), 21), (tensor([21,  2,  1]), 21), (tensor([21,  3,  1]), 21), (tensor([21,  4,  1]), 21), (tensor([21,  5,  1]), 21), (tensor([21,  6,  1]), 21), (tensor([21,  7,  1]), 21), (tensor([21,  8,  1]), 21), (tensor([21,  9,  1]), 22), (tensor([22,  1, 22]), 0), (tensor([ 0,  1, 22]), 2), (tensor([ 2,  1, 22]), 3), (tensor([ 3,  1, 22]), 4), (tensor([ 4,  1, 22]), 5), (ten

In [85]:
bs = 64
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=64, shuffle=False)

The neural network architure used is a simple three layer model which has layers consisting of embedding to pick the index according to the word's position in the vocab. The first layer will have just the embedding of the first word, second layer will have embedding of second word + activation of the first word , third layer has embedding of third word + activation of the first+second

Another tweak is that the weight matrix for each of the layers correspinding to the three words are the same. the weight matrix is independent of position -> it only encodes probability of next word given a word. This word-word encoding does not depend on position.

In [86]:
# model built on pytorch
class LMModel1(Module):
    def __init__(self,vocab_sz ,n_hidden):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.h_h = nn.Linear(n_hidden,n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        
    def forward(self,x):
        h = F.relu(self.h_h(self.i_h(x[:,0])))
        h = h + self.i_h(x[:,1])
        h = F.relu(self.h_h(h))
        h = h + self.i_h(x[:,2])
        h = F.relu(self.h_h(h))
        return self.h_o(h)
    


In [87]:
learn = Learner(dls,LMModel1(len(vocab), 64),loss_func=F.cross_entropy,metrics=accuracy )
learn.fit_one_cycle(4,1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.816107,1.899504,0.463275,00:02
1,1.413302,1.75092,0.468267,00:01
2,1.419951,1.615062,0.486095,00:02
3,1.384168,1.608317,0.495365,00:02


In [88]:
n,counts = 0,torch.zeros(len(vocab))
for x,y in dls.valid:
    n+=y.shape[0]
    for i in range_of(vocab) : counts[i] += (y == i).long().sum()
idx = torch.argmax(counts)
idx,vocab[idx.item()], counts[idx].item()

(tensor(29), 'thousand', 638.0)

first recurrent neural network


In [89]:
class LMModel2(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  
        self.h_h = nn.Linear(n_hidden, n_hidden)     
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        
    def forward(self, x):
        h = 0
        for i in range(3):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
        return self.h_o(h)
     

In [90]:
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, 
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.821032,1.857483,0.467554,00:02
1,1.406928,1.751597,0.468029,00:01
2,1.412491,1.67344,0.490373,00:02
3,1.368348,1.620943,0.493463,00:02


RNN is not a new architecture , it is this for loop based neural netwrk where activations of layers are placed inside for loop

Some problems with above implementation:
1. h = 0 for every input sequence - for the current example, the sequence is short (only three words),but in the original examples, we might have to parse long sequences. 
2. only fourth word is predicted given the first three words - why not predict second,third,fourth given first second third

each time we get a new word, h is set to 0, so we lose all the information from the previous words. Instead we could preserve the value of h. This introduces another subtle problem. if there are 10,000 words in the dataset, we will have 10,000 layer networks. so, when we calculate gradients, we have to back-propagate from 10000th layer to layer 0, with this network,, not even 1 mini batch can be processed. Instead, only last three layers are used for calculating the gradients.

In [91]:
class LMModel3(Module):
    def __init__(self,vocab_sz,n_hidden):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.h_h = nn.Linear(n_hidden,n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0
    
    def forward(self,x):
        for i in range(3):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
        out = self.h_o(self.h)
        self.h = self.h.detach()  # to prevent back prop to previous layers
        
        return out
    def reset(self) : self.h=0
            
            
        
        

The model will have same activations for different sequence lengths, the gradients will be calculated only for the last sequence and not the whole stream to avoid memory overhead and computational problems.
This approach is called Backpropagation through time (BPTT). 

for this model, the dataset is to be modeled in a certain way. the dataset is split into batches of size 64.

In [92]:
bs = 64
m = len(seqs) // bs
m,bs,len(seqs)

(328, 64, 21031)

In [93]:
def group_chunks(ds,bs):
    m = len(ds) // bs
    new_ds = L()
    for i in range(m):new_ds += L(ds[i+m*j] for j in range(bs))
    return new_ds

In [94]:
#Splitting into train and test
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut],bs),group_chunks(seqs[cut:],bs),bs=bs,drop_last=True,shuffle=False)


there was a method called Reset in class LMModel3 which set h=0 to start clean for the next epoch. This is called via callBack

In [95]:
learn = Learner(dls,LMModel3(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.681793,1.890971,0.40649,00:02
1,1.345999,1.785482,0.442788,00:02
2,1.112991,1.656702,0.471635,00:02
3,1.007225,1.471269,0.542548,00:02
4,0.971087,1.537394,0.563462,00:02
5,0.921116,1.647311,0.578606,00:02
6,0.875277,1.699624,0.545913,00:02
7,0.812338,1.521746,0.614423,00:02
8,0.767454,1.545469,0.605769,00:02
9,0.74674,1.545983,0.613462,00:02


In [96]:
sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [97]:
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [98]:
class LMModel4(Module):
    def __init__(self, vocab_sz,n_hidden):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.h_h = nn.Linear(n_hidden,n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
        
    def forward(self,x):
        outs = []
        for i in range(sl):
            self.h = self.h + self.i_h(x[:,0])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self): self.h = 0
     
        

output of size (bsXslXvocab_sz) but learner expects (bsXsl), hence some flattening is to be done)

In [99]:
def loss_func(inp,targ):
    return F.cross_entropy(inp.view(-1,len(vocab)),targ.view(-1))

In [100]:
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(20, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.409574,3.408678,0.025553,00:00
1,3.259314,3.093459,0.137777,00:00
2,2.965466,2.808803,0.162842,00:00
3,2.811515,2.776106,0.164225,00:00
4,2.742966,2.756315,0.167399,00:00
5,2.7065,2.737959,0.173828,00:00
6,2.674127,2.694182,0.225098,00:00
7,2.588782,2.590155,0.282878,00:00
8,2.526418,2.587842,0.273356,00:00
9,2.459126,2.579491,0.282389,00:00


Multi-layer RNN:The activations from one layer are passed to another RNN . This way multiple RNNs are stacked on top of each other.


In [101]:
class LMModel5(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = torch.zeros(n_layers, bs, n_hidden)
        
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(res)
    
    def reset(self): self.h.zero_()

We can get better performance with RNNs, if we call detach less often, i.e make the layer deeper and add more layers as well. this poses another problem. the deeper layers mean that there are more matrix multiplications to do. If matrix multipllcations are done with a number greater than 1, it leads to very huge numbers and opposite with numbers less than 1. Exploding or diminishing gradients .
Also, weights are usually in floating points - their precision drops with increase in values. This means that weights are either close to zero or go to inf . making the training process useles..


There are some methods to avoid exploding or vanishing gradients - modify the definition of layers like adding batch normalization in convolutional nets or resnets.
Also, another method is careful initialization of weights.
The problem in RNNs is avoided by introducing two types of layers - LSTM and GRU (Gate Recurrent Unit) and Long Short term memory

In LSTM, there are two hidden states compared to RNN which had just one hidden state.One for elping the output layer predict the right token given input words and another one for retaining memory of everything happened in the sentence. RNN does not keep track of many previous words.it suffers from very short term memory.

LSTM uses four neural network layers with sigmoid activation functions and tanh activation functions which squishes output in the range [-1,1]
tanh(x) = (e^x - e^-x)/(e^x+e^-x). the four nets are called gates.
the arrows for input and previous hidden state are joined together in the beginning itself. So, earlier for RNNs the embeddings dimension were different. Now input is n_in+n_hid and output dim is n_hid .this applies to all the nets(gates). 
The first gate from left to right is called forget gate. it has sigmoid activation function -> so the output is squished between 0 and 1. the result from this gate is multiplied with cell state from previous time step to determine if the values are to be thrown away or not. Values closer to 0 are discarded while values closer to 1 are passed along. 
This allows LSTM to forget things that are not important, for example if one essay is done, (xxbos encountered), the past could simply be forgotten.


The second gate is called input gate.It works with the third gate to update the cell state.The forget gate may have removed context specific information like gender pronouns which may be needed for better prediction. The input gate works with forget gate to update the cell state.similar to forget gate, input gate decides what words to be updated in the cell state and by what amount is decided by third cell gate(tanh between -1 and 1).


The last gate is the output gate. It determines which information from the cell state to use to generate te output. the cell state goes through tanh to make it between -1 and 1 before combining it with sigmoid output from previous network to generate hidden state at time step t.

In [102]:
class LSTMCell(Module):
    def __init__(self,ni,nh):
        self.forget_gate = nn.Linear(ni+nh,nh)
        self.input_gate = nn.Linear(ni+nh, nh)
        self.cell_gate = nn.Linear(ni+nh,nh)
        self.output_gate = nn.Linear(ni+nh,nh)
        
        
    def forward(self,input,state):
        h,c = state
        h = torch.cat([h,input],dim=1)
        forget = torch.sigmoid(self.input_gate(h))
        c = c*forget
        inp = torch.sigmoid(self.input_gate(h))
        cell = torch.tanh(self.cell_gate(h))
        c = c+inp*cell
        out = torch.sigmoid(self.output_gate(h))
        h = out*torch.tanh(c)
        return h,(h,c)
        

The code could be refactored. The four networks matrix multiplications could be combined together into one big matrix multiplication as the special fast kernel is to be launched only once on the GPU and parallelization is taken care of by GPU. stacking ni and nh on top of each othre is cumbersome. Hence two separate networks for ni and nh and their results are compared.

In [None]:
class LSTMCell(Module):
    def __init__(self,ni,nh):
        self.ih = nn.Linear(ni,4*nh)
        self.hh = nn.Linear(nh,4*nh)
        
    def forward(self,input,state):
        h,c = state
        gates = (self.ih(input) + self.hh(state)).chunk(4,1) # split the tensor into 4 smaller chunks
        ingate,forgetgate,outgate = map(torch.sigmoid, gates[:3])
        cellgate = gates[3].tanh()
        
        c = (forgetgate*c + ingate*cellgate)
        h = outgate*c.tanh()
        
        return h,(h,c)

In [103]:
#Pytorch chunk method working
t = torch.arange(0,10); t

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [104]:

t.chunk(2)

(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9]))

In [105]:
#To use multi-layer LSTM like RNN, we make use of fastai libraryto do that
class LMModel6(Module):
    def __init__(self,vocab_sz, n_hidden,n_layers):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.rnn = nn.LSTM(n_hidden,n_hidden,n_layers,batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = [torch.zeros(n_layers,bs,n_hidden) for _ in range(2)]
    
    def forward(self,x):
        res,h = self.rnn(self.i_h(x),self.h)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)
    
    def reset(self):
        for h in self.h : h.zero_()
            
        
        
        

In [106]:
learn = Learner(dls, LMModel6(len(vocab), 64, 2), 
                loss_func=CrossEntropyLossFlat(), 
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.024198,2.711789,0.295817,00:01
1,2.156858,1.789582,0.420898,00:01
2,1.601904,1.780979,0.493164,00:01
3,1.304933,2.12231,0.525309,00:02
4,1.043346,2.071455,0.579183,00:02
5,0.771994,1.738052,0.677246,00:01
6,0.542696,1.463323,0.731283,00:01
7,0.357117,1.509923,0.769287,00:01
8,0.222075,1.64333,0.80363,00:01
9,0.129873,1.521902,0.816732,00:01


RNNs are generally hard to train due to the problem of exploding and vanishing gradients.Using LSTM makes it better but it is prone to overfitting .There are techniques to prevent overfitting like regularization, dropout etc. Data Augmentation is hard to do with language models like images. With techniques like dropout, activation regularization and temporal regularization, good performance can be achieved!

In dropout some activations are zeroed to prevent frauding , to promote cooperation between neurons and to generate noisy activations so that there is good generalization.
As far as activation regularization is concerned, it is similar to weight decay. some penalty is added to loss function to reduce the sum squared of weights .
for LSTM, the final activations are penalized in the loss function to keep them small enough.
In NLP, we are generating tokens in order, meaning that the sentence should make sense when read in order. 
