# References:
* [Original Notebook](https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb)

* [Exploding and vanishing gradients](https://www.volkerschatz.com/science/float.html)


In [1]:
from fastai.text.all import *
# Local path of the IMDB dataset. Note that `path` is reassigned to the
# HUMAN_NUMBERS dataset in a later cell, which is what the rest of the notebook reads from.
path = untar_data(URLs.IMDB)

In [2]:
# Collect the text files found under the 'train' and 'test' folders of `path`.
# NOTE(review): `files` is not referenced again in the visible part of the notebook.
files = get_text_files(path, folders=['train', 'test'])

In [3]:
# Switch to the Human Numbers dataset; `path/'train.txt'` and `path/'valid.txt'` are read next.
path = untar_data(URLs.HUMAN_NUMBERS)


In [4]:
# Gather every line of the train split, then of the valid split, into one fastcore list.
lines = L()
for split_file in ('train.txt', 'valid.txt'):
    with open(path/split_file) as f:
        lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [5]:
# Concatenate all lines into one long stream; ' . ' marks where one number ends and the next begins.
# `lines` holds 9,998 spelled-out numbers, starting at 'one'.
text = ' . '.join(map(str.strip, lines))
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [6]:
# Tokenize by splitting on single spaces; the ' . ' separators become standalone '.' tokens.
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [7]:
# Vocabulary: the distinct tokens of the stream (30 entries, per the output shown below the cell).
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [8]:
# Count how often each token occurs in the stream.
# Counter makes a single pass over `tokens`; calling tokens.count() once per
# vocabulary entry would rescan the whole token list for every entry.
from collections import Counter
token_counts = dict(Counter(tokens))
token_counts['.']

9997

In [9]:
# Numericalize: each vocab entry gets its position in `vocab` as id, then the whole stream is encoded.
word2idx = dict(zip(vocab, range(len(vocab))))
nums = L(word2idx[tok] for tok in tokens)
nums  # id of each token in text

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [10]:
'''
split text:
  * first_3 tokens: inputs
  * fourth token: prediction
'''
# Step through the token stream 3 tokens at a time: each sample pairs three
# consecutive tokens with the token that immediately follows them.
L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [11]:
# Same split as above, but on the numericalized stream, which is what the model actually consumes:
# each sample is (tensor of three token ids, id of the fourth token).
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0, len(nums)-4,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [12]:
# Sequential 80/20 split (not random): the first 80% of the samples are used for
# training and the remaining 20% for validation, with shuffling disabled.
bs = 64
cut = int(len(seqs)*0.8)
# Pass `bs` itself so the batch size is defined in exactly one place.
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)
dls

<fastai.data.core.DataLoaders at 0x7e24faf1b730>

In [13]:
# first linear layer will use only the first word's embedding as activations, the second layer will use the second word's embedding plus the first layer's output activations, and the third layer will use the third word's embedding plus the second layer's output activations
# The key effect of this is that every word is interpreted in the information context of any words preceding it.

# each of these three layers will use the same weight matrix
# Since layer weights do not change, you might think of the sequential layers as "the same layer" repeated

class LMModel1(Module):
  "Predict the next token from three input tokens using three unrolled steps that share one hidden layer."
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # i_h: input to hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # h_h: hidden to hidden, reused at every step
    self.h_o = nn.Linear(n_hidden, vocab_sz)     # h_o: hidden to output (scores over the vocab)

  def forward(self, x):
    # x[:, k] selects the k-th token id of every sample; embed the three positions up front.
    first, second, third = self.i_h(x[:,0]), self.i_h(x[:,1]), self.i_h(x[:,2])
    h = F.relu(self.h_h(first))
    # Each later step adds the next token's embedding to the running activations.
    h = F.relu(self.h_h(h + second))
    h = F.relu(self.h_h(h + third))
    return self.h_o(h)



# Train LMModel1 (hidden size 64) with cross-entropy loss, reporting accuracy;
# fit_one_cycle is called with 4 epochs and 1e-3 as the learning-rate argument.
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.829674,1.999052,0.479439,00:08
1,1.398699,1.768649,0.482291,00:08
2,1.395784,1.656381,0.492988,00:07
3,1.356839,1.714967,0.417162,00:10


In [14]:
# Baseline: the accuracy reached by always predicting the most common target token of the validation set

n, counts = 0, torch.zeros(len(vocab))  # n: number of targets seen so far; counts: one counter per vocab id
for x, y in dls.valid:                  # iterate over the validation batches (x is unused here)
  n+= y.shape[0]          # number of targets in this batch
  for i in range_of(vocab): counts[i] += (y==i).long().sum()  # how many targets in the batch equal token id i

idx = torch.argmax(counts)  # id of token with max. count in validation data
idx, vocab[idx.item()], counts[idx].item()/n  # counts[idx].item()/n is the fraction of targets equal to that token


(tensor(29), 'thousand', 0.15165200855716662)

In [15]:
# Per-token target counts over the validation set, indexed by vocab id.
counts

tensor([106., 637., 159., 107., 106., 159., 108., 106., 464., 442.,   6.,   7.,
          6.,   6.,   7.,   6.,   6.,   7.,   6.,   6.,  64.,  63.,  63.,  64.,
         63.,  63.,  66.,  66., 600., 638.])

# First RNN

In [16]:
# for loop
# we will be able to apply our module equally well to token sequences of different lengths
class LMModel2(Module):
  """Loop-based (recurrent) form of the unrolled model: one shared hidden layer applied per token.

  `forward` expects `x` to be indexable as `x[:, i]` (one column of token ids per
  position) and returns the output-layer scores computed after the last position.
  """
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input to hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden to hidden, shared by every step
    self.h_o = nn.Linear(n_hidden,vocab_sz)      # hidden to output
  def forward(self,x):
    h=0
    # Iterate over however many token positions the batch has rather than a
    # hard-coded 3, so the module really works for sequences of any length.
    for i in range(x.shape[1]):
      h = h + self.i_h(x[:, i])
      h = F.relu(self.h_h(h))
    return self.h_o(h)
# Same training setup as for LMModel1, now with the loop-based LMModel2.
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.760143,1.976832,0.47421,00:08
1,1.378174,1.771691,0.474923,00:07
2,1.401172,1.609034,0.489898,00:09
3,1.36431,1.603159,0.491799,00:08


In [25]:
'''
## Improving RNN
 stateful, because it remembers its activations between different calls to forward, which represent its use for different samples in the batch

jargon: Back propagation through time (BPTT): Treating a neural net with effectively one layer per time step (usually refactored using a loop) as one big model, and calculating gradients on it in the usual way. To avoid running out of memory and time, we usually use truncated BPTT, which "detaches" the history of computation steps in the hidden state every few time steps.
'''

class LMModel3(Module):
  "Stateful RNN: the hidden activations are kept on the module and carried over from one call to the next."
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = 0  # hidden state remembered from the previous batch (0 = no history yet)
  def forward(self, x):
    state = self.h
    for pos in range(3):
      state = F.relu(self.h_h(state + self.i_h(x[:, pos])))
    # Keep the values for the next call but cut the autograd history (truncated BPTT).
    self.h = state.detach()
    return self.h_o(state)
  def reset(self):
    # Forget everything carried over from earlier batches.
    self.h = 0

In [26]:
# m: how many full batches of size bs fit into seqs.
m = len(seqs)//bs
# Not the last expression of the cell, so this tuple is not displayed.
m,bs,len(seqs)

'''
The first batch will be composed of the samples:

(0, m, 2*m, ..., (bs-1)*m)
the second batch of the samples:

(1, m+1, 2*m+1, ..., (bs-1)*m+1)
'''

'\nThe first batch will be composed of the samples:\n\n(0, m, 2*m, ..., (bs-1)*m)\nthe second batch of the samples:\n\n(1, m+1, 2*m+1, ..., (bs-1)*m+1)\n'

In [27]:
def group_chunks(ds, bs):
    """Reorder `ds` so that consecutive, unshuffled batches of size `bs` continue each other.

    With m = len(ds) // bs, batch i is made of ds[i], ds[i+m], ..., ds[i+(bs-1)*m],
    so row j of batch i+1 directly follows row j of batch i in the original order.
    Items beyond the first m*bs are dropped.
    """
    chunk_len = len(ds) // bs
    return L(ds[row + chunk_len*col] for row in range(chunk_len) for col in range(bs))

In [23]:
# drop_last=True when building our DataLoaders to drop the last batch that does not have a shape of bs

# Sequential 80/20 split; each part is regrouped with group_chunks so that, without
# shuffling, every batch row continues the same row of the previous batch.
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False)

In [28]:
# Train the stateful model. ModelResetter is passed as a callback;
# presumably it is what invokes the model's `reset` method — confirm against the fastai docs.
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.709993,1.794101,0.486298,00:10
1,1.285244,1.788375,0.418269,00:11
2,1.11366,1.558468,0.482692,00:06
3,1.026124,1.499747,0.51875,00:08
4,0.979499,1.534581,0.527644,00:06
5,0.930774,1.494659,0.558654,00:09
6,0.89556,1.573395,0.553606,00:07
7,0.830991,1.574956,0.559135,00:08
8,0.790041,1.615259,0.55649,00:07
9,0.777816,1.62022,0.554087,00:12


## Creating more signals
* It would be better if we predicted the next word after every single word, rather than every three words.

In [31]:
sl = 16 # Sequence length
# Each sample is (sl input ids, the same window shifted one token to the right),
# i.e. there is a target for every input position; windows start every sl tokens.
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1])) for i in range(0, len(nums)-sl-1, sl))
cut = int(len(seqs)*0.8)
# Same sequential split and chunk regrouping as before, now on the longer sequences.
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False
                             )

In [33]:
# Decode the first element of seqs back to words: it holds an input and a target
# sequence of the same length, the target being the input shifted by one token.
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [41]:
# Modify the model so that it outputs a prediction after every word rather than only at the end of the sequence.
class LMModel4(Module):
  """Stateful RNN that emits one prediction per input position.

  `forward` expects `x` to be indexable as `x[:, i]` and returns the per-position
  outputs stacked along dim 1. The hidden state is kept between calls (detached
  from the autograd history) until `reset` is called.
  """
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = 0
  def forward(self, x):
    outs = []
    # Take the number of steps from the input itself instead of the notebook-level
    # global `sl`, so the model does not silently depend on a variable defined elsewhere.
    for i in range(x.shape[1]):
      self.h = self.h + self.i_h(x[:, i])
      self.h = F.relu(self.h_h(self.h))
      outs.append(self.h_o(self.h))
    # Keep the values for the next batch but drop the gradient history.
    self.h = self.h.detach()
    return torch.stack(outs, dim=1)
  def reset(self):
    self.h=0

def loss_func(inp, targ):
  '''
  Cross-entropy over every position of every sequence.

  * inp: model outputs whose last dimension holds the class scores
    (bs x sl x vocab_sz for the model above, since it stacks on dim=1).
  * targ: target ids with one entry per position (bs x sl).

  Both are flattened so that F.cross_entropy sees one row of scores per target.
  The number of classes is read from `inp` itself rather than from the global
  `vocab`, so the function has no hidden dependency on notebook state.
  '''
  return F.cross_entropy(inp.reshape(-1, inp.shape[-1]), targ.reshape(-1))

In [42]:
# Train LMModel4 with the flattening loss defined above.
# NOTE(review): 15 epochs are requested here, but the recorded table below lists only epochs 0-9.
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.313459,3.165826,0.171387,00:05
1,2.408147,1.968791,0.459717,00:08
2,1.763615,1.820372,0.479167,00:06
3,1.468097,1.780695,0.518392,00:08
4,1.277386,1.958829,0.546875,00:06
5,1.148399,1.787448,0.566325,00:08
6,1.030933,1.8361,0.566325,00:06
7,0.941216,1.827069,0.574382,00:08
8,0.882223,1.80158,0.618734,00:06
9,0.812011,1.917634,0.627848,00:07


## Multilayer RNN
* go deeper: adding more layers between hidden state and output activation (now we have one)

In [49]:
class LMModel5(Module):
  """Multilayer stateful RNN built on `nn.RNN`.

  The hidden state has shape (n_layers, bs, n_hidden), where `bs` is the
  notebook-level batch size, so every batch fed to `forward` must have `bs` rows.
  """
  def __init__(self, vocab_sz, n_hidden, n_layers):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = torch.zeros(n_layers, bs, n_hidden)
  def  forward(self, x):
    # `self.h` is a plain tensor attribute, so it is not moved when the module is
    # moved to another device; align it with the input before calling the RNN.
    res, h = self.rnn(self.i_h(x), self.h.to(x.device))
    # Keep the state for the next batch without its gradient history.
    self.h = h.detach()
    return self.h_o(res)
  def reset(self): self.h.zero_()

# Train a 2-layer LMModel5; CrossEntropyLossFlat replaces the hand-written flattening loss.
# NOTE(review): 15 epochs are requested here, but the recorded table below lists only epochs 0-9.
learn = Learner(dls, LMModel5(len(vocab), 64, 2),
                              loss_func=CrossEntropyLossFlat(),
                              metrics=accuracy,
                              cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.04569,2.577089,0.457194,00:06
1,2.142148,1.733856,0.471517,00:04
2,1.692065,1.81252,0.382975,00:07
3,1.460941,1.700274,0.489665,00:06
4,1.299092,1.736076,0.502116,00:05
5,1.15191,1.94803,0.50529,00:06
6,1.019612,1.765673,0.524333,00:04
7,0.91093,1.867434,0.538656,00:05
8,0.822357,1.889408,0.548014,00:06
9,0.751477,1.974125,0.554769,00:04


# Exploding and disappearing activations
* In RNNs, GRU or LSTM layers are often used to mitigate exploding or vanishing gradients.
* An LSTM carries two states: the hidden state `h` and the cell state `c`.

In [None]:
class LSTMCell(Module):
  """One LSTM step written with four separate gate layers.

  `forward(input, state)` takes the current input and a `(h, c)` pair (hidden
  state and cell state) and returns `(h, (h, c))` with the updated states.
  """
  def __init__(self, ni, nh):
    # Every gate sees the previous hidden state concatenated with the input.
    self.forget_gate = nn.Linear(ni + nh, nh)
    self.input_gate = nn.Linear(ni + nh, nh)
    self.cell_gate = nn.Linear(ni + nh, nh)
    self.output_gate = nn.Linear(ni + nh, nh)

  def forward(self, input, state):
    h,c = state
    h = torch.cat([h, input], dim=1)
    # Forget gate: decide how much of the old cell state to keep.
    forget = torch.sigmoid(self.forget_gate(h))
    c = c * forget
    # Input and cell gates: decide what new information is written to the cell state.
    inp = torch.sigmoid(self.input_gate(h))
    cell = torch.tanh(self.cell_gate(h))
    c = c + inp * cell
    # Output gate: decide how much of the cell state is exposed as the new hidden state.
    out = torch.sigmoid(self.output_gate(h))
    h = out * torch.tanh(c)
    return h, (h,c)

In [None]:
# One big matrix multiplication instead of four small ones.
class LSTMCell(Module):
  """One LSTM step computing all four gates with two fused linear layers.

  `forward(input, state)` takes the current input and a `(h, c)` pair and
  returns `(h, (h, c))` with the updated hidden and cell states.
  """
  def __init__(self, ni, nh):
    self.ih = nn.Linear(ni, 4*nh)
    self.hh = nn.Linear(nh, 4*nh)
  def forward(self, input, state):
    h,c = state
    # One big multiplication for all the gates is better than 4 smaller ones;
    # chunk(4, 1) then splits the result into the four gates along dim 1.
    gates = (self.ih(input) + self.hh(h)).chunk(4, 1)
    ingate, forgetgate, outgate = map(torch.sigmoid, gates[:3])
    cellgate = gates[3].tanh()

    c = (forgetgate*c) + (ingate*cellgate)
    h = outgate * c.tanh()
    return h, (h,c)



In [52]:
# PyTorch's chunk method splits a tensor into pieces; here the 10 elements are split into 2 chunks.
t = torch.arange(0, 10); print(t)

t.chunk(2)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9]))

In [56]:
# Training Language model using LSTM
class LMModel6(Module):
  """Multilayer stateful language model built on `nn.LSTM`.

  `self.h` holds the two LSTM states, each of shape (n_layers, bs, n_hidden),
  where `bs` is the notebook-level batch size, so every batch fed to `forward`
  must have `bs` rows.
  """
  def __init__(self, vocab_sz, n_hidden, n_layers):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
  def forward(self, x):
    # The stored states are plain tensors, so they are not moved when the module
    # is moved to another device; align them with the input before calling the LSTM.
    state = [h_.to(x.device) for h_ in self.h]
    res, h = self.rnn(self.i_h(x), state)
    # Keep the states for the next batch without their gradient history.
    self.h = [h_.detach() for h_ in h]
    return self.h_o(res)

  def reset(self):
    for h in self.h: h.zero_()


# Train a 2-layer LSTM language model; note this cell calls `fit`, not `fit_one_cycle`.
# NOTE(review): 15 epochs are requested here, but the recorded table below lists only epochs 0-9.
learn = Learner(dls, LMModel6(len(vocab), 64, 2),
                loss_func=CrossEntropyLossFlat(),
                metrics=accuracy, cbs=ModelResetter)
learn.fit(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.691822,2.408979,0.204508,00:07
1,1.414466,1.788386,0.44043,00:06
2,1.267612,1.811866,0.45638,00:08
3,1.128752,2.167782,0.522542,00:07
4,0.966668,2.022843,0.589437,00:06
5,0.762194,1.734247,0.64738,00:05
6,0.532978,1.650149,0.665934,00:06
7,0.337542,1.50457,0.703206,00:05
8,0.204079,1.234959,0.767659,00:05
9,0.118138,0.936136,0.791341,00:07


# Regularizing LSTM with dropout
* Dropout was introduced by Geoffrey Hinton
* LSTM regularization
  1. AR: Activation Regularization
    * Penalize the final activations produced by the LSTM so that they stay as small as possible (instead of penalizing the weights).
    * loss += alpha * activations.pow(2).mean()
  2. TAR: Temporal Activation Regularization
    * Add a penalty to the loss so that the difference between two consecutive activations is as small as possible.
    * loss += beta * (activations[:,1:] - activations[:,:-1]).pow(2).mean()

  * alpha and beta are then two hyperparameters to tune.

* weight tying. In a language model, the input embeddings represent a mapping from English words to activations, and the output hidden layer represents a mapping from activations to English words. We might expect, intuitively, that these mappings could be the same. We can represent this in PyTorch by assigning the same weight matrix to each of these layers:

`self.h_o.weight = self.i_h.weight`


In [None]:
class Dropout(Module):
  """Hand-written dropout layer.

  In training mode each activation is zeroed with probability `p` and the
  survivors are scaled by 1/(1-p); outside training the input is returned unchanged.
  """
  def __init__(self, p): self.p = p
  def forward(self, x):
    if not self.training: return x
    # With p == 1 everything is dropped; return zeros directly instead of dividing by zero below.
    if self.p >= 1: return torch.zeros_like(x)
    keep = 1 - self.p  # read the probability from the instance; a bare `p` is undefined here
    mask = torch.empty_like(x).bernoulli_(keep)
    return x * mask.div_(keep)


In [60]:
class LMModel7(Module):
  """LSTM language model with dropout and weight tying.

  `forward` returns a tuple `(scores, raw, out)`: the output-layer scores, the
  raw LSTM activations and the activations after dropout.
  `self.h` holds the two LSTM states, each of shape (n_layers, bs, n_hidden),
  where `bs` is the notebook-level batch size.
  """
  def __init__(self, vocab_sz, n_hidden, n_layers, p):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
    self.drop=nn.Dropout(p)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    # Weight tying: the layer's parameter is called `weight`. Assigning to
    # `weights` leaves the layer's own `weight` untouched, so nothing is tied.
    self.h_o.weight = self.i_h.weight
    self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

  def forward(self, x):
    # The stored states are plain tensors and do not follow the module to another
    # device; align them with the input before calling the LSTM.
    state = [h_.to(x.device) for h_ in self.h]
    raw, h = self.rnn(self.i_h(x), state)
    out = self.drop(raw)
    # Keep the states for the next batch without their gradient history.
    self.h = [h_.detach() for h_ in h]
    return self.h_o(out), raw, out

  def reset(self):
    for h in self.h: h.zero_()

  def predict_(self, input):
    """Return the id of the most likely next token after `input`.

    `input` is a single token id or a sequence/tensor of token ids. The
    prediction runs on a batch of one with a fresh zero state, so it neither
    depends on nor overwrites the training state in `self.h` (whose batch
    dimension is `bs` and would not match a single sequence).
    """
    device = self.i_h.weight.device
    ids = torch.as_tensor(input, dtype=torch.long, device=device).reshape(1, -1)
    state = tuple(torch.zeros(self.rnn.num_layers, 1, self.rnn.hidden_size, device=device)
                  for _ in range(2))
    was_training = self.training
    self.eval()  # switch dropout off while predicting
    try:
      with torch.no_grad():
        raw, _ = self.rnn(self.i_h(ids), state)
        scores = self.h_o(self.drop(raw))
    finally:
      self.train(was_training)  # restore whichever mode the model was in
    return torch.argmax(scores[0, -1]).item()  # highest-scoring token at the last position



In [62]:
# Train the regularized model (2 layers, dropout 0.4) with weight decay 0.1.
# No `cbs` are passed here; presumably TextLearner supplies the state reset and
# AR/TAR regularization itself — confirm against the fastai docs.
# NOTE(review): 15 epochs are requested here, but the recorded table below lists only epochs 0-9.
learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4),
                    loss_func=CrossEntropyLossFlat(), metrics=accuracy)
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.917637,2.48539,0.438477,00:07
1,1.992776,1.760049,0.433594,00:08
2,1.561213,1.801524,0.490723,00:06
3,1.334763,1.862661,0.500163,00:08
4,1.181411,1.914924,0.546143,00:06
5,1.031555,1.750698,0.629639,00:08
6,0.786314,1.593715,0.721517,00:06
7,0.539222,1.788325,0.737549,00:08
8,0.354956,1.496922,0.785482,00:06
9,0.228535,1.640259,0.781576,00:09


# Todo: Make prediction based on input text

In [105]:
# Attempt to predict from raw text read out of test.csv.
df_test = pd.read_csv('test.csv');df_test.excerpt
# Bind the test DataLoader to the same name that get_preds uses below.
dl_test = learn.dls.test_dl(df_test.excerpt)
# NOTE: as the recorded TypeError shows, the raw strings reach the embedding layer
# un-numericalized; the text still has to be tokenized and mapped through `word2idx`
# before it can be fed to the model (open TODO).
preds = learn.get_preds(dl=dl_test)
preds


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not str

In [106]:
# Show the raw text column that was read from test.csv.
df_test.excerpt

0    one
Name: excerpt, dtype: object

In [104]:
# Write the header with `>` so that re-running this cell recreates test.csv instead of appending duplicate rows.
!echo "excerpt" > test.csv
!echo "one" >> test.csv

* Mero school: initial crawling code