# ULMFit

In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#export
from exp.nb_12a import *

## Data

In [None]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
ll = pickle.load(open(path/'ll_lm.pkl', 'rb'))

In [None]:
bs,bptt = 128,70
data = lm_databunchify(ll, bs, bptt)

In [None]:
vocab = ll.train.proc_x[1].vocab

## Finetuning the LM

Before tackling the classification task, we have to fine-tune our language model on the IMDB corpus. Make sure you have the `pretrained.pth` and `vocab.pkl` files in your IMDB data folder.

In [None]:
path.ls()

[PosixPath('/home/ubuntu/.fastai/data/imdb/imdb.vocab'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/data_lm.pkl'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/tmp_lm'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/models'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/vocab.pkl'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/test'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/pretrained.pth'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/unsup'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/tmp_clas'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/README'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/ll_lm.pkl'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/train')]

In [None]:
dps = tensor([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.5

In [None]:
emb_sz, nh, nl = 300, 300, 2
model = get_language_model(len(vocab), emb_sz, nh, nl, 0, input_p=dps[0], output_p=dps[1], weight_p=dps[2], 
                           embed_p=dps[3], hidden_p=dps[4])

Match embeddings

In [None]:
old_wgts  = torch.load(path/'pretrained.pth')
old_vocab = pickle.load(open(path/'vocab.pkl', 'rb'))

In [None]:
vocab.index('house'),old_vocab.index('house')

(347, 231)

In [None]:
house_wgt  = old_wgts['0.encoder.weight'][231]
house_bias = old_wgts['1.decoder.bias'][231] 

In [None]:
def match_embeds(old_wgts, old_vocab, new_vocab):
    """Remap the pretrained embedding matrix and decoder bias onto `new_vocab`.

    `old_wgts` is the pretrained state dict; it is updated in place and returned.
    Row `i` of the new embedding (and entry `i` of the new bias) is copied from the
    pretrained entry of `new_vocab[i]` when that token exists in `old_vocab`;
    otherwise it is set to the mean pretrained embedding / mean pretrained bias.
    The same new weight tensor is stored under the encoder, the embedding-dropout
    and the decoder weight keys.
    """
    wgts = old_wgts['0.encoder.weight']
    bias = old_wgts['1.decoder.bias']
    wgts_m,bias_m = wgts.mean(dim=0),bias.mean()
    new_wgts = wgts.new_zeros(len(new_vocab), wgts.size(1))
    new_bias = bias.new_zeros(len(new_vocab))
    # One dict lookup per token instead of `in` + `.index` list scans, which made the
    # remapping quadratic in the vocabulary size. Building the dict from the reversed
    # enumeration lets the first occurrence of a duplicated token win, like `list.index`.
    old_idx = {w: i for i,w in reversed(list(enumerate(old_vocab)))}
    for i,w in enumerate(new_vocab):
        idx = old_idx.get(w)
        if idx is None: new_wgts[i],new_bias[i] = wgts_m,bias_m
        else:           new_wgts[i],new_bias[i] = wgts[idx],bias[idx]
    old_wgts['0.encoder.weight']    = new_wgts
    old_wgts['0.encoder_dp.emb.weight'] = new_wgts
    old_wgts['1.decoder.weight']    = new_wgts
    old_wgts['1.decoder.bias']      = new_bias
    return old_wgts

In [None]:
wgts = match_embeds(old_wgts, old_vocab, vocab)

In [None]:
assert torch.allclose(wgts['0.encoder.weight'][347],house_wgt)
assert torch.allclose(wgts['1.decoder.bias'][347],house_bias)

In [None]:
model.load_state_dict(wgts)

In [None]:
# `state_dict` has to be called: passing the bound method saves the method object
# instead of the dictionary of weights.
torch.save(model.state_dict(), path/'tmp_clas'/'init.pth')

Split

In [None]:
model

SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(60003, 300, padding_idx=0)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60003, 300, padding_idx=0)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(300, 300, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(300, 300, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=300, out_features=60003, bias=True)
    (output_dp): RNNDropout()
  )
)

In [None]:
def lm_splitter(m):
    """Split the language model `m` into parameter groups.

    Returns a list of parameter lists: one group per RNN layer (paired with its hidden
    dropout), followed by a last group made of the embedding, the embedding dropout,
    the input dropout and the decoder `m[1]`.
    """
    groups = []
    for i in range(len(m[0].rnns)): groups.append(nn.Sequential(m[0].rnns[i], m[0].hidden_dps[i]))
    # Append instead of reassigning so the RNN groups built above are kept, and use
    # `encoder_dp` rather than the whole `m[0]`, which would also drag every RNN
    # parameter into this last group.
    groups.append(nn.Sequential(m[0].encoder, m[0].encoder_dp, m[0].input_dp, m[1]))
    return [list(o.parameters()) for o in groups]

In [None]:
for rnn in model[0].rnns:
    for p in rnn.parameters(): p.requires_grad_(False)

In [None]:
cbs = [partial(AvgStatsCallback,accuracy_flat),
       CudaCallback,
       Recorder,
       partial(GradientClipping, clip=0.1),
       partial(RNNTrainer, alpha=2., beta=1.),
       ProgressCallback]

In [None]:
learn = Learner(model, data, cross_entropy_flat, opt_func=adam_opt(), cb_funcs=cbs, splitter=lm_splitter)

In [None]:
lr = 2e-2
sched_lr  = combine_scheds([0.5,0.5], cos_1cycle_anneal(lr/10., lr, lr/1e5))
sched_mom = combine_scheds([0.5,0.5], cos_1cycle_anneal(0.8, 0.7, 0.8))
cbsched = [ParamScheduler('lr', sched_lr), ParamScheduler('mom', sched_mom)]

In [None]:
learn.fit(1, cbs=cbsched)

epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time
0,4.503475,0.245633,4.301658,0.262508,07:17


In [None]:
for rnn in model[0].rnns:
    for p in rnn.parameters(): p.requires_grad_(True)

In [None]:
lr = 2e-3
sched_lr  = combine_scheds([0.25,0.75], cos_1cycle_anneal(lr/10.,lr, 0))
sched_lr1 = combine_scheds([0.25,0.75], cos_1cycle_anneal(lr/20.,lr/2., 0))
sched_mom = combine_scheds([0.25,0.75], cos_1cycle_anneal(0.8,0.7, 0.8))
cbsched = [ParamScheduler('lr', [sched_lr1, sched_lr1, sched_lr]), ParamScheduler('mom', sched_mom)]

In [None]:
learn.fit(10, cbs=cbsched)

epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time
0,4.281489,0.261222,4.236016,0.269402,07:36
1,4.212239,0.268824,4.186624,0.274908,07:37
2,4.157397,0.27452,4.15208,0.278439,07:38
3,4.118742,0.278401,4.128485,0.280957,07:39
4,4.092115,0.280964,4.111547,0.282777,07:39
5,4.069495,0.28318,4.099864,0.28373,07:39
6,4.054394,0.284509,4.090035,0.285012,07:39
7,4.042381,0.28562,4.085082,0.285447,07:40
8,4.035013,0.286289,4.082782,0.285659,07:40
9,4.031946,0.286588,4.082381,0.28574,07:39


In [None]:
torch.save(learn.model[0].state_dict(), path/'finetuned_enc.pth')

In [None]:
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(path/'vocab_lm.pkl', 'wb') as f: pickle.dump(vocab, f)

In [None]:
torch.save(learn.model.state_dict(), path/'finetuned.pth')

## Classifier

We have to process the data again, otherwise pickle will complain. We also have to use the same vocab as the language model.

In [None]:
vocab = pickle.load(open(path/'vocab_lm.pkl', 'rb'))
proc_tok,proc_num,proc_cat = TokenizeProcessor(),NumericalizeProcessor(vocab=vocab),CategoryProcessor()

In [None]:
il = TextList.from_files(path, include=['train', 'test'])
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='test'))
ll = label_by_func(sd, parent_labeler, proc_x = [proc_tok, proc_num], proc_y=proc_cat)

In [None]:
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(path/'ll_clas.pkl', 'wb') as f: pickle.dump(ll, f)

In [None]:
ll = pickle.load(open(path/'ll_clas.pkl', 'rb'))
vocab = pickle.load(open(path/'vocab_lm.pkl', 'rb'))

In [None]:
bs,bptt = 64,70
data = clas_databunchify(ll, bs)

### Ignore padding

In [None]:
#export
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
x,y = next(iter(data.train_dl))

In [None]:
x.size()

torch.Size([64, 3311])

In [None]:
# Number of non-padding tokens in each row: the batch width minus the count of
# token id 1, which is the padding id in these batches.
lengths = x.size(1) - (x == 1).sum(1)
lengths[:5]

tensor([3311, 2425, 1782, 1486, 1481])

In [None]:
tst_emb = nn.Embedding(len(vocab), 300)

In [None]:
packed = pack_padded_sequence(tst_emb(x), lengths, batch_first=True)

In [None]:
dp = nn.Dropout(p=0.5)

In [None]:
tst = nn.LSTM(300, 300, 2)

In [None]:
y,h = tst(packed)

In [None]:
unpack = pad_packed_sequence(y, batch_first=True)

We need to change our model a little bit to use this.

In [None]:
#export
class AWD_LSTM1(nn.Module):
    "AWD-LSTM encoder (see https://arxiv.org/abs/1708.02182) that packs its input so padding is skipped."
    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
        super().__init__()
        self.bs = 1
        self.emb_sz,self.n_hid,self.n_layers = emb_sz,n_hid,n_layers
        self.pad_token = pad_token
        self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.encoder_dp = EmbeddingDropout(self.encoder, embed_p)
        # The first layer reads embeddings, the last one writes back to the embedding
        # size, every other size is `n_hid`.
        lstms = []
        for l in range(n_layers):
            n_in  = emb_sz if l == 0 else n_hid
            n_out = emb_sz if l == n_layers - 1 else n_hid
            lstms.append(nn.LSTM(n_in, n_out, 1, batch_first=True))
        self.rnns = nn.ModuleList([WeightDropout(lstm, weight_p) for lstm in lstms])
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for _ in range(n_layers)])

    def forward(self, input):
        "Return `(raw_outputs, outputs, mask)`: per-layer outputs before/after hidden dropout and the padding mask."
        _,seq_len = input.size()
        mask = input == self.pad_token
        lengths = seq_len - mask.long().sum(1)
        n_empty = (lengths == 0).sum()
        if n_empty > 0:
            # Rows made only of padding are dropped from the end of the batch, and the
            # stored hidden state is cut down to the rows that remain.
            input,lengths = input[:-n_empty],lengths[:-n_empty]
            n_kept = input.size(0)
            self.hidden = [(h[:,:n_kept], c[:,:n_kept]) for h,c in self.hidden]
        x = self.input_dp(self.encoder_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        last = self.n_layers - 1
        for l,(rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            packed = pack_padded_sequence(x, lengths, batch_first=True)
            packed_out,new_h = rnn(packed, self.hidden[l])
            x,_ = pad_packed_sequence(packed_out, batch_first=True)
            raw_outputs.append(x)
            # Hidden dropout is applied between layers, not after the last one.
            if l != last: x = hid_dp(x)
            outputs.append(x)
            new_hidden.append(new_h)
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs, mask

    def _one_hidden(self, l):
        "Zeroed state tensor for layer `l`, created from the model's first parameter."
        size = self.emb_sz if l == self.n_layers - 1 else self.n_hid
        return next(self.parameters()).new(1, self.bs, size).zero_()

    def reset(self):
        "Replace the stored hidden states with zeros for the current `self.bs`."
        fresh = []
        for l in range(self.n_layers):
            fresh.append((self._one_hidden(l), self._one_hidden(l)))
        self.hidden = fresh

### Concat pooling

We will use three things for the classification head of the model: the last hidden state, the average of all the hidden states and the maximum of all the hidden states. The trick is just to, once again, ignore the padding in the last element/average/maximum.

In [None]:
class Pooling(nn.Module):
    "Concat pooling over the last layer's outputs, ignoring the positions flagged in the mask."
    def forward(self, input):
        "Take `(raw_outputs, outputs, mask)`; return the last layer's output and `[last step | max | mean]`."
        _,outputs,mask = input
        output = outputs[-1]
        pad = mask[:,:,None]
        n_tokens = output.size(1) - mask.long().sum(dim=1)
        # Mean over real tokens only: padded steps contribute 0 and we divide by the true length.
        mean_pool = output.masked_fill(pad, 0).sum(dim=1)
        mean_pool = mean_pool / n_tokens.type(mean_pool.dtype)[:,None]
        # Padded steps are set to -inf so they can never be picked by the max.
        max_pool,_ = output.masked_fill(pad, -float('inf')).max(dim=1)
        rows = torch.arange(0, output.size(0))
        last_step = output[rows, n_tokens-1]
        return output, torch.cat([last_step, max_pool, mean_pool], 1)

In [None]:
enc = AWD_LSTM1(len(vocab), emb_sz, n_hid=nh, n_layers=nl, pad_token=1)
pool = Pooling()
enc.bs = bs
enc.reset()

In [None]:
x,y = next(iter(data.train_dl))
output,c = pool(enc(x))

We can check we have padding with 1s at the end of each text (except the first which is the longest).

In [None]:
x

tensor([[    2,     7,  1150,  ..., 16134,    24,     3],
        [    2,     7,    65,  ...,     1,     1,     1],
        [    2,     7,  4844,  ...,     1,     1,     1],
        ...,
        [    2,    12,  1480,  ...,     1,     1,     1],
        [    2,     7,   584,  ...,     1,     1,     1],
        [    2,    12,   655,  ...,     1,     1,     1]])

PyTorch puts 0s everywhere we had padding in the `output` when unpacking.

In [None]:
torch.allclose((output.sum(dim=2) == 0).float(), (x==1).float())

True

So the last hidden state isn't the last element of `output`. Let's check we got everything right. 

In [None]:
# For every sample, recompute the three pooled features from its unpadded outputs and
# compare them with the matching slice of `c`: [last step | max | mean], 300 wide each.
for i in range(bs):
    length = x.size(1) - (x[i]==1).long().sum()  # number of non-padding tokens (pad id is 1)
    out_unpad = output[i,:length]  # outputs for the real tokens only
    assert torch.allclose(out_unpad[-1], c[i,:300])
    assert torch.allclose(out_unpad.max(0)[0], c[i,300:600])
    assert torch.allclose(out_unpad.mean(0), c[i,600:])

Our pooling layer properly ignored the padding, so now let's group it with a classifier.

In [None]:
def bn_drop_lin(n_in, n_out, bn=True, p=0., actn=None):
    "List of layers: optional `BatchNorm1d(n_in)`, optional `Dropout(p)`, `Linear(n_in, n_out)`, optional `actn`."
    stack = []
    if bn: stack.append(nn.BatchNorm1d(n_in))
    if p != 0: stack += [nn.Dropout(p)]
    stack += [nn.Linear(n_in, n_out)]
    return stack if actn is None else stack + [actn]

In [None]:
class PoolingLinearClassifier(nn.Module):
    "Concat-pool the encoder's last-layer outputs, then classify them with a stack of linear blocks."

    def __init__(self, layers, drops):
        super().__init__()
        # ReLU after every block except the last one, which has no activation.
        relu = nn.ReLU(inplace=True)
        activs = [relu] * (len(layers) - 2) + [None]
        blocks = []
        for n_in,n_out,p,actn in zip(layers[:-1], layers[1:], drops, activs):
            blocks.extend(bn_drop_lin(n_in, n_out, p=p, actn=actn))
        self.layers = nn.Sequential(*blocks)

    def forward(self, input):
        "Take `(raw_outputs, outputs, mask)` and return the classifier output."
        _,outputs,mask = input
        output = outputs[-1]
        pad = mask[:,:,None]
        n_tokens = output.size(1) - mask.long().sum(dim=1)
        # Mean over real tokens only: padded steps contribute 0 and we divide by the true length.
        mean_pool = output.masked_fill(pad, 0).sum(dim=1)
        mean_pool = mean_pool / n_tokens.type(mean_pool.dtype)[:,None]
        # Padded steps are set to -inf so they can never be picked by the max.
        max_pool,_ = output.masked_fill(pad, -float('inf')).max(dim=1)
        rows = torch.arange(0, output.size(0))
        pooled = torch.cat([output[rows, n_tokens-1], max_pool, mean_pool], 1) #Concat pooling.
        return self.layers(pooled)

Then we just have to feed our texts to those two blocks. We can't give a whole text to the AWD_LSTM at once or we'll get an out-of-memory (OOM) error, so we go through it in chunks of `bptt` length, which regularly detaches the history of our hidden states.

In [None]:
def pad_tensor(t, bs, val=0.):
    "Pad `t` along dim 0 with rows equal to `val` until it has `bs` rows; return `t` itself if it is big enough."
    n_missing = bs - t.size(0)
    if n_missing <= 0: return t
    filler = val + t.new_zeros(n_missing, *t.shape[1:])
    return torch.cat([t, filler])

In [None]:
class SentenceEncoder(nn.Module):
    "Run `module` over a batch of texts in chunks of `bptt` tokens and stitch the chunk results together."
    def __init__(self, module, bptt, pad_idx=1):
        super().__init__()
        self.bptt,self.module,self.pad_idx = bptt,module,pad_idx

    def concat(self, arrs, bs):
        "Per layer, pad each chunk's output to `bs` rows with zeros and concatenate the chunks along dim 1."
        # `pad_tensor` is the padding helper defined in this notebook; the previous
        # `pad_zero` name is not defined here.
        return [torch.cat([pad_tensor(l[si],bs) for l in arrs], dim=1) for si in range(len(arrs[0]))]
    
    def forward(self, input):
        "Return `(raw_outputs, outputs, mask)` for the whole sequence, each concatenated over the chunks."
        bs,sl = input.size()
        # Start every batch from fresh hidden states sized for this batch.
        self.module.bs = bs
        self.module.reset()
        raw_outputs,outputs,masks = [],[],[]
        for i in range(0, sl, self.bptt):
            r,o,m = self.module(input[:,i: min(i+self.bptt, sl)])
            # If a chunk's mask has fewer than `bs` rows, the missing rows are filled with 1.
            masks.append(pad_tensor(m, bs, 1))
            raw_outputs.append(r)
            outputs.append(o)
        return self.concat(raw_outputs, bs),self.concat(outputs, bs),torch.cat(masks,dim=1)

In [None]:
def get_text_classifier(vocab_sz, emb_sz, n_hid, n_layers, n_out, pad_token, bptt, layers=None,
                        drops=None, output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
    "Build a text classifier: an `AWD_LSTM1` wrapped in a `SentenceEncoder`, followed by a `PoolingLinearClassifier`."
    hidden = [50] if layers is None else layers
    hidden_drops = [0.1] * len(hidden) if drops is None else drops
    lstm = AWD_LSTM1(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token,
                     hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    encoder = SentenceEncoder(lstm, bptt)
    # The head takes the concat-pooled features (3 * emb_sz wide); `output_p` is the
    # dropout of its first block.
    sizes = [3 * emb_sz] + hidden + [n_out]
    ps = [output_p] + hidden_drops
    head = PoolingLinearClassifier(sizes, ps)
    return SequentialRNN(encoder, head)

In [None]:
emb_sz, nh, nl = 300, 300, 2
dps = tensor([0.4, 0.4, 0.5, 0.05, 0.3]) * 0.25
model = get_text_classifier(len(vocab), emb_sz, nh, nl, 2, 1, bptt, input_p=dps[0], output_p=dps[1], weight_p=dps[2], 
                           embed_p=dps[3], hidden_p=dps[4])

### Training

We load our pretrained encoder and freeze it.

In [None]:
def class_splitter(m):
    """Split the classifier `m` into parameter groups.

    `m[0]` is the `SentenceEncoder`; the AWD-LSTM it wraps is `m[0].module`.
    Returns a list of parameter lists: the embedding group (embedding, embedding
    dropout, input dropout), one group per RNN layer (with its hidden dropout),
    and finally the classifier head `m[1]`.
    """
    # The encoder's layers are attributes of the wrapped module, not of the
    # SentenceEncoder itself; use `encoder_dp` rather than the whole encoder so the
    # RNN parameters only appear in their own groups.
    enc = m[0].module
    groups = [nn.Sequential(enc.encoder, enc.encoder_dp, enc.input_dp)]
    for i in range(len(enc.rnns)): groups.append(nn.Sequential(enc.rnns[i], enc.hidden_dps[i]))
    groups.append(m[1])
    return [list(o.parameters()) for o in groups]

In [None]:
for p in model[0].parameters(): p.requires_grad_(False)

In [None]:
cbs = [partial(AvgStatsCallback,accuracy),
       CudaCallback,
       Recorder,
       partial(GradientClipping, clip=0.1),
       ProgressCallback]

In [None]:
model[0].module.load_state_dict(torch.load(path/'finetuned_enc.pth'))

In [None]:
learn = Learner(model, data, F.cross_entropy, opt_func=adam_opt(), cb_funcs=cbs)#, splitter=class_splitter)

In [None]:
lr = 1e-2
sched_lr  = combine_scheds([0.3,0.7], cos_1cycle_anneal(lr/25., lr, lr/1e5))
sched_mom = combine_scheds([0.5,0.5], cos_1cycle_anneal(0.8, 0.7, 0.8))
cbsched = [ParamScheduler('lr', sched_lr), ParamScheduler('mom', sched_mom)]

In [None]:
learn.fit(1, cbs=cbsched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.356794,0.84432,0.28568,0.88012,00:54


In [None]:
for p in model[0].module.rnns[-1].parameters(): p.requires_grad_(True)

In [None]:
lr = 5e-3
sched_lr  = combine_scheds([0.25,0.75], cos_1cycle_anneal(lr/10., lr, lr/1e5))
sched_lr1  = combine_scheds([0.25,0.75], cos_1cycle_anneal(lr/20., lr/2, lr/2e5))
sched_mom = combine_scheds([0.25,0.75], cos_1cycle_anneal(0.8, 0.7, 0.8))
cbsched = [ParamScheduler('lr', [sched_lr1, sched_lr1, sched_lr1, sched_lr]), ParamScheduler('mom', sched_mom)]

In [None]:
learn.fit(1, cbs=cbsched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.267781,0.89044,0.210491,0.91484,01:01


In [None]:
for p in model[0].parameters(): p.requires_grad_(True)

In [None]:
lr = 1e-3
sched_lrs = [combine_scheds([0.25,0.75], cos_1cycle_anneal(lr/((2**i)*10.), lr/(2**i), lr/((2**i)*1e5))) for i in range(4)]
sched_lrs.reverse()
sched_mom = combine_scheds([0.25,0.75], cos_1cycle_anneal(0.8, 0.7, 0.8))
cbsched = [ParamScheduler('lr', sched_lrs), ParamScheduler('mom', sched_mom)]

In [None]:
learn.fit(2, cbs=cbsched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.220051,0.91272,0.201665,0.92128,01:11
1,0.209577,0.91944,0.198815,0.92144,01:12


In [None]:
x,y = next(iter(data.valid_dl))

Predicting on the padded batch or on the individual unpadded samples gives the same results.

In [None]:
pred_batch = learn.model.eval()(x.cuda())

In [None]:
# Predict each sample on its own, with its trailing padding removed.
pred_ind = []
for inp in x:
    length = x.size(1) - (inp == 1).long().sum()  # number of non-padding tokens (pad id is 1)
    inp = inp[:length]  # drop the padding at the end
    pred_ind.append(learn.model.eval()(inp[None].cuda()))  # `inp[None]` adds a batch dimension of 1

In [None]:
assert torch.allclose(pred_batch, torch.cat(pred_ind))