In [1]:
# Notebook setup: enable the autoreload extension (mode 2) and inline matplotlib output.
% reload_ext autoreload
% autoreload 2
% matplotlib inline
import os
# Expose only GPU index 3 to CUDA; this is set before the fastai imports in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [4]:
from fastai.text import *
from fastai.lm_rnn import *
from sebastian.eval import eval_ner

In [5]:
"""run this cell for only forward direction"""
class LinearDecoder(nn.Module):
    """Dropout followed by a linear projection of the last encoder output.

    Args:
        n_out: size of the projection output (number of classes).
        n_hid: size of the last dimension of the tensor being projected.
        dropout: value handed to `LockedDropout`.
        tie_encoder: optional module whose `weight` replaces the projection weight.
        bias: whether the projection has a bias term (zero-initialised when present).
    """
    initrange=0.1

    def __init__(self, n_out, n_hid, dropout, tie_encoder=None, bias=False):
        super().__init__()
        projection = nn.Linear(n_hid, n_out, bias=bias)
        projection.weight.data.uniform_(-self.initrange, self.initrange)
        self.decoder = projection
        self.dropout = LockedDropout(dropout)
        if bias:
            projection.bias.data.zero_()
        if tie_encoder:
            self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        """Project `outputs[-1]` and return `(result, raw_outputs, outputs)`.

        `input` is a `(raw_outputs, outputs)` pair; only the last element of
        `outputs` (a 3-D tensor) is decoded. The first two dimensions are
        flattened, so `result` has one row per position.
        """
        raw_outputs, outputs = input
        dropped = self.dropout(outputs[-1])
        n_rows = dropped.size(0) * dropped.size(1)
        logits = self.decoder(dropped.view(n_rows, dropped.size(2)))
        return logits.view(-1, logits.size(1)), raw_outputs, outputs

    
class SequentialRNN(nn.Sequential):
    """`nn.Sequential` that can forward a `reset()` call to its children."""

    def reset(self):
        """Call `reset()` on every direct child that defines it; others are skipped."""
        for child in (c for c in self.children() if hasattr(c, 'reset')):
            child.reset()
                
                
class RNN_Learner(Learner):
    """Learner for the forward-only sequence labeler (encoder is `self.model[0]`)."""

    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)

    def _get_crit(self, data):
        """Return the loss function; always cross-entropy, `data` is not inspected."""
        return F.cross_entropy

    def fit(self, *args, **kwargs):
        """Delegate to the parent `fit`, always adding `seq_first=True`."""
        return super().fit(*args, seq_first=True, **kwargs)

    def save_encoder(self, name):
        """Save the first sub-module of the model under the model path for `name`."""
        encoder = self.model[0]
        save_model(encoder, self.get_model_path(name))

    def load_encoder(self, name):
        """Load weights for the first sub-module of the model from the path for `name`."""
        encoder = self.model[0]
        load_model(encoder, self.get_model_path(name))
        
        
class TextModel(BasicModel):
    """Model wrapper that exposes the layer groups of the forward labeler."""

    def get_layer_groups(self):
        """Return the layer groups in order.

        First the encoder's embedding with its input dropout, then one
        `(rnn, dropout)` pair per recurrent layer, and finally the second
        sub-module of the model (the decoder) on its own.
        """
        encoder = self.model[0]
        groups = [(encoder.encoder, encoder.dropouti)]
        groups.extend(zip(encoder.rnns, encoder.dropouths))
        groups.append(self.model[1])
        return groups
    
    
def get_rnn_seq_labeler(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,
                      dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, linear_decoder_dp=0.1):
    """Build the forward sequence labeler: a `MultiBatchSeqRNN` encoder plus a `LinearDecoder`.

    The decoder projects from `emb_sz` to `n_class` and uses `linear_decoder_dp`
    as its dropout value. `layers` and `drops` are accepted but not used here.
    """
    encoder_options = dict(pad_token=pad_token, bidir=bidir, dropouth=dropouth,
                           dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    encoder = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, **encoder_options)
    decoder = LinearDecoder(n_class, emb_sz, linear_decoder_dp)
    return SequentialRNN(encoder, decoder)

In [99]:
"""run this cell for bidir"""
class LinearDecoder_bidir(nn.Module):
    """Dropout followed by a linear projection, for the bidirectional labeler.

    Unlike `LinearDecoder`, `forward` decodes `outputs` itself (a single 3-D
    tensor) rather than `outputs[-1]`.

    Args:
        n_out: size of the projection output (number of classes).
        n_hid: size of the last dimension of the tensor being projected.
        dropout: value handed to `LockedDropout`.
        tie_encoder: optional module whose `weight` replaces the projection weight.
        bias: whether the projection has a bias term (zero-initialised when present).
    """
    initrange=0.1

    def __init__(self, n_out, n_hid, dropout, tie_encoder=None, bias=False):
        super().__init__()
        projection = nn.Linear(n_hid, n_out, bias=bias)
        projection.weight.data.uniform_(-self.initrange, self.initrange)
        self.decoder = projection
        self.dropout = LockedDropout(dropout)
        if bias:
            projection.bias.data.zero_()
        if tie_encoder:
            self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        """Project `outputs` and return `(result, raw_outputs, outputs)`.

        `input` is a `(raw_outputs, outputs)` pair. The first two dimensions
        of `outputs` are flattened, so `result` has one row per position.
        """
        raw_outputs, outputs = input
        dropped = self.dropout(outputs)
        n_rows = dropped.size(0) * dropped.size(1)
        logits = self.decoder(dropped.view(n_rows, dropped.size(2)))
        return logits.view(-1, logits.size(1)), raw_outputs, outputs
    
    
##### rewrite sequentialRNN #####
'''changed the base class from nn.Sequential to nn.Module'''
class SequentialRNN_bidir(nn.Module):
    """Sequence labeler combining a forward and a backward encoder with a word-vector BiLSTM.

    `forward` runs the input through three paths and merges them:
      * `embedding` -> `rnn` (a BiLSTM over the pretrained word vectors),
      * `rnn_enc_fw` on the input as given,
      * `rnn_enc_bw` on the input reversed along dim 1; its outputs are
        flipped back so positions line up with the forward encoder.
    The last outputs of both encoders are concatenated on the feature
    dimension, joined with the BiLSTM output, passed through `rnn_lm` and
    finally through `linear_decoder`.

    Args:
        rnn_enc_fw: encoder applied to the input in its original order.
        rnn_enc_bw: encoder applied to the reversed input.
        linear_decoder: module called with `(raw_outputs, outputs)`.
        embedding_path: path of a `.npy` file holding the pretrained embedding matrix.
        emb_sz: feature size of each encoder's last output.
        freeze_word2vec: passed as `freeze` to `nn.Embedding.from_pretrained`.
        wordvec_sz: dimensionality of the pretrained word vectors.
    """

    def __init__(self, rnn_enc_fw, rnn_enc_bw, linear_decoder, embedding_path, emb_sz, freeze_word2vec=False, wordvec_sz=300):
        super().__init__()
        self.rnn_enc_fw = rnn_enc_fw
        self.rnn_enc_bw = rnn_enc_bw
        self.linear_decoder = linear_decoder
        # Input is [BiLSTM output (2*wordvec_sz) ; fw+bw encoder output (2*emb_sz)].
        self.rnn_lm= nn.LSTM(input_size=emb_sz*2+wordvec_sz*2, hidden_size=emb_sz, num_layers=1, batch_first=True, bidirectional=True)
        weights = np.load(embedding_path)
        self.embedding = nn.Embedding.from_pretrained(T(weights), freeze=freeze_word2vec)
        self.rnn = nn.LSTM(input_size=wordvec_sz, hidden_size=wordvec_sz, num_layers=1, batch_first=True, bidirectional=True)

    def reset(self):
        """Call `reset()` on every direct child that defines it."""
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()

    @staticmethod
    def _reverse_seq(tensor):
        """Return `tensor` with the order of dim 1 reversed (stays on the tensor's device)."""
        idx = torch.arange(tensor.size(1) - 1, -1, -1, dtype=torch.long, device=tensor.device)
        return tensor.index_select(1, idx)

    def forward(self, input):
        """Return whatever `linear_decoder` returns for the merged representation.

        `input` is a 2-D tensor of token ids; dim 1 is treated as the sequence
        axis (assumes batch-first input, matching the `batch_first=True` LSTMs
        -- TODO confirm against the data loader).
        """
        lstm_out, _ = self.rnn(self.embedding(input))
        raw_outputs_fw, outputs_fw = self.rnn_enc_fw(input)
        # Reverse on-device instead of round-tripping through numpy on the CPU.
        raw_outputs_bw, outputs_bw = self.rnn_enc_bw(self._reverse_seq(input))

        # Flip the backward outputs back so position t matches the forward encoder.
        output_bw = self._reverse_seq(outputs_bw[-1])
        outputs_fw_bw = torch.cat([outputs_fw[-1], output_bw], dim=-1)

        # Pair up the per-layer raw outputs for however many layers the encoders have
        # (previously hard-coded to 3 layers).
        # NOTE(review): this concatenates along dim 0 (torch.cat's default), unlike the
        # feature-dim concat used for the outputs above -- confirm this is intended,
        # since the regulariser consumes these tensors.
        raw_outputs_fw_bw = [torch.cat([raw_fw, self._reverse_seq(raw_bw)])
                             for raw_fw, raw_bw in zip(raw_outputs_fw, raw_outputs_bw)]

        # Join the word-vector BiLSTM features with the encoder features.
        outputs_fw_bw = torch.cat([lstm_out, outputs_fw_bw], dim=-1)
        outputs_fw_bw, _ = self.rnn_lm(outputs_fw_bw)
        return self.linear_decoder((raw_outputs_fw_bw, outputs_fw_bw.contiguous()))

    
##### rewrite RNN Learner #####
'''rewrite load_encoder to load the encoding modules'''
class RNN_Learner_bidir(Learner):
    """Learner for the bidirectional labeler, whose model exposes `rnn_enc_fw` / `rnn_enc_bw`."""

    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)

    def _get_crit(self, data):
        """Return the loss function; always cross-entropy, `data` is not inspected."""
        return F.cross_entropy

    def fit(self, *args, **kwargs):
        """Delegate to the parent `fit`, always adding `seq_first=True`."""
        return super().fit(*args, **kwargs, seq_first=True)

    def save_encoder(self, name, name_bw=None):
        """Save both encoders.

        The forward encoder is saved under `name`, the backward encoder under
        `name_bw` (default: `name` with a `_backward` suffix). The bidir model
        is a plain module and cannot be indexed, so the encoders are reached
        by attribute, the same way `load_encoder` does.
        """
        if name_bw is None:
            name_bw = f'{name}_backward'
        save_model(self.model.rnn_enc_fw, self.get_model_path(name))
        save_model(self.model.rnn_enc_bw, self.get_model_path(name_bw))

    def load_encoder(self, name_fw, name_bw):
        """Load the forward encoder from `name_fw` and the backward encoder from `name_bw`."""
        load_model(self.model.rnn_enc_fw, self.get_model_path(name_fw))
        load_model(self.model.rnn_enc_bw, self.get_model_path(name_bw))
##### end #####


##### rewrite textmodel #####
'''get layer groups'''
class TextModel_bidir(BasicModel):
    """Model wrapper that exposes the layer groups of the bidirectional labeler."""

    def get_layer_groups(self):
        """Return the layer groups in order.

        Group 0 holds both encoders' embeddings and input dropouts; then one
        group per recurrent layer combining the forward and backward
        `(rnn, dropout)` entries; then the word-vector embedding, the linear
        decoder, the word-vector LSTM (`rnn`) and the merging LSTM (`rnn_lm`),
        each as its own group.
        """
        net = self.model
        fw, bw = net.rnn_enc_fw, net.rnn_enc_bw
        groups = [(fw.encoder, fw.dropouti, bw.encoder, bw.dropouti)]
        groups.extend(zip(fw.rnns, fw.dropouths, bw.rnns, bw.dropouths))
        groups.extend([net.embedding, net.linear_decoder, net.rnn, net.rnn_lm])
        return groups


def get_rnn_seq_labeler_bidir(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,
                      dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, linear_decoder_dp=0.1, dir_path='', freeze_word2vec=False):
    """Build the bidirectional sequence labeler.

    Two identically configured `MultiBatchSeqRNN` encoders (forward and
    backward) are wrapped in a `SequentialRNN_bidir` together with a
    `LinearDecoder_bidir` that projects from `emb_sz*2` to `n_class`.
    Pretrained word vectors are read from `<dir_path>/tmp/coNLL_embedding.npy`
    and are assumed to be 300-dimensional (`wordvec_sz=300`).

    `dir_path` may be a `str` or a `Path`. `bidir` is forwarded to each
    encoder individually. `layers` and `drops` are accepted but not used here.
    """
    from pathlib import Path  # local import so a plain-string dir_path works

    def make_encoder():
        return MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,
                                dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)

    rnn_enc = make_encoder()
    rnn_enc_backward = make_encoder()
    # Path(...) makes the `/` joins valid for the default '' and any other str.
    embedding_path = Path(dir_path)/'tmp'/'coNLL_embedding.npy'
    return SequentialRNN_bidir(rnn_enc, rnn_enc_backward, LinearDecoder_bidir(n_class, emb_sz*2, linear_decoder_dp),
                               embedding_path, emb_sz, freeze_word2vec=freeze_word2vec, wordvec_sz=300)

In [100]:
'''common functions'''
def freeze_all_but(learner, n):
    """Mark every layer group of `learner` as non-trainable except group `n`."""
    groups = learner.get_layer_groups()
    for group in groups:
        set_trainable(group, False)
    # Re-enable only the requested group after everything has been frozen.
    set_trainable(groups[n], True)

class MultiBatchSeqRNN(RNN_Encoder):
    """`RNN_Encoder` that zeroes its hidden state at the start of every forward pass.

    `bptt` and `max_seq` are stored on the instance; remaining arguments go to
    `RNN_Encoder`.
    """

    def __init__(self, bptt, max_seq, *args, **kwargs):
        self.max_seq = max_seq
        self.bptt = bptt
        super().__init__(*args, **kwargs)

    def concat(self, arrs):
        """For each slot index, concatenate that slot's tensors across all items of `arrs`."""
        n_slots = len(arrs[0])
        return [torch.cat([chunk[slot] for chunk in arrs]) for slot in range(n_slots)]

    def forward(self, input):
        """Zero the hidden state, run the parent encoder and return `(raw_outputs, outputs)`."""
        _sl, _bs = input.size()  # values unused; the unpack requires a 2-D input
        for layer_state in self.hidden:
            for state in layer_state:
                state.data.zero_()
        raw_outputs, outputs = super().forward(input)
        return raw_outputs, outputs

    
class SeqDataLoader(DataLoader):
    """DataLoader whose batches carry the labels flattened into one 1-D sequence."""

    def get_batch(self, indices):
        """Collate the samples at `indices` and flatten the label part (element 1)."""
        samples = [self.dataset[i] for i in indices]
        batch = self.np_collate(samples)
        # One label per position, matching the decoder's flattened output rows.
        batch[1] = np.reshape(batch[1], -1)
        return batch


class TextSeqDataset(Dataset):
    """Dataset of (token-id sequence, label sequence) pairs.

    Args:
        x: indexable collection of token-id sequences (lists or arrays).
        y: indexable collection of label sequences, parallel to `x`.
        backwards: if True, each token sequence is returned reversed.
        sos: optional id prepended to every token sequence.
        eos: optional id appended to every token sequence.
    """

    def __init__(self, x, y, backwards=False, sos=None, eos=None):
        self.x,self.y,self.backwards,self.sos,self.eos = x,y,backwards,sos,eos

    def __getitem__(self, idx):
        """Return `(tokens, labels)` for item `idx` as two numpy arrays.

        NOTE(review): only the tokens are reversed / extended with sos/eos;
        the labels are returned untouched, so per-token alignment is the
        caller's responsibility when those options are used.
        """
        # Copy into a list so sos/eos are concatenated even when the stored
        # sentence is an ndarray (where `+` would add element-wise instead).
        x = list(self.x[idx])
        y = self.y[idx]  # labels are returned as an array, one per token
        if self.backwards: x.reverse()
        if self.eos is not None: x.append(self.eos)
        if self.sos is not None: x.insert(0, self.sos)
        return np.array(x),np.array(y)

    def __len__(self): return len(self.x)

In [111]:
def train_seq(dir_path, lm_id='', train_file_id='', clas_id=None, bs=64, cl=1, bidir=False, startat=0, unfreeze=True,
              lr=0.01, dropmult=1.0, pretrain=True, bpe=False, use_clr=True,
              use_regular_schedule=False, use_discriminative=True, last=False, chain_thaw=False,
              from_scratch=False, freeze_word2vec=False, n_cycle=3, cycle_len=1, cycle_mult=2, linear_decoder_dp=0.1):
    """Train a sequence labeler on top of language-model encoders and evaluate it.

    Steps: load ids/labels from `<dir_path>/tmp`, build data loaders, build the
    forward or bidirectional model, optionally load the LM encoder(s) from
    `<dir_path>/models`, run the optional warm-up and chain-thaw stages, run
    the main fit, save the model and call `eval_ner` on validation and test.

    Args:
        dir_path: base directory containing `tmp/` (data) and `models/` (encoders).
        lm_id: used for the final checkpoint name when `clas_id` is None.
        train_file_id: suffix of the training id/label file names.
        clas_id: identifier used in checkpoint names.
        bs: validation/test batch size; training uses `bs//2`.
        cl: only echoed in the settings print-out.
        bidir: build the bidirectional model instead of the forward one.
        startat: 0 allows the warm-up stage; 1 loads the warm-up checkpoint instead.
        unfreeze: unfreeze everything before the main fit, else `freeze_to(-3)`.
        lr: base learning rate.
        dropmult: multiplier applied to all encoder dropout values.
        pretrain: enables the warm-up stage.
        bpe: use a fixed vocabulary size of 30002 and skip loading `itos.pkl`.
        use_clr: enable the `use_clr` schedules.
        use_regular_schedule: disable `use_clr` and `cycle_len`.
        use_discriminative: use one learning rate per layer group (5 values).
        last: train only the last layer group in the main fit.
        chain_thaw: fine-tune the layer groups one at a time before the main fit.
        from_scratch: do not load the LM encoder(s); forces the regular schedule.
        freeze_word2vec: freeze the pretrained word vectors (bidir model only).
        n_cycle, cycle_len, cycle_mult: passed to the main `fit`.
        linear_decoder_dp: dropout value of the linear decoder.
    """
    # hyperparameter settings
    bptt,em_sz,nh,nl = 70,400,1150,3
    dps = np.array([0.4,0.5,0.05,0.3,0.4])*dropmult

    print(f'prefix {dir_path}; lm_id {lm_id}; train_file_id {train_file_id}; clas_id {clas_id};'
          f' bs {bs}; cl {cl}; bidir {bidir}; '
          f'dropmult {dropmult} unfreeze {unfreeze} startat {startat}; pretrain {pretrain}; bpe {bpe}; use_clr {use_clr};'
          f' use_regular_schedule {use_regular_schedule}; use_discriminative {use_discriminative}; last {last};'
          f' chain_thaw {chain_thaw}; from_scratch {from_scratch}; freeze_word2vec {freeze_word2vec}; bptt {bptt};'
          f' em_sz {em_sz}; nh {nh}; nl {nl}; dropouts {dps}; dropmult {dropmult};'
          f' linear_decoder_dp {linear_decoder_dp}')
    dir_path = Path(dir_path)
    tmp_path = dir_path / 'tmp'

    lm_file = dir_path/'models'/'lm1_enc'
    lm_file_bw = dir_path/'models'/'lm1_enc_backward'

    opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

    # load datasets
    trn_sent = np.load(tmp_path / f'trn_ids{train_file_id}.npy')
    val_sent = np.load(tmp_path / f'val_ids.npy')
    test_sent = np.load(tmp_path / f'test_ids.npy')
    trn_lbls = np.load(tmp_path / f'lbl_trn{train_file_id}.npy')
    val_lbls = np.load(tmp_path / f'lbl_val.npy')
    test_lbls = np.load(tmp_path / f'lbl_test.npy')
    # NOTE: pickle files are assumed to be trusted, locally produced artefacts.
    with open(tmp_path / 'itol.pkl', 'rb') as f:
        id2label = pickle.load(f)
    c = len(id2label)

    id2token = None  # stays None for BPE, where no itos.pkl is loaded
    if bpe:
        vs=30002
    else:
        with open(tmp_path / 'itos.pkl', 'rb') as f:
            id2token = pickle.load(f)
        vs = len(id2token)

    print('Train sentences shape:', trn_sent.shape)
    print('Train labels shape:', trn_lbls.shape)
    if id2token is not None:
        print('Token ids:', [id2token[id_] for id_ in trn_sent[0]])
    else:
        # No id->token table in BPE mode: show the raw ids instead of failing.
        print('Token ids:', list(trn_sent[0]))
    print('Label ids:', [id2label[id_] for id_ in trn_lbls[0]])

    trn_ds = TextSeqDataset(trn_sent, trn_lbls)
    val_ds = TextSeqDataset(val_sent, val_lbls)
    test_ds = TextSeqDataset(test_sent, test_lbls)
    trn_samp = SortishSampler(trn_sent, key=lambda x: len(trn_sent[x]), bs=bs//2)
    val_samp = SortSampler(val_sent, key=lambda x: len(val_sent[x]))
    test_samp = SortSampler(test_sent, key=lambda x: len(test_sent[x]))
    trn_dl = SeqDataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=1, sampler=trn_samp)  # TODO why transpose? Should we also transpose the labels?
    val_dl = SeqDataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=val_samp)
    test_dl = SeqDataLoader(test_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=test_samp)
    md = ModelData(dir_path, trn_dl, val_dl, test_dl)

    if bidir:
        m = get_rnn_seq_labeler_bidir(bptt, 70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
                  layers=[em_sz, 50, c], drops=[dps[4], 0.1],
                  dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3], linear_decoder_dp=linear_decoder_dp,
                                      freeze_word2vec=freeze_word2vec, dir_path=dir_path, )
        learn = RNN_Learner_bidir(md, TextModel_bidir(to_gpu(m)), opt_fn=opt_fn)
    else:
        m = get_rnn_seq_labeler(bptt, 70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
                  layers=[em_sz, 50, c], drops=[dps[4], 0.1],
                  dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3], linear_decoder_dp=linear_decoder_dp)
        learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
    # Same regulariser, gradient clipping and metric for both model variants.
    learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
    learn.clip=25.
    learn.metrics = [accuracy]

    lrm = 2.6
    if use_discriminative:
        # NOTE(review): five rates match the forward model's layer groups; the bidir
        # model exposes more groups -- confirm before using bidir without chain_thaw.
        lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])
    else:
        lrs = lr
    wd = 1e-5
    if not from_scratch:
        print(f'Loading encoder from {lm_file}...')
        if bidir:
            learn.load_encoder(lm_file, lm_file_bw)
        else:
            learn.load_encoder(lm_file)
    else:
        print('Training classifier from scratch. LM encoder is not loaded.')
        use_regular_schedule = True

    # Settings shared by every single-epoch stage (warm-up and chain-thaw).
    stage_fit = dict(wds=wd, cycle_len=None if use_regular_schedule else 1,
                     use_clr=None if use_regular_schedule or not use_clr else (8,3))
    # `PRE` is not defined anywhere in this file; use a module-level `PRE` when one
    # exists and fall back to no prefix so the default path does not raise NameError.
    ckpt_prefix = globals().get('PRE', '')

    if (startat<1) and pretrain and not last and not chain_thaw and not from_scratch:
        learn.freeze_to(-1)
        learn.fit(lrs, 1, **stage_fit)
        learn.freeze_to(-2)
        learn.fit(lrs, 1, **stage_fit)
        learn.save(f'{ckpt_prefix}{clas_id}clas_0')
    elif startat==1:
        learn.load(f'{ckpt_prefix}{clas_id}clas_0')

    if chain_thaw:
        ## Emrys
        lrm = 4
        # the 4th is too big, and the word embedding and rnn can increase
        # NOTE(review): eight rates match the bidir model's eight layer groups (see the
        # print below); confirm before using chain_thaw with the forward model.
        lrs = np.array([lr/(lrm**5), 2*lr/(lrm**5), lr/(lrm**4), lr/(lrm**4), 5e-4, lr/2, 7e-4, 1e-2])
        print(f'AWDLSTM learning_rate {lrs[:4]}; embedding_lr {lrs[4]}; linear_decoder_lr {lrs[5]}; rnn_lr {lrs[6]}; lm_lr {lrs[7]}; weight_decay {wd}')
        print('Using chain-thaw. Unfreezing all layers one at a time...')
        n_layers = len(learn.get_layer_groups())
        print('# of layers:', n_layers)
        # fine-tune last layer
        learn.freeze_to(-1)
        print(f'Fine-tuning layer #{n_layers - 1}')
        learn.fit(lrs, 1, **stage_fit)
        n = n_layers-2
        # fine-tune all layers up to the second-last one
        while n>-1:
            print('Fine-tuning layer #%d.' % n)
            freeze_all_but(learn, n)
            learn.fit(lrs, 1, **stage_fit)
            n -= 1

    if unfreeze:
        learn.unfreeze()
    else:
        learn.freeze_to(-3)

    if last:
        print('Fine-tuning only the last layer...')
        learn.freeze_to(-1)

    if use_regular_schedule:
        print('Using regular schedule. Setting use_clr=None, n_cycles=cl, cycle_len=None.')
        use_clr = None
        cycle_len = None
    print(f'n_cycle {n_cycle}; cycle_len {cycle_len}; cycle_mult {cycle_mult}; use_clr {use_clr}')
    learn.fit(lrs, n_cycle, wds=wd, cycle_len=cycle_len, cycle_mult=cycle_mult, use_clr=(8,8) if use_clr else None) # previously cycle_len=cl
    print('Plotting lrs...')
    learn.sched.plot_lr()
    clas_id = clas_id if clas_id is not None else lm_id
    direction = 'bidir' if bidir else 'forward'
    learn.save(f'{clas_id}clas_1{direction}')

    eval_ner(learn, id2label, is_test=False)
    eval_ner(learn, id2label, is_test=True)

In [None]:
# Bidirectional run with chain-thaw, loading the pretrained LM encoders (from_scratch=False);
# use_clr is disabled and the main fit uses n_cycle=4, cycle_len=1, cycle_mult=2.
train_seq('/fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/', lm_id='', train_file_id='', clas_id=None,
          bs=64, cl=1, bidir=True, startat=0, unfreeze=True,
              lr=0.01, dropmult=1, pretrain=True, bpe=False, use_clr=False,
              use_regular_schedule=False, use_discriminative=True, last=False, chain_thaw=True,
              from_scratch=False, freeze_word2vec=False, n_cycle=4, cycle_len=1, cycle_mult=2, linear_decoder_dp=0.2)

prefix /fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/; lm_id ; train_file_id ; clas_id None; bs 64; cl 1; bidir True; dropmult 1 unfreeze True startat 0; pretrain True; bpe False; use_clr False; use_regular_schedule False; use_discriminative True; last False; chain_thaw True; from_scratch False; freeze_word2vec False; bptt 70; em_sz 400; nh 1150; nl 3; dropouts [0.4  0.5  0.05 0.3  0.4 ]; dropmult 1; linear_decoder_dp 0.2
Train sentences shape: (14988,)
Train labels shape: (14988,)
Token ids: ['xbos', '-docstart-']
Label ids: ['_bos_', 'O']
Loading encoder from /fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/models/lm1_enc...
AWDLSTM learning_rate [0.00001 0.00002 0.00004 0.00004]; embedding_lr 0.0005; linear_decoder_lr 0.005; rnn_lr 0.0007; lm_lr 0.01; weight_decay 1e-05
Using chain-thaw. Unfreezing all layers one at a time...
# of layers: 8
Fine-tuning layer #7


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.159474   0.186139   0.948293  

Fine-tuning layer #6.


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.122185   0.153179   0.959319  

Fine-tuning layer #5.


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      0.098153   0.131593   0.960315  

Fine-tuning layer #4.


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      0.093267   0.129416   0.961494  

Fine-tuning layer #3.


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      0.102729   0.13574    0.961593  

Fine-tuning layer #2.


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      0.093768   0.136768   0.961793  

Fine-tuning layer #1.


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      0.100634   0.13684    0.96171   

Fine-tuning layer #0.


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      0.091858   0.136862   0.96171   

n_cycle 4; cycle_len 1; cycle_mult 2; use_clr False


HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      0.087901   0.120198   0.965296  
                                                              