In [63]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'

In [64]:
from fastai.text import *
from sebastian.eval import eval_ner
from model.data_utils import minibatches, pad_sequences, get_chunks
from model.config import Config
from model.data_utils import CoNLLDataset
from torch.utils.data.sampler import SequentialSampler, RandomSampler, BatchSampler
import torch.nn.utils.rnn as rnn_utils
from UTILS.lstm_v import LSTM_v1

In [65]:
config = Config()

In [66]:
train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
val = CoNLLDataset(config.filename_dev, config.processing_word,
                         config.processing_tag, config.max_iter)
test = CoNLLDataset(config.filename_test, config.processing_word,
                         config.processing_tag, config.max_iter)

In [67]:
class Minibatch(object):
    def __init__(self, data, minibatch_size):
        self.data = data
        self.minibatch_size = minibatch_size
        self.length = None
        
    def __iter__(self):
        x_batch, y_batch = [], []
        for (x, y) in self.data:
            if len(x_batch) == self.minibatch_size:
                char_ids, word_ids = zip(*x_batch)
                word_ids, sequence_lengths = pad_sequences(word_ids, 0)
                char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
                    nlevels=2)
                lbl_ids, lbl_lengths = pad_sequences(y_batch, 0)
                bs, sl, char = np.array(char_ids).shape
                # expand the seq_lens and pad with ones, and concat with the rest
                # seq_len (bs)
                seq_lens_padded = np.concatenate([np.array(sequence_lengths)[:,np.newaxis], np.ones((bs, sl-1))], axis=-1)
                # seq_len, word_ids, word_length, char_ids
                word_ids = np.concatenate([seq_lens_padded[:,:,None], 
                                           np.array(word_ids)[:,:,None], 
                                           np.array(word_lengths)[:,:,None], 
                                           np.array(char_ids)], axis=-1)
                
                yield T(word_ids), T(lbl_ids).view(-1)
                x_batch, y_batch = [], []

            if type(x[0]) == tuple:
                x = zip(*x)
            x_batch += [x]
            y_batch += [y]

#         if len(x_batch) != 0:
#             char_ids, word_ids = zip(*x_batch)
#             word_ids, sequence_lengths = pad_sequences(word_ids, 0)
#             char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
#                 nlevels=2)
#             lbl_ids, lbl_lengths = pad_sequences(y_batch, 0)
#             word_ids = np.concatenate([np.array(word_ids)[:,:,np.newaxis], np.array(char_ids)], axis=-1)
#             yield T(word_ids), T(lbl_ids).view(-1)
    
    def __len__(self):
        if self.length == None:
            self.length = 0
            for _ in self:
                self.length += 1
        return self.length

In [68]:
dir_path = '/home/emrys/Desktop/seq_tagging_cross_lingual/'
trn_dl = Minibatch(train, 20)
val_dl = Minibatch(val, 20)
test_dl = Minibatch(test, 20)
md = ModelData(dir_path, trn_dl, val_dl, test_dl)

In [81]:
class NER_model(nn.Module):
    def __init__(self, config):
        super(NER_model, self).__init__()
        self.config = config
        self.idx_to_tag = {idx: tag for tag, idx in
                           self.config.vocab_tags.items()}
        self.get_word_embeddings()
        self.get_logits()
        
    def get_word_embeddings(self):
        # get word embeding
        _word_embedding = V(self.config.embeddings, requires_grad=True)
        self.word_embedding = nn.Embedding.from_pretrained(_word_embedding)
        
        # get char embedding
        self._char_embedding = nn.Embedding(self.config.nchars, self.config.dim_char)
        self.char_embedding = nn.LSTM(input_size=self.config.dim_char, hidden_size=self.config.hidden_size_char,
                                     num_layers=1, batch_first=True, # not sure here whether batch is first
                                     bidirectional=True)
        # get char embed for tar lang
        self._char_embedding_tar = nn.Embedding(self.config.nchars, self.config.dim_char)
        self.char_embedding_tar = nn.LSTM(input_size=self.config.dim_char, hidden_size=self.config.hidden_size_char,
                                     num_layers=1, batch_first=True, # not sure here whether batch is first
                                     bidirectional=True)
        
    def get_logits(self):
        self.rnn = nn.LSTM(input_size=self.config.dim_word+self.config.dim_char*2,
                          hidden_size=self.config.hidden_size_lstm,
                          num_layers=1, batch_first=True, # not sure whether batch is first
                          bidirectional=True)
        self.dropout_e = nn.Dropout(self.config.dropout if self.train else 0) # e for embedding
        self.dropout = nn.Dropout(self.config.dropout if self.train else 0)
        self.linear = nn.Linear(self.config.hidden_size_lstm*2, self.config.ntags)
        
        # add logits for tar lang
        self.rnn_tar = nn.LSTM(input_size=self.config.dim_word+self.config.dim_char*2,
                          hidden_size=self.config.hidden_size_lstm,
                          num_layers=1, batch_first=True, # not sure whether batch is first
                          bidirectional=True)
        self.dropout_e_tar = nn.Dropout(self.config.dropout if self.train else 0) # e for embedding
        self.linear_tar = nn.Linear(self.config.hidden_size_lstm*4, self.config.ntags)
        
    def forward(self, input):
        seq_lens = input[:,0,0]
        words = input[:, :, 1]
        word_lens = input[:,:,2]
        char = input[:, :, 3:]
        
        bs, sl, _ = char.size()
        char = char.view(-1, _)
        word_lens = word_lens.view(-1)
        word_lens[word_lens==0] = 1
        word_lens = word_lens.type(torch.int)
        
        # for source lang
        _char_embedding = self._char_embedding(char.long())
        char_embedding, (h_n, cell_n) = self.char_embedding(_char_embedding) # try use output instead of hidden size
        char_embedding = h_n.contiguous().view(bs, sl, -1)

        # for tar lang
        _char_embedding_tar = self._char_embedding_tar(char.long()) 
        char_embedding_tar, (h_n_tar, cell_n_tar) = self.char_embedding_tar(_char_embedding_tar) # try use output instead of hidden size
        char_embedding_tar = h_n_tar.contiguous().view(bs, sl, -1)

        # word embedding
        word_embedding = self.word_embedding(words.long())
        seq_lens = seq_lens.type(torch.int)
        
        # concat word embeddings and char embeddings of source
        # and to rnn
        word_embedding_sour = torch.cat([word_embedding, char_embedding], dim=-1)
        word_embedding_dp = self.dropout_e(word_embedding_sour)
        seq_lens = seq_lens.type(torch.int)
        out, (n_h, n_cell) = self.rnn(word_embedding_dp)
        
        # concat word embeddings and char embeddings of tar
        # and to rnn
        word_embedding_tar = torch.cat([word_embedding, char_embedding_tar], dim=-1)
        word_embedding_dp_tar = self.dropout_e_tar(word_embedding_tar)
        out_tar, (n_h_tar, n_cell_tar) = self.rnn_tar(word_embedding_dp_tar)
        
        # concat two outputs
        out = torch.cat([out, out_tar], dim=-1)
        
        out_dp = self.dropout(out)
        out = self.linear_tar(out_dp)
        return out.view(out.size(0)*out.size(1), out.size(-1))

In [82]:
config.dim_word+config.dim_char*2

500

In [83]:
ner_model = NER_model(config)

In [84]:
##### rewrite RNN Learner #####
'''rewrite load_encoder to load the encoding modules'''
class RNN_Learner_bidir(Learner):
    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)

    def _get_crit(self, data): return F.cross_entropy
    def fit(self, *args, **kwargs): return super().fit(*args, **kwargs, seq_first=True)

    def save_encoder(self, name_rnn, name_linear, name_char_embedding, name_char_embedding_lstm): 
        torch.save(self.model.rnn.LSTM.state_dict(), name_rnn)
        torch.save(self.model.linear.state_dict(), name_linear)
        torch.save(self.model._char_embedding.state_dict(), name_char_embedding)
        torch.save(self.model.char_embedding.LSTM.state_dict(), name_char_embedding_lstm)
        
    def load_encoder(self, name_rnn, name_linear, name_char_embedding, name_char_embedding_lstm): 
        self.model.rnn.LSTM.load_state_dict(torch.load(name_rnn))
        self.model.linear.load_state_dict(torch.load(name_linear))
        self.model._char_embedding.load_state_dict(torch.load(name_char_embedding))
        self.model.char_embedding.LSTM.load_state_dict(torch.load(name_char_embedding_lstm))
        
##### end #####


##### rewrite textmodel #####
'''get layer groups'''
class TextModel_bidir(BasicModel):
    def get_layer_groups(self):
        return [(self.model._char_embedding, self.model.char_embedding, self.model.rnn, self.model.linear),# source lang 
                (self.model._char_embedding_tar),(self.model.char_embedding_tar), 
                (self.model.word_embedding),(self.model.rnn_tar),(self.model.linear_tar)]

def freeze_all_but(learner, n):
    c=learner.get_layer_groups()
    for l in c: set_trainable(l, False)
    set_trainable(c[n], True)
    
def freeze_one(learner, n):
    c=learner.get_layer_groups()
    set_trainable(c[n], False)
    
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
learn = RNN_Learner_bidir(md, TextModel_bidir(to_gpu(ner_model)), opt_fn=opt_fn)
# learn.load_encoder('results/eng_rnn_params.pkl', 'results/eng_linear_params.pkl', 'results/eng_char_embedding_params.pkl', 'results/eng_char_embedding_lstm_params.pkl')

In [85]:
# freeze_all_but(learn, -1)
# learn.unfreeze()
freeze_one(learn, 0)

In [86]:
learn.fit(0.001, 3, metrics=[accuracy], cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=7), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      0.172332   0.161319   0.948725  
    1      0.107887   0.08787    0.974805                     
    2      0.095603   0.080985   0.975442                     
    3      0.089331   0.072113   0.977711                     
    4      0.080795   0.06644    0.979045                     
    5      0.075903   0.064892   0.979427                     
    6      0.072851   0.062446   0.98054                      



[0.062445851747768996, 0.9805404492011287]

In [23]:
torch.save(learn.model.char_embedding.LSTM.state_dict(), 'results/eng_char_embedding_lstm_params.pkl')

In [21]:
learn

NER_model(
  (word_embedding): Embedding(17425, 300)
  (_char_embedding): Embedding(84, 100)
  (dropout): Dropout(p=0.5)
  (linear): Linear(in_features=600, out_features=8, bias=True)
)

In [15]:
learn.save_encoder('results/eng_rnn_params.pkl', 'results/eng_linear_params.pkl')

In [16]:
id2tag = [o for i,o in ner_model.idx_to_tag.items()]

In [17]:
eval_ner(learn, id2tag, is_test=True)

Test f1 measure overall: 0.8515127629903094
{'precision-PER': 0.884896872920825, 'recall-PER': 0.8291770573566085, 'f1-measure-PER': 0.8561313163823125, 'precision-LOC': 0.9170996159927717, 'recall-LOC': 0.9339774557165862, 'f1-measure-LOC': 0.9254615910644589, 'precision-MISC': 0.7221350078492934, 'recall-MISC': 0.6628242074927952, 'f1-measure-MISC': 0.6912096168294015, 'precision-ORG': 0.7221195317313617, 'recall-ORG': 0.706875753920386, 'f1-measure-ORG': 0.7144163364827297, 'precision-overall': 0.8573870573870573, 'recall-overall': 0.8457184150307118, 'f1-measure-overall': 0.8515127629903094}
Test token-level accuracy of NER model: 0.9817.


In [27]:
# padding
import torch.nn.utils.rnn as rnn_utils  

In [43]:
lstm = nn.LSTM(10, 20, bidirectional=True, num_layers=1, batch_first=True)
packed = rnn_utils.pack_padded_sequence(torch.randn(4, 50, 10), T([40, 30, 20, 10]), batch_first=True)
packed_out, packed_hidden = lstm(packed)
unpacked, unpacked_len = rnn_utils.pad_packed_sequence(packed_out)

In [45]:
unpacked, unpacked_len

(tensor([[[-0.0338,  0.0061,  0.0005,  ..., -0.0912,  0.0050,  0.1206],
          [ 0.1199, -0.0857, -0.0399,  ..., -0.0580,  0.1510,  0.1029],
          [-0.0035, -0.0643,  0.0095,  ..., -0.1064,  0.1615, -0.1207],
          [ 0.2136, -0.0328, -0.1314,  ...,  0.0868,  0.0889, -0.0900]],
 
         [[-0.0673, -0.0633, -0.0172,  ..., -0.1604,  0.0258, -0.0048],
          [ 0.0453, -0.2590, -0.1192,  ..., -0.0744,  0.1925, -0.0982],
          [-0.0232, -0.1782, -0.0328,  ..., -0.1592, -0.0142, -0.0713],
          [ 0.1261,  0.0267, -0.0232,  ...,  0.1205,  0.0623, -0.0599]],
 
         [[-0.1563,  0.1211, -0.0770,  ..., -0.2175,  0.1228, -0.0800],
          [-0.0175, -0.2058, -0.0591,  ..., -0.0799,  0.2336, -0.0746],
          [ 0.1256, -0.3455, -0.2181,  ..., -0.1456, -0.0381, -0.0669],
          [-0.0260, -0.0122,  0.0984,  ...,  0.0678,  0.1064, -0.0934]],
 
         ...,
 
         [[-0.1328, -0.0234,  0.0616,  ...,  0.0885,  0.0288, -0.0564],
          [ 0.0000,  0.0000,  0.0000,  

In [57]:
v_rnn = LSTM_v1(10, 20)

In [59]:
out, h = v_rnn.run(torch.ones(4, 50, 10).cuda(), T([10,20,25,25]))