In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'

In [2]:
from fastai.text import *

  return f(*args, **kwds)
  return f(*args, **kwds)
  from numpy.core.umath_tests import inner1d
  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
from model.data_utils import minibatches, pad_sequences, get_chunks
from model.config import Config
from model.data_utils import CoNLLDataset
from torch.utils.data.sampler import SequentialSampler, RandomSampler, BatchSampler

In [4]:
config = Config()

In [5]:
string_classes = (str, bytes)
def get_tensor(batch, pin, half=False):
    if isinstance(batch, (np.ndarray, np.generic)):
        batch = T(batch, half=half, cuda=False).contiguous()
        if pin: batch = batch.pin_memory()
        return to_gpu(batch)
    elif isinstance(batch, string_classes):
        return batch
    elif isinstance(batch, collections.Mapping):
        return {k: get_tensor(sample, pin, half) for k, sample in batch.items()}
    elif isinstance(batch, collections.Sequence):
        return [get_tensor(sample, pin, half) for sample in batch]
    raise TypeError(f"batch must contain numbers, dicts or lists; found {type(batch)}")
    
class DataLoader(object):
    def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, pad_idx=0,
                 num_workers=None, pin_memory=False, drop_last=False, pre_pad=True, half=False,
                 transpose=False, transpose_y=False):
        self.dataset,self.batch_size,self.num_workers = dataset,batch_size,num_workers
        self.pin_memory,self.drop_last,self.pre_pad = pin_memory,drop_last,pre_pad
        self.transpose,self.transpose_y,self.pad_idx,self.half = transpose,transpose_y,pad_idx,half

        if batch_sampler is not None:
            if batch_size > 1 or shuffle or sampler is not None or drop_last:
                raise ValueError('batch_sampler is mutually exclusive with '
                                 'batch_size, shuffle, sampler, and drop_last')

        if sampler is not None and shuffle:
            raise ValueError('sampler is mutually exclusive with shuffle')

        if batch_sampler is None:
            if sampler is None:
                sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
            batch_sampler = BatchSampler(sampler, batch_size, drop_last)

        if num_workers is None:
            self.num_workers = num_cpus()

        self.sampler = sampler
        self.batch_sampler = batch_sampler

    def __len__(self): return len(self.batch_sampler)

    def jag_stack(self, b):
        if len(b[0].shape) not in (1,2): return np.stack(b)
        ml = max(len(o) for o in b)
        if min(len(o) for o in b)==ml: return np.stack(b)
        res = np.zeros((len(b), ml), dtype=b[0].dtype) + self.pad_idx
#         for i,o in enumerate(b):
#             if self.pre_pad: res[i, -len(o):] = o
#             else:            res[i,  :len(o)] = o
        return res

    def np_collate(self, batch):
        b = batch[0]
        if isinstance(b, (np.ndarray, np.generic)): return self.jag_stack(batch)
        elif isinstance(b, (int, float)): return np.array(batch)
        elif isinstance(b, string_classes): return batch
        elif isinstance(b, collections.Mapping):
            return {key: self.np_collate([d[key] for d in batch]) for key in b}
        elif isinstance(b, collections.Sequence):
            return [self.np_collate(samples) for samples in zip(*batch)]
        raise TypeError(("batch must contain numbers, dicts or lists; found {}".format(type(b))))

    def get_batch(self, indices):
        res = self.np_collate([self.dataset[i] for i in indices])
        if self.transpose:   res[0] = res[0].T
        if self.transpose_y: res[1] = res[1].T
        return res

    def __iter__(self):
        if self.num_workers==0:
            for batch in map(self.get_batch, iter(self.batch_sampler)):
                yield get_tensor(batch, self.pin_memory, self.half)
        else:
            with ThreadPoolExecutor(max_workers=self.num_workers) as e:
                # avoid py3.6 issue where queue is infinite and can result in memory exhaustion
                for c in chunk_iter(iter(self.batch_sampler), self.num_workers*10):
                    for batch in e.map(self.get_batch, c):
                        yield get_tensor(batch, self.pin_memory, self.half)

class SeqDataLoader(DataLoader):
    def get_batch(self, indices):
        res = self.np_collate([self.dataset[i] for i in indices])
        print(np.array(res).shape)
        res[1] = np.reshape(res[1], -1)  # reshape the labels to one sequence
        return res

In [6]:
train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
val = CoNLLDataset(config.filename_dev, config.processing_word,
                         config.processing_tag, config.max_iter)
test = CoNLLDataset(config.filename_test, config.processing_word,
                         config.processing_tag, config.max_iter)

In [7]:
class Minibatch(object):
    def __init__(self, data, minibatch_size):
        self.data = data
        self.minibatch_size = minibatch_size
        self.length = None
        
    def __iter__(self):
        x_batch, y_batch = [], []
        for (x, y) in self.data:
            if len(x_batch) == self.minibatch_size:
                char_ids, word_ids = zip(*x_batch)
                word_ids, sequence_lengths = pad_sequences(word_ids, 0)
                char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
                    nlevels=2)
                lbl_ids, lbl_lengths = pad_sequences(y_batch, 0)
                word_ids = np.concatenate([np.array(word_ids)[:,:,np.newaxis], np.array(char_ids)], axis=-1)
                yield T(word_ids), T(lbl_ids).view(-1)
                x_batch, y_batch = [], []

            if type(x[0]) == tuple:
                x = zip(*x)
            x_batch += [x]
            y_batch += [y]

        if len(x_batch) != 0:
            char_ids, word_ids = zip(*x_batch)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
                nlevels=2)
            lbl_ids, lbl_lengths = pad_sequences(y_batch, 0)
            word_ids = np.concatenate([np.array(word_ids)[:,:,np.newaxis], np.array(char_ids)], axis=-1)
            yield T(word_ids), T(lbl_ids).view(-1)
    
    def __len__(self):
        if self.length == None:
            self.length = 0
            for _ in self:
                self.length += 1
        return self.length

In [13]:
dir_path = '/home/'
trn_dl = Minibatch(train, 20)
val_dl = Minibatch(val, 20)
test_dl = Minibatch(test, 20)
md = ModelData(dir_path, trn_dl, val_dl, test_dl)

In [14]:
class NER_model(nn.Module):
    def __init__(self, config):
        super(NER_model, self).__init__()
        self.config = config
        self.idx_to_tag = {idx: tag for tag, idx in
                           self.config.vocab_tags.items()}
        self.get_word_embeddings()
        self.get_logits()
        
    def get_word_embeddings(self):
        # get word embeding
        _word_embedding = V(self.config.embeddings, requires_grad=True)
        self.word_embedding = nn.Embedding.from_pretrained(_word_embedding)
        
        # get char embedding
        self._char_embedding = nn.Embedding(self.config.nchars, self.config.dim_char)
        self.char_embedding = nn.LSTM(input_size=self.config.dim_char, hidden_size=self.config.hidden_size_char,
                                     num_layers=1, batch_first=True, # not sure here whether batch is first
                                     bidirectional=True)
        
    def get_logits(self):
        self.rnn = nn.LSTM(input_size=self.config.dim_word+self.config.dim_char*2,
                          hidden_size=self.config.hidden_size_lstm,
                          num_layers=1, batch_first=True, # not sure whether batch is first
                          bidirectional=True)
        self.dropout = nn.Dropout(self.config.dropout if self.train else 0)
        self.linear = nn.Linear(self.config.hidden_size_lstm*2, self.config.ntags)
        
    def forward(self, input):
        char = input[:, :, 1:]
        words = input[:, :, 0]
        bs, sl, _ = char.size()
        char = char.view(-1, _)
        _char_embedding = self._char_embedding(char)
        
        char_embedding, (h_n, cell_n) = self.char_embedding(_char_embedding)
        char_embedding = h_n.view(bs, sl, -1)
        word_embedding = self.word_embedding(words)

        # concat word embeddings and char embeddings
        word_embedding = torch.cat([word_embedding, char_embedding], dim=-1)
        word_embedding_dp = self.dropout(word_embedding)
        
        out, (n_h, n_cell) = self.rnn(word_embedding_dp)
        out_dp = self.dropout(out)
        out = self.linear(out_dp)
        return out.view(out.size(0)*out.size(1), out.size(-1))

In [15]:
ner_model = NER_model(config).cuda()

In [20]:
opt = optim.Adam(ner_model.parameters(), 1e-3)
loss_func = F.cross_entropy

In [24]:
fit(ner_model, md, 15, opt, loss_func, metrics=[accuracy])

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.146087   0.102961   0.972651  
    1      0.155913   0.08783    0.975622                    
    2      0.121085   0.078198   0.978691                    
    3      0.115806   0.070489   0.98032                     
    4      0.106726   0.064444   0.982165                     
    5      0.101011   0.060263   0.983474                     
    6      0.09557    0.057024   0.983865                     
    7      0.090391   0.054389   0.984684                     
    8      0.103133   0.052321   0.985416                     
    9      0.084649   0.049545   0.986204                     
    10     0.080803   0.047848   0.986866                     
    11     0.076476   0.046516   0.987143                     
    12     0.073945   0.045714   0.987474                     
    13     0.070619   0.043151   0.988063                     
    14     0.067398   0.042002   0.988545                     


[0.04200169330624004, 0.9885449216916011]

In [23]:
# epoch = 4
# for i in range(epoch):
#     for step, (X_batch, Y_batch) in enumerate(trn_dl):
#         print(step)
#         Y = ner_model(X_batch)
        
# #         bs, sl = Y_batch[0].size()
# #         Y_batch = Y_batch[0].view(-1)
        

# #         Y_onehot = torch.zeros(bs*sl, 8).cuda()
# #         print(Y_onehot.size())
# #         Y_onehot.scatter_(1, Y_batch)
#         _, prediction = torch.max(Y, dim=-1)
#         print(prediction.size())
#         print(Y_batch[0].size())
#         print(V(prediction))
#         loss = loss_func(prediction, Y_batch[0])
#         opt.zero_grad()
#         loss.backward()
#         opt.step()