<a href="https://colab.research.google.com/github/AndrewPopesku/NER_model/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install docopt

Collecting docopt
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=a9679b091233630f4008cb30a8b44555a803091511e15bb841470e655232ede7
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt
Successfully installed docopt-0.6.2


In [None]:
from utils import *
from vocab import *

# Read the corpus into two collections that are iterated element-wise below.
sentences, tags = read_corpus("data.txt")

# The size arguments handed to Vocab.build are the lengths of the longest
# sentence and of the longest tag sequence.
# NOTE(review): if Vocab.build treats this argument as a cap on the number of
# vocabulary entries, a sequence length is a surprising choice -- confirm
# against vocab.py.
max_dict_size = max(map(len, sentences))
max_tag_size = max(map(len, tags))

# Build one vocabulary per side; the third positional argument is passed as 1.
sent_vocab = Vocab.build(sentences, max_dict_size, 1, is_tags=False)
tag_vocab = Vocab.build(tags, max_tag_size, 1, is_tags=True)

# Write both vocabularies to the JSON paths referenced by the training options.
for vocab_obj, vocab_path in ((sent_vocab, "vocab/sent_vocab.json"),
                              (tag_vocab, "vocab/tag_vocab.json")):
    vocab_obj.save(vocab_path)

In [None]:
from vocab import Vocab
import time
import torch
import torch.nn as nn
import bilstm_crf
import utils
import random

# Options passed to train(). Values are kept as strings (command-line style)
# and converted with int()/float() inside train(); '--cuda' is a plain bool.
args = {
    # Input files: the two vocabularies and the training corpus.
    'SENT_VOCAB': 'vocab/sent_vocab.json',
    'TAG_VOCAB': 'vocab/tag_vocab.json',
    'TRAIN': 'data.txt',
    # Loop schedule: batch size, number of epochs, and the iteration
    # intervals used for logging and for validation.
    '--batch-size': '32',
    '--max-epoch': '10',
    '--log-every': '10',
    '--validation-every': '250',
    # Checkpoint destinations for the model and the optimizer state.
    '--model-save-path': 'model.pth',
    '--optimizer-save-path': 'optimizer.pth',
    # Selects torch.device('cuda') when truthy, otherwise 'cpu'.
    '--cuda': False,
    # Model constructor arguments.
    '--dropout-rate': '0.5',
    '--embed-size': '256',
    '--hidden-size': '256',
    # Adam learning rate and the max_norm used for gradient clipping.
    '--lr': '0.001',
    '--clip_max_norm': '5.0',
    # Validation policy: a dev loss must be below min_dev_loss * threshold to
    # count as an improvement; after '--max-patience' misses the learning rate
    # is multiplied by '--lr-decay', and training stops at '--max-decay' decays.
    '--patience-threshold': '0.98',
    '--max-patience': '4',
    '--max-decay': '4',
    '--lr-decay': '0.5'
}

def train(args):
    """ Training BiLSTMCRF model

    Runs `--max-epoch` passes over the training split. Every
    `--validation-every` iterations the dev loss is computed; when it drops
    below `min_dev_loss * --patience-threshold` the model and optimizer state
    are checkpointed. After `--max-patience` consecutive non-improvements the
    best checkpoint is reloaded and the learning rate is multiplied by
    `--lr-decay`; training returns early once `--max-decay` decays have
    happened. If no validation step ever wrote a checkpoint, the final model
    and optimizer state are saved before returning so the save paths always
    hold a model after a full run.

    Args:
        args: dict that contains options in command. Numeric options are
            strings and are converted here; '--cuda' is used as a boolean.
    """
    sent_vocab = Vocab.load(args['SENT_VOCAB'])
    tag_vocab = Vocab.load(args['TAG_VOCAB'])
    train_data, dev_data = utils.generate_train_dev_dataset(args['TRAIN'], sent_vocab, tag_vocab)
    print('num of training examples: %d' % (len(train_data)))
    print('num of development examples: %d' % (len(dev_data)))

    # Parse every option once instead of re-parsing inside the training loop.
    batch_size = int(args['--batch-size'])
    max_epoch = int(args['--max-epoch'])
    log_every = int(args['--log-every'])
    validation_every = int(args['--validation-every'])
    clip_max_norm = float(args['--clip_max_norm'])
    patience_threshold = float(args['--patience-threshold'])
    max_patience = int(args['--max-patience'])
    max_decay = int(args['--max-decay'])
    lr_decay = float(args['--lr-decay'])
    model_save_path = args['--model-save-path']
    optimizer_save_path = args['--optimizer-save-path']
    min_dev_loss = float('inf')
    device = torch.device('cuda' if args['--cuda'] else 'cpu')
    patience, decay_num = 0, 0
    checkpoint_saved = False  # becomes True once a validation step has written a checkpoint

    model = bilstm_crf.BiLSTMCRF(sent_vocab, tag_vocab, float(args['--dropout-rate']), int(args['--embed-size']),
                                 int(args['--hidden-size'])).to(device)
    # Parameters whose name contains 'weight' get N(0, 0.01); all others start at 0.
    for name, param in model.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, 0, 0.01)
        else:
            nn.init.constant_(param.data, 0)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))
    train_iter = 0  # train iter num
    record_loss_sum, record_tgt_word_sum, record_batch_size = 0, 0, 0  # sum in one training log
    cum_loss_sum, cum_tgt_word_sum, cum_batch_size = 0, 0, 0  # sum in one validation log
    record_start, cum_start = time.time(), time.time()

    print('start training...')
    for epoch in range(max_epoch):
        for sentences, tags in utils.batch_iter(train_data, batch_size=batch_size):
            train_iter += 1
            current_batch_size = len(sentences)
            sentences, sent_lengths = utils.pad(sentences, sent_vocab[sent_vocab.PAD], device)
            tags, _ = utils.pad(tags, tag_vocab[tag_vocab.PAD], device)

            # back propagation
            optimizer.zero_grad()
            batch_loss = model(sentences, tags, sent_lengths)  # shape: (b,)
            loss = batch_loss.mean()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_max_norm)
            optimizer.step()

            # Compute the batch totals once; both running windows use them.
            batch_loss_sum = batch_loss.sum().item()
            batch_word_sum = sum(sent_lengths)

            record_loss_sum += batch_loss_sum
            record_batch_size += current_batch_size
            record_tgt_word_sum += batch_word_sum

            cum_loss_sum += batch_loss_sum
            cum_batch_size += current_batch_size
            cum_tgt_word_sum += batch_word_sum

            # Training progress is logged on every iteration (the log_every
            # guard is intentionally not applied here).
            print('log: epoch %d, iter %d, %.1f words/sec, avg_loss %f, time %.1f sec' %
                  (epoch + 1, train_iter, record_tgt_word_sum / (time.time() - record_start),
                   record_loss_sum / record_batch_size, time.time() - record_start))
            record_loss_sum, record_batch_size, record_tgt_word_sum = 0, 0, 0
            record_start = time.time()

            if train_iter % validation_every == 0:
                print('dev: epoch %d, iter %d, %.1f words/sec, avg_loss %f, time %.1f sec' %
                      (epoch + 1, train_iter, cum_tgt_word_sum / (time.time() - cum_start),
                       cum_loss_sum / cum_batch_size, time.time() - cum_start))
                cum_loss_sum, cum_batch_size, cum_tgt_word_sum = 0, 0, 0

                # NOTE(review): cal_dev_loss is not defined in this cell; it must
                # already be in the notebook namespace when validation runs -- confirm.
                dev_loss = cal_dev_loss(model, dev_data, 64, sent_vocab, tag_vocab, device)
                if dev_loss < min_dev_loss * patience_threshold:
                    min_dev_loss = dev_loss
                    model.save(model_save_path)
                    torch.save(optimizer.state_dict(), optimizer_save_path)
                    checkpoint_saved = True
                    patience = 0
                else:
                    patience += 1
                    if patience == max_patience:
                        decay_num += 1
                        if decay_num == max_decay:
                            print('Early stop. Save result model to %s' % model_save_path)
                            return
                        lr = optimizer.param_groups[0]['lr'] * lr_decay
                        model = bilstm_crf.BiLSTMCRF.load(model_save_path, device)
                        # The reloaded model owns brand-new parameter tensors, so the
                        # optimizer has to be rebuilt around them before its saved
                        # state is restored; otherwise step() would keep updating the
                        # discarded model and the live one would never change.
                        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
                        optimizer.load_state_dict(torch.load(optimizer_save_path))
                        # load_state_dict restores the checkpointed lr; apply the decayed one.
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                        patience = 0
                print('dev: epoch %d, iter %d, dev_loss %f, patience %d, decay_num %d' %
                      (epoch + 1, train_iter, dev_loss, patience, decay_num))
                cum_start = time.time()
                if train_iter % log_every == 0:
                    record_start = time.time()

    if not checkpoint_saved:
        # No validation step wrote a checkpoint (for example, fewer than
        # validation_every iterations were run). Save the final state so the
        # message below is accurate and the save paths hold this run's model.
        print('No checkpoint was written during training; saving final model state.')
        model.save(model_save_path)
        torch.save(optimizer.state_dict(), optimizer_save_path)
    print('Reached %d epochs, Save result model to %s' % (max_epoch, model_save_path))

# Run training with the `args` options dict defined earlier in this cell.
train(args)


num of training examples: 71
num of development examples: 18
start training...
log: epoch 1, iter 1, 352.1 words/sec, avg_loss 1346.711548, time 38.5 sec
log: epoch 1, iter 2, 532.0 words/sec, avg_loss 1615.376343, time 30.7 sec
log: epoch 1, iter 3, 502.9 words/sec, avg_loss 1457.900949, time 6.4 sec
log: epoch 2, iter 4, 486.8 words/sec, avg_loss 1537.272095, time 32.1 sec
log: epoch 2, iter 5, 348.4 words/sec, avg_loss 1574.856445, time 46.3 sec
log: epoch 2, iter 6, 1061.3 words/sec, avg_loss 583.607806, time 1.2 sec
log: epoch 3, iter 7, 370.8 words/sec, avg_loss 1595.929077, time 45.4 sec
log: epoch 3, iter 8, 485.5 words/sec, avg_loss 1196.933472, time 26.8 sec
log: epoch 3, iter 9, 1011.0 words/sec, avg_loss 1252.927734, time 3.2 sec
log: epoch 4, iter 10, 470.7 words/sec, avg_loss 970.950989, time 27.2 sec
log: epoch 4, iter 11, 381.6 words/sec, avg_loss 1232.657471, time 44.7 sec
log: epoch 4, iter 12, 1512.4 words/sec, avg_loss 1014.579660, time 2.1 sec
log: epoch 5, iter 13