In [1]:
import random
import numpy as np
import pandas as pd
import os
import sys
import inspect
import torch
from torch import optim
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from transformers import BertConfig, BertTokenizer
from nltk.tokenize import word_tokenize

In [2]:
# set current notebook path
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

In [3]:
from transformers import BertConfig, BertTokenizer
from nltk.tokenize import word_tokenize

from NER_Module.model import BertForWordClassification, forward_word_classification
from NER_Module.utils import ner_metrics_fn, get_lr, metrics_to_string, count_param, set_seed
from NER_Module.data import NerGritDataset, NerDataLoader

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\n167574\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Set random seed
set_seed(33)

33

In [6]:
NerGritDataset.NUM_LABELS

7

In [7]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = NerGritDataset.NUM_LABELS

# Instantiate model
model = BertForWordClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
w2i, i2w = NerGritDataset.LABEL2INDEX, NerGritDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'I-PERSON': 0, 'B-ORGANISATION': 1, 'I-ORGANISATION': 2, 'B-PLACE': 3, 'I-PLACE': 4, 'O': 5, 'B-PERSON': 6}
{0: 'I-PERSON', 1: 'B-ORGANISATION', 2: 'I-ORGANISATION', 3: 'B-PLACE', 4: 'I-PLACE', 5: 'O', 6: 'B-PERSON'}


In [9]:
model= model.cuda()

In [10]:
type(tokenizer)

transformers.models.bert.tokenization_bert.BertTokenizer

In [25]:
model.device

device(type='cuda', index=0)

## Test on sample data

In [12]:
def word_subword_tokenize(sentence, tokenizer):
    # Add CLS token
    subwords = [tokenizer.cls_token_id]
    subword_to_word_indices = [-1] # For CLS

    # Add subwords
    for word_idx, word in enumerate(sentence):
        subword_list = tokenizer.encode(word, add_special_tokens=False)
        subword_to_word_indices += [word_idx for i in range(len(subword_list))]
        subwords += subword_list

    # Add last SEP token
    subwords += [tokenizer.sep_token_id]
    subword_to_word_indices += [-1]

    return subwords, subword_to_word_indices

In [13]:
text = word_tokenize('setu siung 119 rt 5 1 13880 cipayung')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].cpu().squeeze().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,setu,B-PERSON
1,siung,I-PLACE
2,119,B-ORGANISATION
3,rt,B-ORGANISATION
4,5,B-ORGANISATION
5,1,I-PLACE
6,13880,I-PLACE
7,cipayung,I-PERSON


In [14]:
model.device

device(type='cuda', index=0)

In [15]:
## datapath 
data = "F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\NER_Module\\test\\data"
train_file = 'train_preprocess.txt'
validation_file ='valid_preprocess.txt'
test_file = 'test_preprocess_masked_label.txt'
train_data_path = os.path.join(data,train_file)
validation_data_path = os.path.join(data,validation_file)
test_data_path = os.path.join(data,test_file)

In [16]:
model_dir = 'F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\model'
if not os.path.exists(model_dir):
    os.makedirs(model_dir, exist_ok=True)

In [17]:
# load data
train_dataset = NerGritDataset(train_data_path, tokenizer, lowercase=True)
valid_dataset = NerGritDataset(validation_data_path, tokenizer, lowercase=True)
test_dataset = NerGritDataset(test_data_path, tokenizer, lowercase=True)

train_loader = NerDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=True)  
valid_loader = NerDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=False)  
test_loader = NerDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=False)

In [18]:
train_dataset

<NER_Module.data.dataset.NerGritDataset at 0x25d4e732520>

In [19]:
len(train_dataset)

1672

In [20]:
## validation evaluate
# Evaluate function for validation and test
def evaluate(model, data_loader, i2w, is_test=False, device='cpu'):
    model.eval()
    total_loss, total_correct, total_labels = 0, 0, 0

    list_hyp, list_label, list_seq = [], [], []

    pbar = tqdm(iter(data_loader), leave=True, total=len(data_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device= device)

        
        # Calculate total loss
        test_loss = loss.item()
        total_loss = total_loss + test_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        list_seq += batch_seq
        metrics = ner_metrics_fn(list_hyp, list_label)

        if not is_test:
            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        else:
            pbar.set_description("TEST LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
    
    if is_test:
        return total_loss, metrics, list_hyp, list_label, list_seq
    else:
        return total_loss, metrics

In [21]:
best_val_metric = -100
count_stop = 0
exp_id =1
evaluate_every=2
step_size=1
gamma=0.5
valid_criterion= 'F1'

In [22]:
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [23]:
# Train
scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
n_epochs = 8
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    # evaluate
    if ((epoch+1) % evaluate_every) == 0:
        val_loss, val_metrics = evaluate(model, valid_loader, i2w, is_test=False, device ='cuda')

        # Early stopping
        val_metric = val_metrics[valid_criterion]
        if best_val_metric < val_metric:
            best_val_metric = val_metric
            # save model
            if exp_id is not None:
                torch.save(model.state_dict(), model_dir + "/best_model_" + str(exp_id) + ".th")
            else:
                torch.save(model.state_dict(), model_dir + "/best_model.th")
            count_stop = 0
        else:
            count_stop += 1
            print("count stop:", count_stop)
            if count_stop == early_stop:
                break

  0%|                                                                                          | 0/105 [00:00<?, ?it/s]2021-03-14 16:26:51,376 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.8965 LR:0.00002000:   1%|▎                                    | 1/105 [00:39<1:08:49, 39.70s/it]2021-03-14 16:26:51,715 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.8480 LR:0.00002000:   2%|▋                                      | 2/105 [00:39<47:49, 27.86s/it]2021-03-14 16:26:51,935 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.7927 LR:0.00002000:   3%|█                                      | 3/105 [00:40<33:15, 19.56s/it]2021-03-14 16:26:52,139 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.7075 LR:0.00002000:   4%|█▍                                     | 4/105 [00:40<23:15, 13.82s/it]2021-03-14 16:26:52,548 [NER_Module.model.backbone] [INFO] Device is set to cuda


(Epoch 1) TRAIN LOSS:1.3128 LR:0.00002000:  39%|██████████████▊                       | 41/105 [00:48<00:12,  4.93it/s]2021-03-14 16:27:00,234 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.3087 LR:0.00002000:  40%|███████████████▏                      | 42/105 [00:48<00:12,  5.10it/s]2021-03-14 16:27:00,414 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.3040 LR:0.00002000:  41%|███████████████▌                      | 43/105 [00:48<00:14,  4.23it/s]2021-03-14 16:27:00,745 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.3008 LR:0.00002000:  42%|███████████████▉                      | 44/105 [00:48<00:13,  4.60it/s]2021-03-14 16:27:00,917 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.2967 LR:0.00002000:  43%|████████████████▎                     | 45/105 [00:49<00:11,  5.13it/s]2021-03-14 16:27:01,060 [NER_Module.model.backbone] [INFO] Device is set to cuda


(Epoch 1) TRAIN LOSS:1.2066 LR:0.00002000:  78%|█████████████████████████████▋        | 82/105 [00:56<00:04,  4.93it/s]2021-03-14 16:27:08,527 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.2054 LR:0.00002000:  79%|██████████████████████████████        | 83/105 [00:56<00:04,  5.27it/s]2021-03-14 16:27:08,687 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.2020 LR:0.00002000:  80%|██████████████████████████████▍       | 84/105 [00:56<00:04,  4.67it/s]2021-03-14 16:27:08,957 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.2004 LR:0.00002000:  81%|██████████████████████████████▊       | 85/105 [00:57<00:03,  5.18it/s]2021-03-14 16:27:09,101 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 1) TRAIN LOSS:1.1996 LR:0.00002000:  82%|███████████████████████████████       | 86/105 [00:57<00:03,  5.24it/s]2021-03-14 16:27:09,289 [NER_Module.model.backbone] [INFO] Device is set to cuda


(Epoch 1) TRAIN LOSS:1.1730 ACC:0.86 F1:0.24 REC:0.20 PRE:0.29 LR:0.00002000


2021-03-14 16:27:53,607 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:1.0367 LR:0.00002000:   1%|▎                                    | 1/105 [00:40<1:09:24, 40.04s/it]2021-03-14 16:27:53,879 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:1.1064 LR:0.00002000:   2%|▋                                      | 2/105 [00:40<48:12, 28.08s/it]2021-03-14 16:27:54,054 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:1.0315 LR:0.00002000:   3%|█                                      | 3/105 [00:40<33:31, 19.72s/it]2021-03-14 16:27:54,268 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:1.0064 LR:0.00002000:   4%|█▍                                     | 4/105 [00:40<23:19, 13.86s/it]2021-03-14 16:27:54,454 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:1.0368 LR:0.00002000:   5%|█▊                                     | 5/105 [00:40<16:14,  9.74s/it]

(Epoch 2) TRAIN LOSS:0.9886 LR:0.00002000:  39%|██████████████▊                       | 41/105 [00:47<00:13,  4.67it/s]2021-03-14 16:28:01,658 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:0.9827 LR:0.00002000:  40%|███████████████▏                      | 42/105 [00:48<00:13,  4.52it/s]2021-03-14 16:28:01,898 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:0.9823 LR:0.00002000:  41%|███████████████▌                      | 43/105 [00:48<00:13,  4.73it/s]2021-03-14 16:28:02,083 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:0.9825 LR:0.00002000:  42%|███████████████▉                      | 44/105 [00:48<00:12,  4.94it/s]2021-03-14 16:28:02,266 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:0.9795 LR:0.00002000:  43%|████████████████▎                     | 45/105 [00:48<00:13,  4.40it/s]2021-03-14 16:28:02,550 [NER_Module.model.backbone] [INFO] Device is set to cuda


(Epoch 2) TRAIN LOSS:0.9533 LR:0.00002000:  78%|█████████████████████████████▋        | 82/105 [00:55<00:03,  6.16it/s]2021-03-14 16:28:09,653 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:0.9514 LR:0.00002000:  79%|██████████████████████████████        | 83/105 [00:56<00:03,  5.76it/s]2021-03-14 16:28:09,854 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:0.9501 LR:0.00002000:  80%|██████████████████████████████▍       | 84/105 [00:56<00:03,  5.74it/s]2021-03-14 16:28:10,029 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:0.9495 LR:0.00002000:  81%|██████████████████████████████▊       | 85/105 [00:56<00:03,  5.88it/s]2021-03-14 16:28:10,190 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 2) TRAIN LOSS:0.9479 LR:0.00002000:  82%|███████████████████████████████       | 86/105 [00:56<00:03,  5.32it/s]2021-03-14 16:28:10,419 [NER_Module.model.backbone] [INFO] Device is set to cuda


(Epoch 2) TRAIN LOSS:0.9381 ACC:0.92 F1:0.51 REC:0.52 PRE:0.53 LR:0.00002000


  0%|                                                                                           | 0/14 [00:00<?, ?it/s]2021-03-14 16:28:54,730 [NER_Module.model.backbone] [INFO] Device is set to cuda
VALID LOSS:0.9242 ACC:0.94 F1:0.59 REC:0.61 PRE:0.59:   0%|                                     | 0/14 [00:00<?, ?it/s]2021-03-14 16:28:54,796 [NER_Module.model.backbone] [INFO] Device is set to cuda
VALID LOSS:0.9163 ACC:0.95 F1:0.63 REC:0.65 PRE:0.63:  14%|████▏                        | 2/14 [00:00<00:00, 16.39it/s]2021-03-14 16:28:54,853 [NER_Module.model.backbone] [INFO] Device is set to cuda
VALID LOSS:0.9366 ACC:0.94 F1:0.61 REC:0.62 PRE:0.61:  14%|████▏                        | 2/14 [00:00<00:00, 16.39it/s]2021-03-14 16:28:54,920 [NER_Module.model.backbone] [INFO] Device is set to cuda
VALID LOSS:0.9398 ACC:0.93 F1:0.59 REC:0.59 PRE:0.59:  29%|████████▎                    | 4/14 [00:00<00:00, 16.20it/s]2021-03-14 16:28:54,979 [NER_Module.model.backbone] [INFO] Device is set to cuda


(Epoch 3) TRAIN LOSS:0.8593 LR:0.00002000:  25%|█████████▍                            | 26/105 [00:45<00:15,  5.05it/s]2021-03-14 16:29:44,760 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 3) TRAIN LOSS:0.8601 LR:0.00002000:  26%|█████████▊                            | 27/105 [00:46<00:14,  5.41it/s]2021-03-14 16:29:44,915 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 3) TRAIN LOSS:0.8594 LR:0.00002000:  27%|██████████▏                           | 28/105 [00:46<00:14,  5.40it/s]2021-03-14 16:29:45,100 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 3) TRAIN LOSS:0.8582 LR:0.00002000:  28%|██████████▍                           | 29/105 [00:46<00:13,  5.66it/s]2021-03-14 16:29:45,258 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 3) TRAIN LOSS:0.8560 LR:0.00002000:  29%|██████████▊                           | 30/105 [00:46<00:15,  4.86it/s]2021-03-14 16:29:45,531 [NER_Module.model.backbone] [INFO] Device is set to cuda


(Epoch 3) TRAIN LOSS:0.8346 LR:0.00002000:  64%|████████████████████████▏             | 67/105 [00:54<00:07,  5.29it/s]2021-03-14 16:29:53,154 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 3) TRAIN LOSS:0.8340 LR:0.00002000:  65%|████████████████████████▌             | 68/105 [00:54<00:06,  5.33it/s]2021-03-14 16:29:53,339 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 3) TRAIN LOSS:0.8346 LR:0.00002000:  66%|████████████████████████▉             | 69/105 [00:54<00:06,  5.33it/s]2021-03-14 16:29:53,526 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 3) TRAIN LOSS:0.8327 LR:0.00002000:  67%|█████████████████████████▎            | 70/105 [00:54<00:07,  4.70it/s]2021-03-14 16:29:53,798 [NER_Module.model.backbone] [INFO] Device is set to cuda
(Epoch 3) TRAIN LOSS:0.8304 LR:0.00002000:  68%|█████████████████████████▋            | 71/105 [00:55<00:07,  4.47it/s]2021-03-14 16:29:54,046 [NER_Module.model.backbone] [INFO] Device is set to cuda


(Epoch 3) TRAIN LOSS:0.8110 ACC:0.95 F1:0.68 REC:0.70 PRE:0.67 LR:0.00002000


  0%|                                                                                          | 0/105 [00:29<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
print(df)

In [26]:
text = word_tokenize('Jalan Candi Panggung Barat. No 16 . RT 01 RW 18. Kelurahan Mojolangu, Kecamatan Lowokwaru Malang City , East Java')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,Jalan,B-PLACE
1,Candi,I-PLACE
2,Panggung,I-PLACE
3,Barat,I-PLACE
4,.,O
5,No,O
6,16,O
7,.,O
8,RT,O
9,01,O


In [None]:
## load model and do predict

# model = pred_cls(config=config)
# base_model = base_cls.from_pretrained(model_path, from_tf=False, config=config)

# # Plug pretrained base model to classification model
# if 'bert' in model.__dir__():
#     model.bert = base_model

# load model
def load_model(model_path):
    tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
    config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
    config.num_labels = NerGritDataset.NUM_LABELS
    model.load_state_dict(torch.load(model_path))
    #model = BertForWordClassification.from_pretrained(model_path, config=config)
    return model

In [None]:
model_folder = './'
filename = 'best_model_1.th'
model_path = os.path.join(model_folder,filename)

In [None]:
model_path

In [None]:
custom_model = load_model(model_path=model_path)

In [None]:
custom_model = custom_model.cuda()

In [None]:
# Evaluate on test
custom_model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_word_classification(custom_model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()

print(df)