In [6]:
import random
import numpy as np
import pandas as pd
import os
import sys
import inspect
import torch
from torch import optim
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from transformers import BertConfig, BertTokenizer
from nltk.tokenize import word_tokenize

In [7]:
# set current notebook path
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

In [50]:
from transformers import BertConfig, BertTokenizer 
from nltk.tokenize import word_tokenize

from NER_Module.model import BertForWordClassification#, forward_word_classification
from NER_Module.utils import ner_metrics_fn, get_lr, metrics_to_string, count_param, set_seed
from NER_Module.data import NerGritDataset, NerDataLoader

In [32]:
# Forward function for word classification
def forward_word_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    # Unpack batch data
    if len(batch_data) == 4:
        (subword_batch, mask_batch, subword_to_word_indices_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 5:
        (subword_batch, mask_batch, token_type_batch, subword_to_word_indices_batch, label_batch) = batch_data
    
    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    subword_to_word_indices_batch = torch.LongTensor(subword_to_word_indices_batch)
    label_batch = torch.LongTensor(label_batch)

    if device == "cuda":
        subword_batch = subword_batch.cuda()
        mask_batch = mask_batch.cuda()
        token_type_batch = token_type_batch.cuda() if token_type_batch is not None else None
        subword_to_word_indices_batch = subword_to_word_indices_batch.cuda()
        label_batch = label_batch.cuda()

    # Forward model
    outputs = model(subword_batch, subword_to_word_indices_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]
    
    # generate prediction & label list
    list_hyps = []
    list_labels = []
    hyps_list = torch.topk(logits, k=1, dim=-1)[1].squeeze(dim=-1)
    for i in range(len(hyps_list)):
        hyps, labels = hyps_list[i].tolist(), label_batch[i].tolist()        
        list_hyp, list_label = [], []
        for j in range(len(hyps)):
            if labels[j] == -100:
                break
            else:
                list_hyp.append(i2w[hyps[j]])
                list_label.append(i2w[labels[j]])
        list_hyps.append(list_hyp)
        list_labels.append(list_label)
        
    return loss, list_hyps, list_labels

In [33]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\n167574\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Initilaise model

In [34]:
# Set random seed
set_seed(33)

33

In [51]:
NerGritDataset.NUM_LABELS

5

In [52]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = NerGritDataset.NUM_LABELS

# Instantiate model
model = BertForWordClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)
model= model.cuda()

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
w2i, i2w = NerGritDataset.LABEL2INDEX, NerGritDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'I-POI': 0, 'B-POI': 1, 'I-STREET': 2, 'B-STREET': 3, 'O': 4}
{0: 'I-POI', 1: 'B-POI', 2: 'I-STREET', 3: 'B-STREET', 4: 'O'}


In [54]:
model.device

device(type='cuda', index=0)

## Test on sample data

In [38]:
def word_subword_tokenize(sentence, tokenizer):
    # Add CLS token
    subwords = [tokenizer.cls_token_id]
    subword_to_word_indices = [-1] # For CLS

    # Add subwords
    for word_idx, word in enumerate(sentence):
        subword_list = tokenizer.encode(word, add_special_tokens=False)
        subword_to_word_indices += [word_idx for i in range(len(subword_list))]
        subwords += subword_list

    # Add last SEP token
    subwords += [tokenizer.sep_token_id]
    subword_to_word_indices += [-1]

    return subwords, subword_to_word_indices

In [39]:
text = word_tokenize('setu siung 119 rt 5 1 13880 cipayung')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].cpu().squeeze().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,setu,I-POI
1,siung,I-POI
2,119,I-POI
3,rt,I-POI
4,5,I-POI
5,1,I-POI
6,13880,I-POI
7,cipayung,I-POI


## Load data

In [40]:
# ## datapath 
# data = "F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\NER_Module\\test\\data"
# train_file = 'train_preprocess.txt'
# validation_file ='valid_preprocess.txt'
# test_file = 'test_preprocess_masked_label.txt'
# train_data_path = os.path.join(data,train_file)
# validation_data_path = os.path.join(data,validation_file)
# test_data_path = os.path.join(data,test_file)

In [41]:
# data = "F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\NER_Module\\test\\data"
# train_file = 'shopee_train.txt'
# validation_file ='shopee_test.txt'
# test_file = 'test_file.txt'
# model_path = 'F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\model'

In [58]:
#shoppee dataset
model_path = 'F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\model'
data = "F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\data_2"
test_data = "F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\NER_Module\\test\\data"
train_file = 'train_80_file_2.txt'
validation_file ='val_20_file_2.txt'
test_file = 'test_file_2.txt'

In [59]:
train_data_path = os.path.join(data,train_file)
validation_data_path = os.path.join(data,validation_file)
test_data_path = os.path.join(data,test_file)

In [17]:
model_dir = 'F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\model'
if not os.path.exists(model_dir):
    os.makedirs(model_dir, exist_ok=True)

In [60]:
# load data
train_dataset = NerGritDataset(train_data_path, tokenizer, lowercase=True)
valid_dataset = NerGritDataset(validation_data_path, tokenizer, lowercase=True)
test_dataset = NerGritDataset(test_data_path, tokenizer, lowercase=True)

train_loader = NerDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=16, num_workers=1, shuffle=True)  
valid_loader = NerDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=16, num_workers=1, shuffle=False)  
test_loader = NerDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=16, num_workers=1, shuffle=False)

In [45]:
train_dataset

<NER_Module.data.dataset.NerGritDataset at 0x1c14999f250>

In [61]:
print(len(train_dataset))
print(len(valid_dataset))
print(len(test_dataset))

240000
60000
50000


## Train model

In [25]:
best_val_metric = -100
count_stop = 0
exp_id = 'notebook_model_18_03_2021'
evaluate_every=5
step_size=1
gamma=0.5
early_stop = 3
valid_criterion= 'F1'
epochs=15

In [26]:
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [27]:
## validation evaluate
# Evaluate function for validation and test
def evaluate(model, data_loader, i2w, is_test=False, device='cpu'):
    model.eval()
    torch.set_grad_enabled(False)
    total_loss, total_correct, total_labels = 0, 0, 0

    list_hyp, list_label, list_seq = [], [], []

    pbar = tqdm(iter(data_loader), leave=True, total=len(data_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device= device)

        
        # Calculate total loss
        test_loss = loss.item()
        total_loss = total_loss + test_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        list_seq += batch_seq
        metrics = ner_metrics_fn(list_hyp, list_label)

        if not is_test:
            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        else:
            pbar.set_description("TEST LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
    
    if is_test:
        return total_loss, metrics, list_hyp, list_label, list_seq
    else:
        return total_loss, metrics

In [28]:
# Train
scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
n_epochs = epochs
for epoch in range(n_epochs):
    print('The best val is :{}'.format(best_val_metric))
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    # evaluate
    if ((epoch+1) % evaluate_every) == 0:
        val_loss, val_metrics = evaluate(model, valid_loader, i2w, is_test=False, device ='cuda')
        print('(Epoch {}) VAL_METRIC :{:.4f}'.format((epoch+1),val_metrics[valid_criterion]))
        # Early stopping
        val_metric = val_metrics[valid_criterion]
        if best_val_metric < val_metric:
            best_val_metric = val_metric
            # save model
            if exp_id is not None:
                torch.save(model.state_dict(), model_dir + "/best_model_" + str(exp_id) + ".th")
            else:
                torch.save(model.state_dict(), model_dir + "/best_model.th")
            count_stop = 0
        else:
            print('The best val is :{} and val_metric is {}'.format(best_val_metric,val_metric))
            count_stop += 1
            print("count stop: {}".format(count_stop))
            if count_stop == early_stop:
                break

  0%|                                                                                        | 0/15000 [00:00<?, ?it/s]

The best val is :-100


(Epoch 1) TRAIN LOSS:0.4245 LR:0.00002000: 100%|█████████████████████████████████| 15000/15000 [20:14<00:00, 12.35it/s]
  0%|                                                                                        | 0/15000 [00:00<?, ?it/s]

(Epoch 1) TRAIN LOSS:0.4245 ACC:0.89 F1:0.68 REC:0.73 PRE:0.63 LR:0.00002000
The best val is :-100


(Epoch 2) TRAIN LOSS:nan LR:0.00002000:  68%|████████████████████████▌           | 10209/15000 [13:48<06:28, 12.32it/s]


KeyboardInterrupt: 

## Predict on test data

In [None]:
## load model if not there

# define model path 
# model_folder = './'
# filename = 'best_model_1.th'
# model_path = os.path.join(model_folder,filename)

# def load_model(model_path):
#     model.load_state_dict(torch.load(model_path))
#     model = model.cuda()
#     return model

# trained_model = load_model(model_path)

In [None]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction
result_df = pd.DataFrame({'label':list_hyp}).reset_index()
print(result_df)

### Save result in csv 

In [None]:
## path 
save_path = model_path
filename = 'shopee_notebook'
filename = filename + '.csv'
result_path = os.path.join(save_path, filename)

In [None]:
result_df.to_csv(result_path, index=False)

## Test on sample sentecnces

In [None]:
text = word_tokenize('Jalan Candi Panggung Barat. No 16 . RT 01 RW 18. Kelurahan Mojolangu, Kecamatan Lowokwaru Malang City , East Java')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

In [None]:
custom_model = load_model(model_path=model_path)

In [19]:
import os
from NER_Module import execute_main

In [20]:
model_path = 'F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\model'
data = "F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\data_3"
test_data = "F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\NER_Module\\test\\data"

In [21]:
test_file = 'test_file.txt'

In [22]:
test_data_path = os.path.join(data,test_file)

In [23]:
## model_path 
model_folder = './'
filename = 'best_model_pipeline_final_model.th'
trained_model_path = os.path.join(model_folder,filename)

In [24]:
result_path ='F:\\Bala_EU_DSVM_BACKUP\\Shopee_challenge\\Shopee_Challenge\\model'

In [25]:
## model initialisation
predict_df = execute_main(train_dataset_path=None, valid_dataset_path=None,
                          test_dataset_path=test_data_path, model_dir=model_folder, model_filename=filename,
                          predict_only=True, random_state=33, device='cuda',
                          result_path=result_path, result_filename='shopee_pipeline_final',
                          convert_shopee=False, shopee_prediction_file=None)

2021-03-20 09:32:32,842 [NER_Module.model.model] [INFO] Loading pretrained indobert model.....
INFO:NER_Module.model.model:Loading pretrained indobert model.....
Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2021-03-20 09:32:41,842 [NER_Module.model.model] [INFO] Initialised model with indo-bert!!!!
INFO:NER_Module.model.model:Initialised model with indo-bert!!!!
2021-03-20 09:32:41,843 [NER_Module.model.model] [INFO] Device: cuda
INFO:NER_Module.model.model:Device: cuda
2021-03-20 09:32:41,845 [NER_Module.model.model] [INFO] Loading model from local path in cuda device
INFO:NER_Module.model.model:Loading model from local path in cuda device
2021-03-20 09:32:42,489 [NER_Module.model.model] [INFO] Loading Tokenizer from indo-Bert
INFO: