## Import

In [None]:
import os

# Change to available GPU.
# Restrict CUDA to the chosen device index; this runs before the deep-learning
# libraries are imported in the next cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "7"})

In [None]:
import nltk
import numpy as np
import sklearn_crfsuite
import tensorflow as tf
import torch
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn_crfsuite import metrics
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from torch import cuda
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertConfig,
    BertForTokenClassification,
    BertTokenizerFast,
)
from tqdm import tqdm
from utils.bert_utils import dataset
from utils.crf_utils import sent2features, sent2labels
from utils.file_utils import open_file_pos_tag
from utils.ner_utils import remove_bio_tags

## Prepare File

In [None]:
# Paths to the three splits; each file must be in CoNLL or CoNLL-U format
train_data_path = ""
val_data_path = ""
test_data_path = ""

# Read the POS-tagged data of every split, in train / validation / test order
train_data, val_data, test_data = (
    open_file_pos_tag(split_path)
    for split_path in (train_data_path, val_data_path, test_data_path)
)

## Model

### CRF

In [None]:
# Turn every sentence of each split into its CRF feature sequence (X_*)
# and its label sequence (y_*)
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))
X_val = list(map(sent2features, val_data))
y_val = list(map(sent2labels, val_data))
X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

In [None]:
# Initialize CRF and trainer
crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs",  # train with the L-BFGS optimizer
    c1=0.1,  # coefficient of the L1 regularization term
    c2=0.1,  # coefficient of the L2 regularization term
    max_iterations=100,  # upper bound on the number of optimizer iterations
)

In [None]:
# Train CRF model
# NOTE(review): X_val / y_val are built in an earlier cell but are not passed to fit().
crf.fit(X_train, y_train)

In [None]:
# Get test result
# Predict the labels for the test-split feature sequences.
y_pred = crf.predict(X_test)

In [None]:
# Report the CRF test metrics over the flattened label sequences, with 4 decimal places.
print(metrics.flat_classification_report(y_test, y_pred, digits=4))

### BERT

In [None]:
# Checkpoint path/identifier handed to from_pretrained(); the same value is
# used later to load the token-classification model.
base_model = "model/indobert-large-p2-finetuned-pos"
# Tokenizer loaded from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
# Per split: keep only the first field of every token entry
# (presumably the word form; verify against open_file_pos_tag).
train_sentences, val_sentences, test_sentences = (
    [[entry[0] for entry in sent] for sent in split]
    for split in (train_data, val_data, test_data)
)

In [None]:
# Per split: keep only the second field of every token entry, which is used
# as the token's label for the BERT model.
train_labels, val_labels, test_labels = (
    [[entry[1] for entry in sent] for sent in split]
    for split in (train_data, val_data, test_data)
)

In [None]:
# Distinct labels seen in any of the three splits
unique_labels = list(
    {label for seq in train_labels + val_labels + test_labels for label in seq}
)
# Ids are assigned in alphabetical label order, so both maps are inverse to each other
labels_to_ids = {label: idx for idx, label in enumerate(sorted(unique_labels))}
ids_to_labels = dict(enumerate(sorted(unique_labels)))

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
## Parameters
# Hyper-parameters of the BERT fine-tuning run.
MAX_LEN = 512  # length argument handed to dataset(...)
TRAIN_BATCH_SIZE = 4  # batch size in train_params
VALID_BATCH_SIZE = 2  # batch size in test_params
EPOCHS = 20  # number of train() calls in the epoch loop
LEARNING_RATE = 1e-5  # learning rate of the Adam optimizer
MAX_GRAD_NORM = 10  # max_norm used for gradient clipping in train()

In [None]:
# Wrap each split (sentences + labels) in the project's dataset class, sharing
# the tokenizer, the length limit and the label-to-id mapping.
training_set, validation_set, testing_set = (
    dataset(split_sentences, split_labels, tokenizer, MAX_LEN, labels_to_ids)
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (val_sentences, val_labels),
        (test_sentences, test_labels),
    )
)

In [None]:
# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 4
                }

# Evaluation loaders must not shuffle: the validation/test loss and accuracy
# are averaged per batch, so a different batch composition on every run would
# make the reported numbers non-reproducible.
val_params = {**train_params, 'shuffle': False}

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 4
               }

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(validation_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Load the checkpoint with a token-classification head sized to the label set;
# the id<->label maps are handed to from_pretrained() and read back through
# model.config.id2label in valid().
model = BertForTokenClassification.from_pretrained(base_model, num_labels=len(unique_labels), id2label=ids_to_labels, label2id=labels_to_ids)
# Move the model to the selected device.
model.to(device)

In [None]:
# Adam over all model parameters, using the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def _active_token_accuracy(logits, labels, num_labels):
    """Return the accuracy over positions whose label is not -100.

    Args:
        logits: Model logits; reshaped here to ``(-1, num_labels)``.
        labels: Label ids; flattened here, positions equal to -100 are ignored.
        num_labels: Size of the last logits dimension.

    Returns:
        The ``accuracy_score`` of the predicted ids on the non-ignored positions.
    """
    flat_targets = labels.view(-1)  # shape (batch_size * seq_len,)
    flat_predictions = torch.argmax(logits.view(-1, num_labels), dim=1)
    active = flat_targets != -100  # boolean mask, same flattened shape
    return accuracy_score(
        torch.masked_select(flat_targets, active).cpu().numpy(),
        torch.masked_select(flat_predictions, active).cpu().numpy(),
    )


def train(epoch):
    """Train for one epoch, then evaluate once on the validation loader.

    Uses the notebook-level globals ``model``, ``optimizer``, ``device``,
    ``training_loader``, ``val_loader`` and ``MAX_GRAD_NORM``.

    Args:
        epoch: Index of the current epoch. It is not used inside the function
            and is kept only so existing callers keep working.

    Returns:
        None. The averaged (per batch) training and validation loss and
        accuracy are printed.
    """
    tr_loss, tr_accuracy, nb_tr_steps = 0, 0, 0
    val_loss, val_accuracy, nb_val_steps = 0, 0, 0

    # put model in training mode
    model.train()

    for batch in tqdm(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        tr_logits = outputs[1]
        tr_loss += loss.item()
        nb_tr_steps += 1

        # training accuracy on the active (non -100) labels only
        tr_accuracy += _active_token_accuracy(tr_logits, labels, model.num_labels)

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must run after backward() and before step(),
        # otherwise the gradients applied by the optimizer are never clipped
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    # put model in evaluation mode; no gradients are needed for validation,
    # so skip building the autograd graph
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            val_logits = outputs[1]
            val_loss += loss.item()
            nb_val_steps += 1

            # validation accuracy on the active (non -100) labels only
            val_accuracy += _active_token_accuracy(val_logits, labels, model.num_labels)

    # max(..., 1) avoids a ZeroDivisionError when a loader yields no batch
    epoch_loss = tr_loss / max(nb_tr_steps, 1)
    tr_accuracy = tr_accuracy / max(nb_tr_steps, 1)
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    print(f"Validation loss epoch: {val_loss/max(nb_val_steps, 1)}")
    print(f"Validation accuracy epoch: {val_accuracy/max(nb_val_steps, 1)}")

In [None]:
# Run EPOCHS passes; every train() call trains on training_loader, then
# evaluates on val_loader and prints the metrics.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")  # 1-based epoch number for display
    train(epoch)

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return the label strings.

    Uses the notebook-level global ``device``. The averaged (per batch) loss
    and accuracy are printed.

    Args:
        model: Token-classification model; called with ``input_ids``,
            ``attention_mask`` and ``labels``. It must expose ``num_labels``
            and ``config.id2label``.
        testing_loader: Iterable of batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.

    Returns:
        A ``(labels, predictions)`` tuple. Each is a list with one
        single-element list per active token (label id != -100), holding the
        gold and the predicted label string respectively.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    # evaluation needs no gradients; skipping the autograd graph saves memory
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            eval_logits = outputs[1]

            eval_loss += loss.item()
            nb_eval_steps += 1

            # compute evaluation accuracy
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

            # only compute accuracy at active labels
            active_accuracy = flattened_targets != -100  # shape (batch_size * seq_len,)

            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            # store plain Python ints instead of one 0-d tensor per token
            eval_labels.extend(labels.tolist())
            eval_preds.extend(predictions.tolist())

            eval_accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())

    labels = [[model.config.id2label[label_id]] for label_id in eval_labels]
    predictions = [[model.config.id2label[label_id]] for label_id in eval_preds]
    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batch
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    eval_accuracy = eval_accuracy / max(nb_eval_steps, 1)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
# Evaluate the fine-tuned model on the test loader; both results are lists of
# single-label lists, one per evaluated token.
labels, predictions = valid(model, testing_loader)

In [None]:
# First report: seqeval's classification_report.
# NOTE(review): seqeval presumably scores BIO-style chunks; confirm it is meaningful for these labels.
print(classification_report(labels, predictions, digits=4))
# Second report: sklearn_crfsuite's report over the flattened label lists.
print(metrics.flat_classification_report(labels, predictions, digits=4))

### Bi-LSTM

In [None]:
# Concatenate the three splits; the following cells derive the maximum
# sentence length, the vocabulary and the tag set from this combined data.
all_data = train_data + val_data + test_data

In [None]:
# Longest sentence (in tokens) across all splits; used as the padded length
maxlen = max(map(len, all_data))
print('Maximum sequence length:', maxlen)

In [None]:
# Vocabulary: every distinct first-field value of a token entry in any split
words = list({entry[0] for sent in all_data for entry in sent})
# Extra last entry; its index is used as the padding id for word sequences
words.append("ENDPAD")

In [None]:
# Vocabulary size, counting the trailing "ENDPAD" entry
n_words = len(words)
n_words  # echo the value as the cell output

In [None]:
# Tag set: every distinct second-field value of a token entry in any split
tags = list({entry[1] for sent in all_data for entry in sent})
n_tags = len(tags)

In [None]:
# Map every word / tag to its position in the corresponding list
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
# Encode the words of the train and test sentences as vocabulary ids.
# NOTE(review): val_data is not encoded here; model.fit() uses validation_split instead.
X = [[word2idx[entry[0]] for entry in sent] for sent in train_data]
X_test = [[word2idx[entry[0]] for entry in sent] for sent in test_data]

In [None]:
# Right-pad every id sequence to maxlen; the filler id n_words - 1 is the
# index of the "ENDPAD" entry that was appended last to `words`.
X = pad_sequences(sequences=X, maxlen=maxlen, value=n_words - 1, padding="post")
X_test = pad_sequences(sequences=X_test, maxlen=maxlen, value=n_words - 1, padding="post")

In [None]:
# Encode the tags of the train and test sentences as tag ids
y = [[tag2idx[entry[1]] for entry in sent] for sent in train_data]
y_test = [[tag2idx[entry[1]] for entry in sent] for sent in test_data]

In [None]:
# Right-pad the tag-id sequences to maxlen with the id of the "O" tag.
# NOTE(review): tag2idx["O"] raises KeyError when the tag set has no "O" tag;
# confirm the data contains it.
y = pad_sequences(sequences=y, maxlen=maxlen, value=tag2idx["O"], padding="post")
y_test = pad_sequences(sequences=y_test, maxlen=maxlen, value=tag2idx["O"], padding="post")

In [None]:
# One-hot encode the padded training tag ids (n_tags classes per position)
y = [to_categorical(tag_row, num_classes=n_tags) for tag_row in y]

In [None]:
# Input: sequences of maxlen word ids.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# the following cell builds Model(input, out).
input = Input(shape=(maxlen,))
# `model` is reused as the running layer output. The embedding size is set to
# maxlen (output_dim=maxlen).
model = Embedding(input_dim=n_words, output_dim=maxlen, input_length=maxlen)(input)
model = Dropout(0.1)(model)
# Bidirectional LSTM that returns one output per time step.
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer

In [None]:
# Build the Keras model from the input tensor to the per-token softmax output;
# this rebinds the notebook-level name `model`.
model = Model(input, out)

In [None]:
# Targets are one-hot encoded (see to_categorical above), hence the categorical
# cross-entropy loss; precision and recall are tracked during training.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),])

In [None]:
# Train for 20 epochs with batches of 32; validation_split=0.05 holds out 5% of
# the training arrays for validation.
# NOTE(review): val_data is not used for validation here.
history = model.fit(X, np.array(y), batch_size=32, epochs=20, validation_split=0.05, verbose=1)

In [None]:
# Inverse of tag2idx: tag id -> tag string
idx2tag = {tag_id: tag for tag, tag_id in tag2idx.items()}

def pred2label(pred):
    """Convert per-token score vectors into tag strings.

    For every token the index of the highest score is looked up in the
    notebook-level ``idx2tag`` mapping, and any "PAD" substring in the
    resulting tag is replaced by "O".

    Args:
        pred: Iterable of sequences; each sequence is an iterable of per-token
            score vectors.

    Returns:
        A list of lists of tag strings with the same nesting as ``pred``.
    """
    return [
        [idx2tag[np.argmax(scores)].replace("PAD", "O") for scores in sequence]
        for sequence in pred
    ]

def tolabel(pred):
    """Convert sequences of tag ids into sequences of tag strings.

    Every id is looked up in the notebook-level ``idx2tag`` mapping.

    Args:
        pred: Iterable of sequences of tag ids.

    Returns:
        A list of lists of tag strings with the same nesting as ``pred``.
    """
    return [[idx2tag[tag_id] for tag_id in sequence] for sequence in pred]

In [None]:
# Predict per-token tag scores for the padded test sequences
test_pred = model.predict(X_test, verbose=1)

# Sequences were right-padded to maxlen, so cut every predicted / gold sequence
# back to the real sentence length. Otherwise the padded positions are scored
# as well and inflate the reported metrics.
pred_labels = [
    seq[:len(sent)] for seq, sent in zip(pred2label(test_pred), test_data)
]
test_labels = [
    seq[:len(sent)] for seq, sent in zip(tolabel(y_test), test_data)
]

In [None]:
# First report: seqeval's classification_report on the Bi-LSTM test predictions.
# NOTE(review): seqeval presumably scores BIO-style chunks; confirm it is meaningful for these labels.
print(classification_report(test_labels, pred_labels, digits=4))
# Second report: sklearn_crfsuite's report over the flattened label sequences.
print(metrics.flat_classification_report(test_labels, pred_labels, digits=4))