In [1]:
import talos as ta
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Flatten, Dense, TimeDistributed, \
    SpatialDropout1D, Bidirectional, Conv1D, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

%matplotlib inline

Using TensorFlow backend.


In [2]:
# class ConfusionMatrixMetric(tf.keras.metrics.Metric):
#     def __init__(self, num_classes, **kwargs):
#         super(ConfusionMatrixMetric,self).__init__(name='confusion_matrix_metric',**kwargs)
#         self.num_classes=num_classes
#         self.total_cm = self.add_weight("total", shape=(num_classes,num_classes), initializer="zeros")
        
#     def reset_states(self):
#         for s in self.variables:
#             s.assign(tf.zeros(shape=s.shape))
            
#     def update_state(self, y_true, y_pred,sample_weight=None):
#         self.total_cm.assign_add(self.confusion_matrix(y_true,y_pred))
#         return self.total_cm
        
#     def result(self):
#         return self.process_confusion_matrix()
    
#     def confusion_matrix(self,y_true, y_pred):
#         y_pred = tf.argmax(y_pred, axis=2)
#         y_true = tf.argmax(y_true, axis=2)

#         y_pred = tf.reshape(y_pred, [-1])
#         y_true = tf.reshape(y_true, [-1])
  
#         cm = tf.math.confusion_matrix(
#             y_true, 
#             y_pred, 
#             dtype=tf.float32, 
#             num_classes=self.num_classes
#         )
        
#         return cm
    
#     def process_confusion_matrix(self):
#         cm = self.total_cm
#         diag_part=tf.linalg.diag_part(cm)
#         precision=diag_part/(tf.reduce_sum(cm,0)+tf.constant(1e-15))
#         recall=diag_part/(tf.reduce_sum(cm,1)+tf.constant(1e-15))
#         f1=2*precision*recall/(precision+recall+tf.constant(1e-15))
#         return precision, recall, f1
    
#     def fill_output(self,output):
#         results=self.result()
#         for i in range(self.num_classes):
#             output['precision_{}'.format(i)]=results[0][i]
#             output['recall_{}'.format(i)]=results[1][i]
#             output['f1_{}'.format(i)]=results[2][i]

In [3]:
class SeqModel(tf.keras.Sequential):
    def train_step(self, data):
        x, y = data
        
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(
                y,
                y_pred,
                regularization_losses=self.losses,
            )

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.compiled_metrics.update_state(y, y_pred)
        output={m.name: m.result() for m in self.metrics[:-1]}
        
        if 'confusion_matrix_metric' in self.metrics_names:
            self.metrics[-1].fill_output(output)

        return output
        
        
    def test_step(self, data):
        x, y = data

        y_pred = self(x, training=False)
        loss = self.compiled_loss(
            y,
            y_pred,
            regularization_losses=self.losses,
        )

        self.compiled_metrics.update_state(y, y_pred)
        output={m.name: m.result() for m in self.metrics[:-1]}
        
        if 'confusion_matrix_metric' in self.metrics_names:
            self.metrics[-1].fill_output(output)    
        
        return output

In [4]:
df = pd.read_csv('../data/data.csv')

In [5]:
df.head()

Unnamed: 0,word,tag
0,Pengamat,O
1,politik,O
2,dari,O
3,Universitas,B-ORGANIZATION
4,Gadjah,I-ORGANIZATION


In [6]:
sentences = []
cnt = 1

for i in df.itertuples():
    sentences.append(cnt)
    
    if '.' in str(i.word):
        cnt += 1
        
df['sentence #'] = sentences
df.head()

Unnamed: 0,word,tag,sentence #
0,Pengamat,O,1
1,politik,O,1
2,dari,O,1
3,Universitas,B-ORGANIZATION,1
4,Gadjah,I-ORGANIZATION,1


In [7]:
agg_func = lambda s: [(w, t) for w, t in zip(s['word'].values.tolist(), s['tag'].values.tolist())]
grouped = df.groupby('sentence #').apply(agg_func)
sentences = [s for s in grouped]

In [8]:
words = list(set(df['word'].values))
words.append('PADDING')
num_words = len(words)
tags = list(set(df['tag'].values))
num_tags = len(tags)

In [9]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate([tag for tag in tags if tag != 'O'])}
tag2idx['O'] = len(tags)-1

In [10]:
tag2idx

{'U-QUANTITY': 0,
 'U-ORGANIZATION': 1,
 'I-ORGANIZATION': 2,
 'B-QUANTITY': 3,
 'I-TIME': 4,
 'L-TIME': 5,
 'L-ORGANIZATION': 6,
 'B-LOCATION': 7,
 'I-QUANTITY': 8,
 'B-ORGANIZATION': 9,
 'B-TIME': 10,
 'I-LOCATION': 11,
 'L-PERSON': 12,
 'I-PERSON': 13,
 'U-TIME': 14,
 'U-LOCATION': 15,
 'L-LOCATION': 16,
 'U-PERSON': 17,
 'B-PERSON': 18,
 'L-QUANTITY': 19,
 'O': 20}

In [11]:
num_words

13031

In [12]:
model = gensim.models.Word2Vec.load("../checkpoint/w2vec_wiki_id_case")

In [13]:
embedding_matrix = np.zeros((len(words), 400))

for i, w in enumerate(words):
    try:
        embedding_vector = model.wv[w]
        embedding_matrix[i] = embedding_vector
    except:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), 400)

In [14]:
max_len = 20
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding='post', value=num_words-1)

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding='post', value=tag2idx['O'])
y = [to_categorical(i, num_classes=num_tags) for i in y]

In [15]:
np.array(y).shape

(4892, 20, 21)

In [16]:
x_train, x_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.3, random_state=1)
x_val, x_test, y_val, y_test = train_test_split(x_tmp, y_tmp, test_size=0.5, random_state=1)

In [51]:
model = SeqModel([
    Input(shape=(max_len, )),
    Embedding(
        input_dim=embedding_matrix.shape[0], 
        weights=[embedding_matrix], 
        output_dim=embedding_matrix.shape[1], 
        input_length=max_len,
        trainable=False
    ),
    Conv1D(128, 3, activation='relu', padding='same'),
    SpatialDropout1D(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(num_tags, activation='softmax'))
])
model.summary()

Model: "seq_model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 80, 400)           5212400   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 80, 128)           153728    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 80, 128)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 80, 200)           183200    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 80, 21)            4221      
Total params: 5,553,549
Trainable params: 341,149
Non-trainable params: 5,212,400
_________________________________________________________________


In [52]:
f1_score = tfa.metrics.F1Score(
    num_classes=num_tags,
    average='micro',
    name='f1_score',
    threshold=0.5
)

loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

model.compile(loss=loss, optimizer=optimizer, metrics=[f1_score, ConfusionMatrixMetric(num_tags)])

es=tf.keras.callbacks.EarlyStopping(patience=10)

history = model.fit(
    x_train, np.array(y_train),
    validation_data=(x_val, np.array(y_val)),
    epochs=100, verbose=3, callbacks=[es], batch_size=64
)

Train on 3424 samples, validate on 734 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100


In [53]:
tags = [t for t in tag2idx]

In [54]:
from sklearn.metrics import f1_score

In [55]:
preds = []
actuals = []

for i, xt in enumerate(x_val):
    p = model.predict(np.array([xt]))
    p = np.argmax(p, axis=-1)
    y_true = np.argmax(np.array(y_val), axis=-1)[i]
    
    for true, pred in zip(y_true, p[0]):
        actuals.append(true)
        preds.append(pred)
        
print(f1_score(preds, actuals, average='micro'))
print(precision_score(preds, actuals, average='micro'))
print(recall_score(preds, actuals, average='micro'))

0.9827145776566758
0.9827145776566758
0.9827145776566758


In [56]:
print(classification_report(actuals, preds, target_names=tags, ))

                precision    recall  f1-score   support

      I-PERSON       0.64      0.67      0.66        57
I-ORGANIZATION       0.65      0.53      0.58       150
      U-PERSON       0.85      0.85      0.85       256
        B-TIME       0.83      0.52      0.64        48
    U-LOCATION       0.78      0.81      0.79       211
    L-LOCATION       0.77      0.79      0.78       109
L-ORGANIZATION       0.63      0.75      0.69       142
    I-QUANTITY       0.56      0.25      0.35        59
      B-PERSON       0.85      0.85      0.85       227
    I-LOCATION       0.76      0.72      0.74        67
    L-QUANTITY       0.55      0.28      0.38        74
        U-TIME       0.62      0.25      0.36        20
    U-QUANTITY       0.00      0.00      0.00         1
    B-LOCATION       0.80      0.75      0.77       109
        I-TIME       0.83      0.82      0.83       136
B-ORGANIZATION       0.70      0.75      0.72       142
        L-TIME       0.78      0.62      0.69  

In [57]:
preds = []
actuals = []

for i, xt in enumerate(x_val):
    p = model.predict(np.array([xt]))
    p = np.argmax(p, axis=-1)
    y_true = np.argmax(np.array(y_val), axis=-1)[i]
    
    if 13030 in list(xt):
        first_pad_idx = list(xt).index(13030)
        _zip = zip(y_true[:first_pad_idx], p[0][:first_pad_idx])
    else:
        _zip = zip(y_true, p[0])
    
    for true, pred in _zip:
        actuals.append(true)
        preds.append(pred)

print(f1_score(preds, actuals, average='micro'))
print(precision_score(preds, actuals, average='micro'))
print(recall_score(preds, actuals, average='micro'))

0.9315530379661474
0.9315530379661474
0.9315530379661474
