In [1]:
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Flatten, Dense, TimeDistributed, \
    SpatialDropout1D, Bidirectional, Conv1D, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D, \
    Concatenate, concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

%matplotlib inline

In [2]:
# class ConfusionMatrixMetric(tf.keras.metrics.Metric):
#     def __init__(self, num_classes, **kwargs):
#         super(ConfusionMatrixMetric,self).__init__(name='confusion_matrix_metric',**kwargs)
#         self.num_classes=num_classes
#         self.total_cm = self.add_weight("total", shape=(num_classes,num_classes), initializer="zeros")
        
#     def reset_states(self):
#         for s in self.variables:
#             s.assign(tf.zeros(shape=s.shape))
            
#     def update_state(self, y_true, y_pred,sample_weight=None):
#         self.total_cm.assign_add(self.confusion_matrix(y_true,y_pred))
#         return self.total_cm
        
#     def result(self):
#         return self.process_confusion_matrix()
    
#     def confusion_matrix(self,y_true, y_pred):
#         y_pred = tf.argmax(y_pred, axis=2)
#         y_true = tf.argmax(y_true, axis=2)

#         y_pred = tf.reshape(y_pred, [-1])
#         y_true = tf.reshape(y_true, [-1])
  
#         cm = tf.math.confusion_matrix(
#             y_true, 
#             y_pred, 
#             dtype=tf.float32, 
#             num_classes=self.num_classes
#         )
        
#         return cm
    
#     def process_confusion_matrix(self):
#         cm = self.total_cm
#         diag_part=tf.linalg.diag_part(cm)
#         precision=diag_part/(tf.reduce_sum(cm,0)+tf.constant(1e-15))
#         recall=diag_part/(tf.reduce_sum(cm,1)+tf.constant(1e-15))
#         f1=2*precision*recall/(precision+recall+tf.constant(1e-15))
#         return precision, recall, f1
    
#     def fill_output(self,output):
#         results=self.result()
#         for i in range(self.num_classes):
#             output['precision_{}'.format(i)]=results[0][i]
#             output['recall_{}'.format(i)]=results[1][i]
#             output['f1_{}'.format(i)]=results[2][i]

In [3]:
df = pd.read_csv('../data/data.csv')

In [4]:
df.head()

Unnamed: 0,word,tag
0,Pengamat,O
1,politik,O
2,dari,O
3,Universitas,B-ORGANIZATION
4,Gadjah,I-ORGANIZATION


In [5]:
sentences = []
cnt = 1

for i in df.itertuples():
    sentences.append(cnt)
    
    if '.' in str(i.word):
        cnt += 1
        
df['sentence #'] = sentences
df.head()

Unnamed: 0,word,tag,sentence #
0,Pengamat,O,1
1,politik,O,1
2,dari,O,1
3,Universitas,B-ORGANIZATION,1
4,Gadjah,I-ORGANIZATION,1


In [6]:
agg_func = lambda s: [(w, t) for w, t in zip(s['word'].values.tolist(), s['tag'].values.tolist())]
grouped = df.groupby('sentence #').apply(agg_func)
sentences = [s for s in grouped]

In [7]:
words = list(set(df['word'].values))
words.append('PADDING')
num_words = len(words)
tags = list(set(df['tag'].values))
num_tags = len(tags)

In [8]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate([tag for tag in tags if tag != 'O'])}
tag2idx['O'] = len(tags)-1

In [9]:
tag2idx

{'B-LOCATION': 0,
 'L-PERSON': 1,
 'U-TIME': 2,
 'B-ORGANIZATION': 3,
 'I-PERSON': 4,
 'L-ORGANIZATION': 5,
 'L-TIME': 6,
 'B-QUANTITY': 7,
 'I-ORGANIZATION': 8,
 'B-PERSON': 9,
 'B-TIME': 10,
 'U-ORGANIZATION': 11,
 'U-PERSON': 12,
 'I-QUANTITY': 13,
 'I-LOCATION': 14,
 'U-LOCATION': 15,
 'L-QUANTITY': 16,
 'U-QUANTITY': 17,
 'L-LOCATION': 18,
 'I-TIME': 19,
 'O': 20}

In [45]:
max_len = 80
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding='post', value=num_words-1)

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding='post', value=tag2idx['O'])
y = [to_categorical(i, num_classes=num_tags) for i in y]

In [46]:
x_train, x_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=1)

In [47]:
model = gensim.models.Word2Vec.load("../checkpoint/w2vec_wiki_id_case")

In [48]:
embedding_matrix = np.zeros((len(words), 400))

for i, w in enumerate(words):
    try:
        embedding_vector = model.wv[w]
        embedding_matrix[i] = embedding_vector
    except:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), 400)

In [49]:
embedding_matrix.shape

(13031, 400)

In [50]:
input_sequence = Input(shape=(max_len, ))
model = Embedding(
    input_dim=embedding_matrix.shape[0], 
    weights=[embedding_matrix], 
    output_dim=embedding_matrix.shape[1], 
    input_length=max_len,
    trainable=False
)(input_sequence)
model = SpatialDropout1D(0.5)(model)
model2g = Conv1D(128, 2, activation='relu', padding='same')(model)
model4g = Conv1D(128, 4, activation='relu', padding='same')(model)
model6g = Conv1D(128, 6, activation='relu', padding='same')(model)
model8g = Conv1D(128, 8, activation='relu', padding='same')(model)
model10g = Conv1D(128, 10, activation='relu', padding='same')(model)
model = concatenate([model2g, model4g, model6g, model8g, model10g])
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.5))(model)
output_sequence = Dense(num_tags, activation='softmax')(model)
model = Model(input_sequence, output_sequence)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 80)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 80, 400)      5212400     input_4[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 80, 400)      0           embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_15 (Conv1D)              (None, 80, 128)      102528      spatial_dropout1d_3[0][0]        
____________________________________________________________________________________________

In [51]:
f1_score = tfa.metrics.F1Score(
    num_classes=num_tags,
    average='micro',
    name='f1_score',
    threshold=0.5
)

loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(lr=0.001)

model.compile(loss=loss, optimizer=optimizer, metrics=[f1_score])

In [52]:
early_stopping = EarlyStopping(patience=10, )

history = model.fit(
    x_train, np.array(y_train),
    validation_data=(x_val, np.array(y_val)),
    batch_size=64,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping]
)

Train on 3424 samples, validate on 734 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100


In [53]:
from sklearn.metrics import f1_score

In [54]:
preds = []
actuals = []

for i, xt in enumerate(x_val):
    p = model.predict(np.array([xt]))
    p = np.argmax(p, axis=-1)
    y_true = np.argmax(np.array(y_val), axis=-1)[i]
    
    for true, pred in zip(y_true, p[0]):
        actuals.append(true)
        preds.append(pred)
        
print(f1_score(preds, actuals, average='micro'))

0.9808242506811989


In [55]:
preds = []
actuals = []

for i, xt in enumerate(x_val):
    p = model.predict(np.array([xt]))
    p = np.argmax(p, axis=-1)
    y_true = np.argmax(np.array(y_val), axis=-1)[i]
    
    if 13030 in list(xt):
        first_pad_idx = list(xt).index(13030)
        _zip = zip(y_true[:first_pad_idx], p[0][:first_pad_idx])
    else:
        _zip = zip(y_true, p[0])
    
    for true, pred in _zip:
        actuals.append(true)
        preds.append(pred)

print(f1_score(preds, actuals, average='micro'))

0.9240677051722976
