In [1]:
import talos as ta
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Flatten, Dense, TimeDistributed, \
    SpatialDropout1D, Bidirectional, Conv1D, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

%matplotlib inline

Using TensorFlow backend.


In [2]:
# class ConfusionMatrixMetric(tf.keras.metrics.Metric):
#     def __init__(self, num_classes, **kwargs):
#         super(ConfusionMatrixMetric,self).__init__(name='confusion_matrix_metric',**kwargs)
#         self.num_classes=num_classes
#         self.total_cm = self.add_weight("total", shape=(num_classes,num_classes), initializer="zeros")
        
#     def reset_states(self):
#         for s in self.variables:
#             s.assign(tf.zeros(shape=s.shape))
            
#     def update_state(self, y_true, y_pred,sample_weight=None):
#         self.total_cm.assign_add(self.confusion_matrix(y_true,y_pred))
#         return self.total_cm
        
#     def result(self):
#         return self.process_confusion_matrix()
    
#     def confusion_matrix(self,y_true, y_pred):
#         y_pred = tf.argmax(y_pred, axis=2)
#         y_true = tf.argmax(y_true, axis=2)

#         y_pred = tf.reshape(y_pred, [-1])
#         y_true = tf.reshape(y_true, [-1])
  
#         cm = tf.math.confusion_matrix(
#             y_true, 
#             y_pred, 
#             dtype=tf.float32, 
#             num_classes=self.num_classes
#         )
        
#         return cm
    
#     def process_confusion_matrix(self):
#         cm = self.total_cm
#         diag_part=tf.linalg.diag_part(cm)
#         precision=diag_part/(tf.reduce_sum(cm,0)+tf.constant(1e-15))
#         recall=diag_part/(tf.reduce_sum(cm,1)+tf.constant(1e-15))
#         f1=2*precision*recall/(precision+recall+tf.constant(1e-15))
#         return precision, recall, f1
    
#     def fill_output(self,output):
#         results=self.result()
#         for i in range(self.num_classes):
#             output['precision_{}'.format(i)]=results[0][i]
#             output['recall_{}'.format(i)]=results[1][i]
#             output['f1_{}'.format(i)]=results[2][i]

In [3]:
class SeqModel(tf.keras.Sequential):
    def train_step(self, data):
        x, y = data
        
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(
                y,
                y_pred,
                regularization_losses=self.losses,
            )

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.compiled_metrics.update_state(y, y_pred)
        output={m.name: m.result() for m in self.metrics[:-1]}
        
        if 'confusion_matrix_metric' in self.metrics_names:
            self.metrics[-1].fill_output(output)

        return output
        
        
    def test_step(self, data):
        x, y = data

        y_pred = self(x, training=False)
        loss = self.compiled_loss(
            y,
            y_pred,
            regularization_losses=self.losses,
        )

        self.compiled_metrics.update_state(y, y_pred)
        output={m.name: m.result() for m in self.metrics[:-1]}
        
        if 'confusion_matrix_metric' in self.metrics_names:
            self.metrics[-1].fill_output(output)    
        
        return output

In [4]:
df = pd.read_csv('../data/data.csv')

In [5]:
df.head()

Unnamed: 0,word,tag
0,Pengamat,O
1,politik,O
2,dari,O
3,Universitas,B-ORGANIZATION
4,Gadjah,I-ORGANIZATION


In [6]:
sentences = []
cnt = 1

for i in df.itertuples():
    sentences.append(cnt)
    
    if '.' in str(i.word):
        cnt += 1
        
df['sentence #'] = sentences
df.head()

Unnamed: 0,word,tag,sentence #
0,Pengamat,O,1
1,politik,O,1
2,dari,O,1
3,Universitas,B-ORGANIZATION,1
4,Gadjah,I-ORGANIZATION,1


In [7]:
agg_func = lambda s: [(w, t) for w, t in zip(s['word'].values.tolist(), s['tag'].values.tolist())]
grouped = df.groupby('sentence #').apply(agg_func)
sentences = [s for s in grouped]

In [8]:
words = list(set(df['word'].values))
words.append('PADDING')
num_words = len(words)
tags = list(set(df['tag'].values))
num_tags = len(tags)

In [9]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate([tag for tag in tags if tag != 'O'])}
tag2idx['O'] = len(tags)-1

In [10]:
tag2idx

{'I-TIME': 0,
 'B-TIME': 1,
 'I-PERSON': 2,
 'I-ORGANIZATION': 3,
 'B-ORGANIZATION': 4,
 'U-LOCATION': 5,
 'U-QUANTITY': 6,
 'U-ORGANIZATION': 7,
 'L-PERSON': 8,
 'B-LOCATION': 9,
 'U-PERSON': 10,
 'B-QUANTITY': 11,
 'L-ORGANIZATION': 12,
 'B-PERSON': 13,
 'U-TIME': 14,
 'I-QUANTITY': 15,
 'L-TIME': 16,
 'L-QUANTITY': 17,
 'I-LOCATION': 18,
 'L-LOCATION': 19,
 'O': 20}

In [11]:
num_words

13031

In [12]:
max_len = 80
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding='post', value=num_words-1)

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding='post', value=tag2idx['O'])
y = [to_categorical(i, num_classes=num_tags) for i in y]

In [13]:
np.array(y).shape

(4892, 80, 21)

In [14]:
x_train, x_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.3, random_state=1)
x_val, x_test, y_val, y_test = train_test_split(x_tmp, y_tmp, test_size=0.5, random_state=1)

In [15]:
model = gensim.models.Word2Vec.load("../checkpoint/w2vec_wiki_id_case")

In [16]:
embedding_matrix = np.zeros((len(words), 400))

for i, w in enumerate(words):
    try:
        embedding_vector = model.wv[w]
        embedding_matrix[i] = embedding_vector
    except:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), 400)

In [31]:
params = {
    'optimizer': ['Adam'],
    'lr': [0.01],
    'units': [50],
    'dropout': [0.5],
    'batch_size': [64, 128, 256]
}

In [32]:
def create_model(x_train, y_train, x_val, y_val, params):
    model = SeqModel([
        Input(shape=(max_len, )),
        Embedding(
            input_dim= embedding_matrix.shape[0], 
            weights=[embedding_matrix], 
            output_dim=embedding_matrix.shape[1], 
            input_length=max_len,
            trainable=False
        ),
        SpatialDropout1D(params['dropout']),
        Bidirectional(LSTM(units=params['units'], return_sequences=True, recurrent_dropout=params['dropout'])),
        Dense(num_tags, activation='softmax')
    ])

    f1_score_m = tfa.metrics.F1Score(
        num_classes=num_tags,
        average='micro',
        name='f1_score',
        threshold=0.5
    )
    
    if params['optimizer'] == 'Nadam':
        optm = tf.keras.optimizers.Nadam(lr=params['lr'], )
        
    if params['optimizer'] == 'Adam':
        optm = tf.keras.optimizers.Adam(lr=params['lr'])
        
    if params['optimizer'] == 'RMSprop':
        optm = tf.keras.optimizers.RMSprop(lr=params['lr'])
    
    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(), 
        optimizer=optm,
        metrics=[f1_score_m]
    )
    
    es = tf.keras.callbacks.EarlyStopping(patience=10)

    history = model.fit(
        x_train, np.array(y_train),
        validation_data=(x_val, np.array(y_val)),
        epochs=100, verbose=1, callbacks=[es], batch_size=params['batch_size']
    )
    
    return history, model

In [33]:
t = ta.Scan(x=x_train,
            y=np.array(y_train),
            x_val=x_val,
            y_val=np.array(y_val),
            model=create_model,
            params=params,
            experiment_name='bilstm_w2v_opt_v2', 
            val_split=None,
)

  0%|          | 0/4 [00:00<?, ?it/s]

Train on 3424 samples, validate on 734 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100


 25%|██▌       | 1/4 [07:26<22:20, 446.70s/it]

Train on 3424 samples, validate on 734 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


 50%|█████     | 2/4 [15:10<15:03, 451.88s/it]

Train on 3424 samples, validate on 734 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100


 75%|███████▌  | 3/4 [23:40<07:49, 469.39s/it]

Train on 3424 samples, validate on 734 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100


100%|██████████| 4/4 [35:21<00:00, 530.33s/it]


In [34]:
t.data[['loss', 'f1_score', 'val_loss', 'val_f1_score', 'batch_size']] \
    .sort_values(by=['val_f1_score'], ascending=False)

Unnamed: 0,loss,f1_score,val_loss,val_f1_score,batch_size
1,0.035668,0.988037,0.054054,0.984472,64
3,0.039222,0.986979,0.051227,0.984379,256
2,0.036361,0.987768,0.054267,0.984085,128
0,0.037876,0.987528,0.056256,0.983888,32
