In [None]:
import tensorflow as tf
tf.random.set_seed(1)
tf.compat.v1.enable_eager_execution()

import numpy as np
np.random.seed(1)

import gensim
import pandas as pd
import matplotlib.pyplot as plt
import talos as ta

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, \
    SpatialDropout1D, Bidirectional, Conv1D, concatenate
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import Callback

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix

%matplotlib inline

In [None]:
# Print the GPU device name reported by TensorFlow.
print(tf.test.gpu_device_name())

/device:GPU:0


In [None]:
# Load the token-level NER data; later cells read its 'word' and 'tag' columns.
# NOTE(review): read_csv's default NA handling turns tokens such as "NA" or
# "null" into NaN — confirm the corpus has no such tokens, or pass
# keep_default_na=False.
df = pd.read_csv('../data/data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,word,tag
0,Pengamat,O
1,politik,O
2,dari,O
3,Universitas,B-ORGANIZATION
4,Gadjah,I-ORGANIZATION


In [None]:
# Give every token a 1-based sentence number. A token whose text contains '.'
# closes the current sentence, so the token after it starts the next one.
ends_sentence = df['word'].map(lambda token: '.' in str(token)).astype(int)

# Sentence number = 1 + count of sentence-closing tokens strictly before the row.
sentences = (ends_sentence.cumsum().shift(1, fill_value=0) + 1).tolist()

df['sentence #'] = sentences
df.head()

Unnamed: 0,word,tag,sentence #
0,Pengamat,O,1
1,politik,O,1
2,dari,O,1
3,Universitas,B-ORGANIZATION,1
4,Gadjah,I-ORGANIZATION,1


In [None]:
def agg_func(s):
    """Return the rows of one sentence group as a list of (word, tag) pairs."""
    return list(zip(s['word'].tolist(), s['tag'].tolist()))


# One list of (word, tag) pairs per value of 'sentence #'.
grouped = df.groupby('sentence #').apply(agg_func)
sentences = list(grouped)

In [None]:
# Vocabulary: every distinct token plus a trailing 'PADDING' entry, whose index
# (num_words - 1) is the fill value used when word sequences are padded.
#
# The distinct values are sorted instead of taken in set-iteration order:
# string hashing is randomised per interpreter process, so list(set(...))
# yields a different word -> index assignment (and a different embedding row
# order) on every kernel restart, regardless of the NumPy / TensorFlow seeds.
# key=str keeps the sort well-defined if the column holds non-string values.
words = sorted(set(df['word'].values), key=str)
words.append('PADDING')
num_words = len(words)

# Distinct tag labels; num_tags is the size of the one-hot / softmax axis.
tags = sorted(set(df['tag'].values), key=str)
num_tags = len(tags)

In [None]:
# Order the entity tags by their third character first (the entity-type
# initial, e.g. the 'L' of 'B-LOCATION') and by their first character second
# (the B / I / L / U prefix), then place 'O' at the end of the list.
entity_tags = [t for t in tags if t != 'O']
entity_tags.sort(key=lambda tag: (tag[2], tag[0]))
tags = entity_tags + ['O']

In [None]:
# Lookup tables mapping each word / tag to its position in `words` / `tags`.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
# Show the tag -> index mapping.
tag2idx

{'B-LOCATION': 0,
 'B-ORGANIZATION': 4,
 'B-PERSON': 8,
 'B-QUANTITY': 12,
 'B-TIME': 16,
 'I-LOCATION': 1,
 'I-ORGANIZATION': 5,
 'I-PERSON': 9,
 'I-QUANTITY': 13,
 'I-TIME': 17,
 'L-LOCATION': 2,
 'L-ORGANIZATION': 6,
 'L-PERSON': 10,
 'L-QUANTITY': 14,
 'L-TIME': 18,
 'O': 20,
 'U-LOCATION': 3,
 'U-ORGANIZATION': 7,
 'U-PERSON': 11,
 'U-QUANTITY': 15,
 'U-TIME': 19}

### Pad Sequence

In [None]:
max_len = 40  # maxlen handed to pad_sequences for both inputs and targets

# Word ids per sentence, post-padded with the index of the last vocabulary
# entry (the 'PADDING' word appended to `words`).
word_id_seqs = [[word2idx[word] for word, _ in sent] for sent in sentences]
X = pad_sequences(sequences=word_id_seqs, maxlen=max_len, padding='post', value=num_words - 1)

# Tag ids per sentence, post-padded with the 'O' tag, then one-hot encoded
# sentence by sentence (y stays a list of 2-D arrays).
tag_id_seqs = [[tag2idx[tag] for _, tag in sent] for sent in sentences]
y = pad_sequences(sequences=tag_id_seqs, maxlen=max_len, padding='post', value=tag2idx['O'])
y = [to_categorical(row, num_classes=num_tags) for row in y]

In [None]:
# Hold out 30 % of the sentences, then halve that hold-out into validation and
# test sets; both splits use random_state=1.
x_train, x_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test, y_val_test, test_size=0.5, random_state=1
)

### Pre-trained Word2Vec Embedding

In [None]:
# Load the pre-trained Word2Vec model from the local checkpoint; the
# embedding-matrix cell reads its `.wv` vectors.
# NOTE(review): gensim's load unpickles the file — only load checkpoints from
# a trusted source.
model = gensim.models.Word2Vec.load("../checkpoint/w2vec_wiki_id_case")

In [None]:
# Build the (vocabulary size x embedding dim) matrix used to initialise the
# Embedding layer: row i holds the pre-trained vector of words[i] when the
# Word2Vec model knows the word, otherwise a random vector drawn from
# N(0, 0.25) (standard deviation sqrt(0.25)).
#
# The width comes from the loaded model rather than a hard-coded 400, and the
# vocabulary lookup is an explicit membership test rather than a bare `except:`.
# The bare except also swallowed shape mismatches and KeyboardInterrupt, so a
# model of a different dimensionality would silently produce an all-random matrix.
embedding_dim = model.wv.vector_size
embedding_matrix = np.zeros((len(words), embedding_dim))

for i, w in enumerate(words):
    # Non-string entries (e.g. NaN read from the CSV) are treated as unknown.
    if isinstance(w, str) and w in model.wv:
        embedding_matrix[i] = model.wv[w]
    else:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embedding_dim)

### Initial parameters

In [None]:
def f1_score_micro(y_true, y_pred):
    """Micro-averaged F1 over entity tokens, for use as an eager Keras metric.

    Args:
        y_true: tensor of one-hot targets; the last axis indexes the tags.
        y_pred: tensor of predicted tag scores with the same layout.

    Both tensors are read with `.numpy()`, so the model has to run eagerly.

    Positions whose true tag is 'O' are dropped before scoring. Target
    sequences are padded with 'O', so padding is dropped as well. This only
    affects the metric; the training loss still sees every position.

    Returns:
        The micro-averaged F1 score as a float, or 0.0 when the batch holds
        no entity tokens at all.
    """
    # Look the index up instead of hard-coding 20, so the metric stays
    # correct if the tag set (and therefore the index of 'O') changes.
    outside_idx = tag2idx['O']

    true_ids = np.argmax(y_true.numpy(), axis=-1).reshape(-1)
    pred_ids = np.argmax(y_pred.numpy(), axis=-1).reshape(-1)

    # Keep only the positions whose gold label is an entity tag.
    is_entity = true_ids != outside_idx
    if not is_entity.any():
        # Nothing to score; avoid handing empty arrays to sklearn.
        return 0.0

    return f1_score(true_ids[is_entity], pred_ids[is_entity], average='micro')

In [None]:
def create_model(x_train, y_train, x_val, y_val, params):
    """Build, compile and train one BiLSTM + multi-width CNN tagger for Talos.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    bidirectional LSTM -> five parallel Conv1D branches (kernel widths
    2, 4, 6, 8, 10) concatenated -> per-token softmax over the tags.

    Args:
        x_train, y_train: training inputs and one-hot targets passed to `fit`.
        x_val, y_val: validation inputs and one-hot targets passed to `fit`.
        params: dict with the keys 'dropout', 'lstm_units', 'filters',
            'optimizer' ('Nadam', 'Adam' or 'RMSprop'), 'lr' and 'batch_size'.

    Returns:
        (history, model): the Keras History object and the trained model.

    Raises:
        ValueError: if params['optimizer'] is not one of the supported names
            (previously this surfaced later as an unbound-variable error).
    """
    optimizer_classes = {
        'Nadam': tf.keras.optimizers.Nadam,
        'Adam': tf.keras.optimizers.Adam,
        'RMSprop': tf.keras.optimizers.RMSprop,
    }
    if params['optimizer'] not in optimizer_classes:
        raise ValueError(
            "Unsupported optimizer %r; expected one of %s"
            % (params['optimizer'], sorted(optimizer_classes))
        )

    input_sequence = Input(shape=(max_len, ))

    # Embedding initialised from the pre-trained matrix and kept frozen.
    embedded = Embedding(
        input_dim=embedding_matrix.shape[0],
        weights=[embedding_matrix],
        output_dim=embedding_matrix.shape[1],
        input_length=max_len,
        trainable=False
    )(input_sequence)

    # dropout layer
    embedded = SpatialDropout1D(params['dropout'])(embedded)

    # lstm layer
    recurrent = Bidirectional(LSTM(units=params['lstm_units'], return_sequences=True))(embedded)

    # convolution layers: one branch per kernel width, all reading the LSTM
    # output; padding='same' keeps the sequence length so they can be concatenated
    conv_branches = [
        Conv1D(params['filters'], kernel_width, activation='relu', padding='same')(recurrent)
        for kernel_width in (2, 4, 6, 8, 10)
    ]
    features = concatenate(conv_branches)

    # output layer
    output_sequence = Dense(num_tags, activation='softmax')(features)

    model = Model(input_sequence, output_sequence)

    # learning algorithm (optimizer); `learning_rate` replaces the deprecated
    # `lr` keyword
    optm = optimizer_classes[params['optimizer']](learning_rate=params['lr'])

    loss = tf.keras.losses.CategoricalCrossentropy()

    # run_eagerly=True because f1_score_micro converts tensors with .numpy()
    model.compile(loss=loss, optimizer=optm, metrics=[f1_score_micro], run_eagerly=True)

    # NOTE(review): restore_best_weights is not set, so the returned model
    # carries the weights of the last epoch run rather than the best-scoring
    # one — confirm that is intended.
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_f1_score_micro', mode='max')

    history = model.fit(
        x_train, np.array(y_train),
        validation_data=(x_val, np.array(y_val)),
        epochs=100, verbose=1, callbacks=[early_stopping], batch_size=params['batch_size']
    )

    return history, model

In [None]:
# Search space handed to ta.Scan: each key maps to the list of values to try.
# Only 'dropout' holds several values here; every other key is fixed to one.
params = {
    'optimizer': ['Adam'],
    'lr': [0.01],
    'filters': [64],
    'lstm_units': [100],
    'dropout': [0.1, 0.3, 0.5, 0.7],
    'batch_size': [128]
}

In [None]:
# Run the Talos scan over `params` with create_model as the model function.
# The validation set is passed explicitly through x_val / y_val, with
# val_split=None.
t = ta.Scan(x=x_train,
    y=np.array(y_train),
    x_val=x_val,
    y_val=np.array(y_val),
    model=create_model,
    params=params,
    experiment_name='bilstm_cnns_w2v_opt_v3', 
    val_split=None
)


  0%|          | 0/4 [00:00<?, ?it/s][A

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100



 25%|██▌       | 1/4 [01:07<03:22, 67.49s/it][A

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100



 50%|█████     | 2/4 [01:37<01:52, 56.22s/it][A

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100



 75%|███████▌  | 3/4 [02:10<00:49, 49.23s/it][A

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100



100%|██████████| 4/4 [02:53<00:00, 43.44s/it]


In [None]:
# round 4 — dropout sweep, sorted by validation F1 (best first)
(
    t.data[['loss', 'f1_score_micro', 'val_loss', 'val_f1_score_micro', 'dropout']]
    .sort_values(by=['val_f1_score_micro'], ascending=False)
)

Unnamed: 0,loss,f1_score_micro,val_loss,val_f1_score_micro,dropout
3,0.048002,0.863403,0.137955,0.76346,0.7
1,0.040812,0.891935,0.135959,0.736122,0.3
2,0.033616,0.907585,0.144163,0.724609,0.5
0,0.006004,0.986169,0.270796,0.702142,0.1


In [None]:
# round 3 — filters sweep, sorted by validation F1 (best first)
# NOTE(review): the saved output lists filters 32-256, which the current
# `params` grid does not contain, so it presumably comes from an earlier Scan
# run; re-running this cell against the current `t` will not reproduce it.
(
    t.data[['loss', 'f1_score_micro', 'val_loss', 'val_f1_score_micro', 'filters']]
    .sort_values(by=['val_f1_score_micro'], ascending=False)
)

Unnamed: 0,loss,f1_score_micro,val_loss,val_f1_score_micro,filters
1,0.021418,0.942006,0.165591,0.732211,64
0,0.016788,0.95494,0.182246,0.724843,32
2,0.03437,0.907809,0.139293,0.712943,128
3,1.342312,0.0,1.307674,0.0,256


In [None]:
# round 2 — lstm_units sweep, sorted by validation F1 (best first)
# NOTE(review): the saved output lists lstm_units 50/100/200, which the current
# `params` grid does not contain, so it presumably comes from an earlier Scan
# run; re-running this cell against the current `t` will not reproduce it.
(
    t.data[['loss', 'f1_score_micro', 'val_loss', 'val_f1_score_micro', 'lstm_units']]
    .sort_values(by=['val_f1_score_micro'], ascending=False)
)

Unnamed: 0,loss,f1_score_micro,val_loss,val_f1_score_micro,lstm_units
1,0.02347,0.936328,0.154423,0.743864,100
0,0.018684,0.951146,0.180076,0.721642,50
2,0.024138,0.935081,0.160149,0.715991,200


In [None]:
# round 1 — optimizer x learning-rate sweep, sorted by validation F1 (best first)
# NOTE(review): the saved output lists three optimizers and three learning
# rates, which the current `params` grid does not contain, so it presumably
# comes from an earlier Scan run; re-running this cell against the current `t`
# will not reproduce it.
(
    t.data[['loss', 'f1_score_micro', 'val_loss', 'val_f1_score_micro', 'optimizer', 'lr']]
    .sort_values(by=['val_f1_score_micro'], ascending=False)
)

Unnamed: 0,loss,f1_score_micro,val_loss,val_f1_score_micro,optimizer,lr
1,0.033131,0.909018,0.155255,0.749481,Adam,0.01
2,0.018617,0.959671,0.242784,0.725582,RMSprop,0.01
0,0.01427,0.961801,0.200763,0.722709,Nadam,0.01
4,0.053579,0.844449,0.143233,0.680638,Adam,0.001
3,0.05252,0.849761,0.139515,0.665578,Nadam,0.001
5,0.062011,0.819389,0.13663,0.652294,RMSprop,0.001
8,0.128877,0.603348,0.149714,0.567691,RMSprop,0.0001
7,0.138077,0.576162,0.153528,0.530675,Adam,0.0001
6,0.140273,0.570369,0.157246,0.529732,Nadam,0.0001
