In [32]:
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa

tf.compat.v1.enable_eager_execution()

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Flatten, Dense, TimeDistributed, \
    SpatialDropout1D, Bidirectional, Conv1D, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

%matplotlib inline

In [81]:
class ConfusionMatrixMetric(tf.keras.metrics.Metric):
    def __init__(self, num_classes, **kwargs):
        super(ConfusionMatrixMetric,self).__init__(name='confusion_matrix_metric',**kwargs)
        self.num_classes=num_classes
        self.f1_score_micro = self.add_weight("total", shape=(num_classes, num_classes), initializer="zeros")
        
    def reset_states(self):
        for s in self.variables:
            s.assign(tf.zeros(shape=s.shape))
            
    def update_state(self, y_true, y_pred, sample_weight=None):
        self.f1_score_micro.assign_add(self.compute_f1_score_micro(y_true, y_pred))
        return self.f1_score_micro
        
    def result(self):
        return self.process_f1_score_micro()
    
    def compute_f1_score_micro(self, y_true, y_pred):
        y_true = np.argmax(y_true.numpy(), axis=-1).reshape(-1)
        y_pred = np.argmax(y_pred.numpy(), axis=-1).reshape(-1)
        
        o_pad_idx = np.where(y_true==20)
        
        y_true = np.delete(y_true, o_pad_idx)
        y_pred = np.delete(y_pred, o_pad_idx)
  
        f1s = f1_score(y_true, y_pred, average='micro')
        
        print(f1s)
        
        return f1s
    
    def process_f1_score_micro(self):
        return self.f1_score_micro
    
    def fill_output(self,output):
        results = self.result()
        return results

In [2]:
class SeqModel(tf.keras.Sequential):
    def train_step(self, data):
        x, y = data
        
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(
                y,
                y_pred,
                regularization_losses=self.losses,
            )

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.compiled_metrics.update_state(y, y_pred)
        output={m.name: m.result() for m in self.metrics[:-1]}
        
        if 'confusion_matrix_metric' in self.metrics_names:
            self.metrics[-1].fill_output(output)

        return output
        
        
    def test_step(self, data):
        x, y = data

        y_pred = self(x, training=False)
        loss = self.compiled_loss(
            y,
            y_pred,
            regularization_losses=self.losses,
        )

        self.compiled_metrics.update_state(y, y_pred)
        output={m.name: m.result() for m in self.metrics[:-1]}
        
        if 'confusion_matrix_metric' in self.metrics_names:
            self.metrics[-1].fill_output(output)    
        
        return output

In [3]:
df = pd.read_csv('../data/data.csv')

In [4]:
df.head()

Unnamed: 0,word,tag
0,Pengamat,O
1,politik,O
2,dari,O
3,Universitas,B-ORGANIZATION
4,Gadjah,I-ORGANIZATION


In [5]:
sentences = []
cnt = 1

for i in df.itertuples():
    sentences.append(cnt)
    
    if '.' in str(i.word):
        cnt += 1
        
df['sentence #'] = sentences
df.head()

Unnamed: 0,word,tag,sentence #
0,Pengamat,O,1
1,politik,O,1
2,dari,O,1
3,Universitas,B-ORGANIZATION,1
4,Gadjah,I-ORGANIZATION,1


In [6]:
agg_func = lambda s: [(w, t) for w, t in zip(s['word'].values.tolist(), s['tag'].values.tolist())]
grouped = df.groupby('sentence #').apply(agg_func)
sentences = [s for s in grouped]

In [7]:
words = list(set(df['word'].values))
words.append('PADDING')
num_words = len(words)
tags = list(set(df['tag'].values))
num_tags = len(tags)

In [8]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate([tag for tag in tags if tag != 'O'])}
tag2idx['O'] = len(tags)-1

In [9]:
tag2idx

{'L-ORGANIZATION': 0,
 'I-TIME': 1,
 'U-ORGANIZATION': 2,
 'L-LOCATION': 3,
 'U-PERSON': 4,
 'I-LOCATION': 5,
 'U-LOCATION': 6,
 'U-TIME': 7,
 'U-QUANTITY': 8,
 'L-QUANTITY': 9,
 'B-LOCATION': 10,
 'B-TIME': 11,
 'I-QUANTITY': 12,
 'L-PERSON': 13,
 'I-PERSON': 14,
 'I-ORGANIZATION': 15,
 'B-ORGANIZATION': 16,
 'L-TIME': 17,
 'B-QUANTITY': 18,
 'B-PERSON': 19,
 'O': 20}

In [10]:
num_words

13031

In [11]:
max_len = 60
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding='post', value=num_words-1)

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding='post', value=tag2idx['O'])
y = [to_categorical(i, num_classes=num_tags) for i in y]

In [12]:
np.array(y).shape

(4892, 60, 21)

In [13]:
X[0]

array([ 2319,   574,  1942,  4847,  5615,  2764, 12565,  2384,  1880,
       12565,  5882, 12565,  7341,  8932,  1785,  6126,  1397, 10136,
        4265,  1527, 11466,  7710,  7514,  8643,  2595,  9699, 12853,
        5469,  7959, 13030, 13030, 13030, 13030, 13030, 13030, 13030,
       13030, 13030, 13030, 13030, 13030, 13030, 13030, 13030, 13030,
       13030, 13030, 13030, 13030, 13030, 13030, 13030, 13030, 13030,
       13030, 13030, 13030, 13030, 13030, 13030], dtype=int32)

In [14]:
from collections import Counter

In [15]:
len(arr)

NameError: name 'arr' is not defined

In [None]:
64*60

In [16]:
len(arr.reshape(-1))

NameError: name 'arr' is not defined

In [17]:
arr = np.argmax(y[:64], axis=-1)
print(arr.shape)
Counter(arr.reshape(-1))

(64, 60)


Counter({20: 3554,
         16: 13,
         15: 21,
         0: 13,
         19: 23,
         13: 23,
         4: 26,
         2: 15,
         11: 6,
         1: 33,
         17: 6,
         6: 11,
         10: 6,
         5: 14,
         3: 6,
         14: 50,
         18: 10,
         9: 10})

In [18]:
idx = np.where(arr.reshape(-1)==20)
idx

(array([   0,    1,    2, ..., 3837, 3838, 3839]),)

In [19]:
np.delete(arr.reshape(-1), idx)

array([16, 15,  0, 19, 13, 16,  0, 19, 13,  4,  2,  4,  4,  2,  2,  4, 11,
        1,  1,  1,  1,  1,  1, 17,  4,  4,  2, 16, 15, 15,  0, 19, 13, 19,
       13,  4,  4,  6,  4,  4,  4,  4,  4,  4,  4,  2, 16, 15,  0,  4,  2,
        4,  2, 19, 13, 19, 13,  4, 19, 13, 10,  5,  5,  5,  5,  3, 11,  1,
        1,  1,  1,  1,  1, 17,  6, 19, 13, 16, 15,  0,  6, 19, 14, 14, 13,
       16,  0,  4,  6,  4, 16, 15, 15, 15, 15, 15, 15,  0, 10,  5,  5,  3,
       18,  9,  6, 19, 14, 13,  6, 19, 13,  6, 19, 14, 13,  6, 19, 13,  6,
       19, 14, 13, 16,  0, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13,
       10,  3,  2, 16,  0, 19, 13,  6, 11,  1,  1,  1,  1,  1,  1, 17, 16,
       15, 15, 15, 15, 15,  0, 16, 15,  0, 18,  9, 19, 14, 14, 14, 14, 14,
       14, 14, 14, 13, 18,  9, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13,
       18,  9, 19, 14, 14, 14, 14, 14, 13, 18,  9, 19, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 13, 18,  9,  4,  6,  2,  4,  4,  2,  4,
       19, 14, 13, 16,  0

In [20]:
len(np.delete(arr, idx))

286

In [21]:
x_train, x_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=1)

In [22]:
model = SeqModel([
    Input(shape=(max_len, )),
    Embedding(
        input_dim=num_words,
        output_dim=400,
        input_length=max_len,
    ),
    SpatialDropout1D(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    Dense(num_tags, activation='softmax')
])
model.summary()

Model: "seq_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 60, 400)           5212400   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 60, 400)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 60, 200)           400800    
_________________________________________________________________
dense (Dense)                (None, 60, 21)            4221      
Total params: 5,617,421
Trainable params: 5,617,421
Non-trainable params: 0
_________________________________________________________________


In [None]:
class ConfusionMatrixMetric(tf.keras.metrics.Metric):
    """
    A custom Keras metric to compute the running average of the confusion matrix
    """
    def __init__(self, num_classes, **kwargs):
        super(ConfusionMatrixMetric,self).__init__(name='confusion_matrix_metric',**kwargs) # handles base args (e.g., dtype)
        self.num_classes=num_classes
        self.total_cm = self.add_weight("total", shape=(num_classes,num_classes), initializer="zeros")
        
    def reset_states(self):
        for s in self.variables:
            s.assign(tf.zeros(shape=s.shape))
            
    def update_state(self, y_true, y_pred,sample_weight=None):
        self.total_cm.assign_add(self.confusion_matrix(y_true,y_pred))
        return self.total_cm
        
    def result(self):
        return self.process_confusion_matrix()
    
    def confusion_matrix(self,y_true, y_pred):
        """
        Make a confusion matrix
        """
        y_pred=tf.argmax(y_pred,1)
        cm=tf.math.confusion_matrix(y_true,y_pred,dtype=tf.float32,num_classes=self.num_classes)
        return cm
    
    def process_confusion_matrix(self):
        "returns precision, recall and f1 along with overall accuracy"
        cm=self.total_cm
        diag_part=tf.linalg.diag_part(cm)
        precision=diag_part/(tf.reduce_sum(cm,0)+tf.constant(1e-15))
        recall=diag_part/(tf.reduce_sum(cm,1)+tf.constant(1e-15))
        f1=2*precision*recall/(precision+recall+tf.constant(1e-15))
        return precision,recall,f1
    
    def fill_output(self,output):
        results=self.result()
        for i in range(self.num_classes):
            output['precision_{}'.format(i)]=results[0][i]
            output['recall_{}'.format(i)]=results[1][i]
            output['F1_{}'.format(i)]=results[2][i]
    


In [None]:
class BinaryTruePositives(tf.keras.metrics.Metric):

  def __init__(self, name='binary_true_positives', **kwargs):
    super(BinaryTruePositives, self).__init__(name=name, **kwargs)
    self.true_positives = self.add_weight(name='tp', initializer='zeros')

  def update_state(self, y_true, y_pred, sample_weight=None):
    y_true = tf.cast(y_true, tf.bool)
    y_pred = tf.cast(y_pred, tf.bool)

    values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
    values = tf.cast(values, self.dtype)
    if sample_weight is not None:
      sample_weight = tf.cast(sample_weight, self.dtype)
      values = tf.multiply(values, sample_weight)
    self.true_positives.assign_add(tf.reduce_sum(values))

  def result(self):
    return self.true_positives

  def reset_states(self):
    self.true_positives.assign(0)

In [122]:
class F1ScoreMetric(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score_metric', **kwargs):
        super(F1ScoreMetric, self).__init__(name=name, **kwargs)
        self.f1_score_micro = self.add_weight(name='f1_score_metric', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = np.argmax(y_true.numpy(), axis=-1).reshape(-1)
        y_pred = np.argmax(y_pred.numpy(), axis=-1).reshape(-1)

        o_pad_idx = np.where(y_true==20)

        y_true = np.delete(y_true, o_pad_idx)
        y_pred = np.delete(y_pred, o_pad_idx)

        score = f1_score(y_true, y_pred, average='micro')
        self.f1_score_micro.assign_add(score)

    def result(self):
        return self.f1_score_micro

    def reset_states(self):
        self.f1_score_micro.assign(0)

In [124]:
m = F1ScoreMetric()
m.update_state([0, 1, 1, 1], [0, 1, 0, 0])
print('Intermediate result:', float(m.result()))

m.update_state([1, 1, 1, 1], [0, 1, 1, 0])
print('Final result:', float(m.result()))

AttributeError: 'list' object has no attribute 'numpy'

In [None]:
# class F1ScoreMetric(tf.keras.metrics.Metric):
#     def __init__(self, num_classes, **kwargs):
#         super(ConfusionMatrixMetric,self).__init__(name='f1_score_metric', **kwargs)
#         self.num_classes=num_classes
#         self.total_cm = self.add_weight("total", shape=(num_classes,num_classes), initializer="zeros")
        
#     def reset_states(self):
#         for s in self.variables:
#             s.assign(tf.zeros(shape=s.shape))
            
#     def update_state(self, y_true, y_pred, sample_weight=None):
#         self.total_cm.assign_add(self.f1_score_micro(y_true,y_pred))
#         return self.total_cm
    
#     def result(self):
#         return self.process_confusion_matrix()
    
#     def f1_score_micro(self, y_true, y_pred):
#         y_true = np.argmax(y_true.numpy(), axis=-1).reshape(-1)
#         y_pred = np.argmax(y_pred.numpy(), axis=-1).reshape(-1)

#         o_pad_idx = np.where(y_true==20)

#         y_true = np.delete(y_true, o_pad_idx)
#         y_pred = np.delete(y_pred, o_pad_idx)

#         score = f1_score(y_true, y_pred, average='micro')

#         return score
    
    
    
    

In [33]:
input_sequence = Input(shape=(max_len, ))
model = Embedding(
        input_dim=num_words,
        output_dim=400,
        input_length=max_len,
)(input_sequence)
model = SpatialDropout1D(0.5)(model)
model2g = Conv1D(128, 2, activation='relu', padding='same')(model)
model4g = Conv1D(128, 4, activation='relu', padding='same')(model)
model6g = Conv1D(128, 6, activation='relu', padding='same')(model)
model8g = Conv1D(128, 8, activation='relu', padding='same')(model)
model10g = Conv1D(128, 10, activation='relu', padding='same')(model)
model = concatenate([model2g, model4g, model6g, model8g, model10g])
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.5))(model)
output_sequence = Dense(num_tags, activation='softmax')(model)
model = Model(input_sequence, output_sequence)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 60, 400)      5212400     input_3[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 60, 400)      0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 60, 128)      102528      spatial_dropout1d_2[0][0]        
______________________________________________________________________________________________

In [34]:
def f1_micro_metric(y_true, y_pred):
    y_true = np.argmax(y_true.numpy(), axis=-1).reshape(-1)
    y_pred = np.argmax(y_pred.numpy(), axis=-1).reshape(-1)

    o_pad_idx = np.where(y_true==20)

    y_true = np.delete(y_true, o_pad_idx)
    y_pred = np.delete(y_pred, o_pad_idx)

    score = f1_score(y_true, y_pred, average='micro')

    return score

In [35]:
loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

model.compile(
    loss=loss, 
    optimizer=optimizer, 
    metrics=[f1_micro_metric],
    run_eagerly=True,
    experimental_run_tf_function=False,
)

In [None]:
es = tf.keras.callbacks.EarlyStopping(patience=10, monitor='f1_micro_metric', mode='max')

history = model.fit(
    x_train, np.array(y_train),
    validation_data=(x_val, np.array(y_val)),
    epochs=1, verbose=1, callbacks=[es], batch_size=64
)

 4/54 [=>............................] - ETA: 47s - loss: 1.8398 - f1_micro_metric: 0.0070

In [134]:
history.history

{'loss': [0.07117744535207748], 'val_loss': [0.11728736758232117]}