### Package Preparation

In [1]:
import tensorflow as tf
from tensorflow.keras import backend as K
import numpy as np
import h5py
import os
import datetime
from random import shuffle
from functools import reduce
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/gdrive')
# !pip install tensorflow-gpu
# %load_ext tensorboard

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


### Configurations

In [0]:
# Input geometry: the data generator crops every document to at most
# `max_seg` segments and every segment to at most `max_word` word indices.
max_seg = 10
max_word = 18
# Number of classes produced by `__label_map` and predicted by the model.
level_class_cnt = 3
# Fraction of the stored mini-batches that is held out for testing.
test_percentage = 0.2

# Rate passed to the Dropout layers of the attention and classifier units.
dropout_rate = 0.5
# Multiplier of the L2 regularizer terms.
eta = 1e-4
# Number of Conv1D filters per kernel height in the segment encoder.
hidden_feature_dim = 100
# Width of the tanh Dense layer inside the attention unit.
attention_key_dim = 100
# Units of each direction of the bidirectional GRU.
gru_feature_dim = 50
# Convolution kernel sizes used side by side by the segment encoder.
kernel_heights = [3, 4, 5]

# Number of samples per yielded batch, and number of passes over the data.
batch_size = 256
epochs = 10

# HDF5 file read through the `document/<index>` and `label/<index>` keys.
input_path = '/content/gdrive/My Drive/data_source/milnet/model_inputs/electronics.hdf5'
# Pre-trained word-vector matrix loaded into the frozen Embedding layer.
w2v_weights_path = '/content/gdrive/My Drive/data_source/milnet/model_inputs/fasttext_weights.npy'
# tensorboard_log_dir = '/Users/Frost/Desktop/log/'

# Destination for the trained model (the `model.save` call is currently commented out).
model_out_path = '/content/gdrive/My Drive/data_source/milnet/results/model_stop_w2v_35_11_edu.h5'

# Count the stored mini-batches and the total number of samples, then split
# the mini-batch indices randomly into a training part and a test part.
sample_amount = 0
mini_batch_cnt = 0
# Open read-only explicitly: older h5py releases default to read/write mode,
# which could create or modify the dataset file by accident.
with h5py.File(input_path, 'r') as in_file:
    for index in range(len(in_file['label/'].keys())):
        mini_batch_cnt += 1
        sample_amount += len(in_file['label/' + str(index)])
batch_indices = [*range(mini_batch_cnt)]
shuffle(batch_indices)
# Compute the boundary once so both halves are guaranteed to use the same cut.
train_batch_cnt = int(mini_batch_cnt * (1 - test_percentage))
train_batches = batch_indices[:train_batch_cnt]
test_batches = batch_indices[train_batch_cnt:]

### Data Preloading

In [0]:
# Load the pre-trained word-vector matrix used as frozen Embedding weights.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code; only keep it if this file comes from a trusted source.
w2v = np.load(w2v_weights_path, allow_pickle=True)
# Size of the second axis of the matrix; used as the Embedding output dimension.
w2v_len = w2v.shape[1]

### Data Loading Methods

In [0]:
def __label_map(raw_label):
    ''' Bucket a raw label into one of three class ids.

    Args:
        raw_label (number): The raw label value.

    Returns:
        (int): 0 when `raw_label` is below 2, 1 when it is below 3, 2 otherwise.
    '''
    # Exclusive upper bounds of class 0 and class 1, checked in ascending order.
    for class_id, upper_bound in enumerate((2, 3)):
        if raw_label < upper_bound:
            return class_id
    return 2

def __balance_data(feature_array, label_array):
    ''' Randomly drop half of the samples whose label is not 1.

    Args:
        feature_array (ndarray): The features, one sample per row (axis 0).
        label_array (ndarray): The labels, aligned with `feature_array` on
            axis 0.

    Returns:
        (ndarray, ndarray): Copies of the features and labels with the same
            randomly chosen rows removed from both.
    '''
    to_balance_indices = np.where(label_array != 1)[0]
    removal_count = to_balance_indices.shape[0] // 2
    # replace=False is required: the default samples with replacement, so the
    # drawn indices may repeat and fewer than `removal_count` rows get deleted.
    removal_indices = np.random.choice(to_balance_indices, removal_count, replace=False)
    balanced_features = np.delete(feature_array, removal_indices, axis=0)
    balanced_labels = np.delete(label_array, removal_indices, axis=0)
    return balanced_features, balanced_labels

def data_generator(batch_indices, max_seg=max_seg, max_word=max_word, epochs=epochs, use_balance=True):
    ''' Yield fixed-size batches of (features, labels) read from the input file.

    The stored mini-batches listed in `batch_indices` are visited in a fresh
    random order on every epoch, and the documents inside each mini-batch are
    visited in random order too. Samples are accumulated until `batch_size`
    of them are collected; a partially filled batch is carried over to the
    next mini-batch / epoch, and whatever is left when the last epoch ends is
    not yielded.

    Args:
        batch_indices ([int]): Indices of the stored mini-batches to read.
            The list is not modified.
        max_seg (int): Number of segments kept per document.
        max_word (int): Number of words kept per segment.
        epochs (int): Number of passes over `batch_indices`.
        use_balance (bool): Whether to pass every batch through
            `__balance_data` before yielding it.

    Yields:
        (ndarray, ndarray): Features built from a `(batch_size, max_seg,
            max_word)` array and labels built from a `(batch_size, 1)` array;
            with `use_balance` some rows are removed from both.
    '''
    # Shuffle a private copy so the caller's list keeps its order.
    visit_order = list(batch_indices)
    # Read-only: the generator must never create or modify the dataset file.
    with h5py.File(input_path, 'r') as in_file:
        feature_array, label_array = np.zeros((batch_size, max_seg, max_word)), np.zeros((batch_size, 1))
        batch_index = 0
        for _ in range(epochs):
            shuffle(visit_order)
            for index in visit_order:
                doc, label = in_file['document/' + str(index)], in_file['label/' + str(index)]
                random_doc_order = [*range(len(doc))]
                shuffle(random_doc_order)
                for i in random_doc_order:
                    feature_array[batch_index] = doc[i][:max_seg, :max_word]
                    label_array[batch_index] = __label_map(label[i])
                    batch_index += 1
                    if batch_index == batch_size:
                        if use_balance:
                            yield __balance_data(feature_array, label_array)
                        else:
                            yield feature_array, label_array
                        batch_index = 0
                        # Allocate fresh buffers so a batch already handed to
                        # the consumer is never overwritten.
                        feature_array, label_array = np.zeros((batch_size, max_seg, max_word)), np.zeros((batch_size, 1))

### Methods Definition

In [0]:
def __get_filter_layer(total_dim, target_dim, index):
    ''' Build a layer that picks one slice along a single dimension.

    The layer keeps only position `index` of dimension `target_dim` of its
    input tensor (which has `total_dim` dimensions) and then squeezes that
    dimension away.

    Args:
        total_dim (int): The total number of dimensions of the input tensor.
        target_dim (int): The index of the dimension to slice.
        index (int): The position to keep along `target_dim`.

    Returns:
        (Model): A keras model that implements the operation.
    '''
    def tensor_filter(tensor_in):
        # Start at `index` on the target dimension and at 0 everywhere else.
        begin = [index if axis == target_dim else 0 for axis in range(total_dim)]
        # Take one element on the target dimension and everything (-1) elsewhere.
        size = [1 if axis == target_dim else -1 for axis in range(total_dim)]
        sliced = tf.slice(tensor_in, begin, size)
        return tf.squeeze(sliced, axis=target_dim)

    filter_layer = tf.keras.layers.Lambda(tensor_filter)
    return tf.keras.models.Sequential([filter_layer])


def get_branch_model(input_shape, branch_index, output_shape, submodel, args=None):
    ''' Apply one shared `submodel` instance to every slice of a tensor.

    The input tensor is cut into slices along dimension `branch_index` with
    `__get_filter_layer`. A single instance of the submodel is applied to
    every slice (so all slices share weights), and the results are stacked
    back together along axis 1.

    Args:
        input_shape tuple(int): The shape of the input tensor, without the
            sample dimension.
        branch_index (int): The index of the dimension to slice, counting the
            sample dimension as 0.
        output_shape tuple(int): The shape of the output tensor. Currently
            unused; kept so existing callers keep working.
        submodel (callable): Factory returning the model to apply to the
            slices.
        args (dict): Keyword arguments for `submodel`. Defaults to no
            arguments.

    Returns:
        (Model): A keras model that implements the operation.
    '''
    # Avoid a shared mutable default argument.
    submodel_args = {} if args is None else args
    model_input = tf.keras.Input(input_shape)
    # `input_shape` has no sample dimension, hence the `- 1`.
    branch_cnt = input_shape[branch_index - 1]
    sliced_inputs = [__get_filter_layer(len(input_shape) + 1, branch_index, i)(model_input)
                     for i in range(branch_cnt)]
    sub_instance = submodel(**submodel_args)
    branch_outputs = [sub_instance(sliced_input) for sliced_input in sliced_inputs]
    expand_layer = tf.keras.layers.Lambda(lambda x: tf.keras.backend.expand_dims(x, axis=1))
    # Expand every branch output. The previous code iterated over
    # `range(input_shape[0])`, which only matches the number of branches
    # when `branch_index` is 1.
    expanded_outputs = [expand_layer(branch_output) for branch_output in branch_outputs]
    concated_layer = tf.keras.layers.Concatenate(axis=1)(expanded_outputs)
    return tf.keras.Model(model_input, concated_layer)


def __get_sentence_encode_unit(input_shape, hidden_feature_dim, kernel_height, eta):
    ''' Build a CNN unit that encodes a segment with one kernel height.

    The unit convolves its input, applies batch normalization and ReLU, and
    then max-pools with a window as long as the convolved sequence
    (max-over-time pooling).

    Args:
        input_shape ((int, int)): The shape of segment matrix. (word_max, w2v_len)
        hidden_feature_dim (int): The number of convolution filters.
        kernel_height (int): The height of the convolution kernel.
        eta (float): The multiplier of the L2 regularizer term.

    Returns:
        (Model): The CNN model to encode the segment matrix.
    '''
    layers = tf.keras.layers
    # Pool window: sequence length minus kernel height plus one.
    pool_size = input_shape[0] - kernel_height + 1
    convolution = layers.Conv1D(
        filters=hidden_feature_dim,
        kernel_size=kernel_height,
        kernel_regularizer=tf.keras.regularizers.l2(eta)
    )
    normalization = layers.BatchNormalization()
    activation = layers.ReLU()
    pooling = layers.MaxPool1D(pool_size)
    return tf.keras.models.Sequential([convolution, normalization, activation, pooling])


def __get_multi_kernel_encode_unit(input_shape, hidden_feature_dim, kernel_heights, eta):
    ''' Build a CNN unit that encodes a segment with several kernel heights.

    One `__get_sentence_encode_unit` is built per kernel height and applied to
    the same input; the results are concatenated and flattened.

    Args:
        input_shape ((int, int)): The shape of the segment matrix. (word_max, w2v_len)
        hidden_feature_dim (int): The number of convolution filters per kernel height.
        kernel_heights ([int]): The list of the kernel heights.
        eta (float): The multiplier of the L2 regularizer term.

    Returns:
        (Model): The CNN model to encode the segment matrix.
    '''
    model_input = tf.keras.Input(input_shape)
    per_kernel_features = []
    for kernel_height in kernel_heights:
        encoder = __get_sentence_encode_unit(input_shape, hidden_feature_dim, kernel_height, eta)
        per_kernel_features.append(encoder(model_input))
    merged_features = tf.keras.layers.Concatenate()(per_kernel_features)
    flat_features = tf.keras.layers.Flatten()(merged_features)
    return tf.keras.Model(model_input, flat_features)


def __get_seg_classifier_unit(class_cnt, dropout_rate, eta):
    ''' Build the softmax linear classifier that predicts segment sentiment.

    Args:
        class_cnt (int): Number of classes in the classification.
        dropout_rate (float): The drop out rate of the drop out layer.
        eta (float): The multiplier of the L2 regularizer term.

    Returns:
        (Model): Dropout followed by a softmax Dense layer with `class_cnt`
            units, L2-regularized on both kernel and bias.
    '''
    l2 = tf.keras.regularizers.l2
    dropout = tf.keras.layers.Dropout(dropout_rate)
    classifier = tf.keras.layers.Dense(
        units=class_cnt,
        activation='softmax',
        kernel_regularizer=l2(eta),
        bias_regularizer=l2(eta)
    )
    return tf.keras.models.Sequential([dropout, classifier])


def __get_attention_unit(attention_key_dim, dropout_rate, eta):
    ''' Build the unit that scores one segment from its hidden feature.

    The unit is applied to a single segment at a time and outputs one
    attention weight in (0, 1) for it.

    Args:
        attention_key_dim (int): Width of the tanh projection layer.
        dropout_rate (float): The drop out rate of the drop out layer.
        eta (float): The multiplier of the L2 regularizer term.

    Returns:
        (Model): The model for predicting the attention weight of a segment.
    '''
    return tf.keras.models.Sequential([
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(
            units=attention_key_dim,
            activation='tanh',
            kernel_regularizer=tf.keras.regularizers.l2(eta),
            bias_regularizer=tf.keras.regularizers.l2(eta)
        ),
        # The score layer previously used a softmax activation. A softmax over
        # a single unit is identically 1.0, so every segment got the same
        # weight and no gradient reached this unit. A sigmoid keeps the weight
        # bounded while letting it depend on the input.
        # NOTE(review): this unit sees one segment at a time, so a softmax
        # across segments cannot be expressed here; it would have to be applied
        # to the stacked scores instead -- confirm which variant is wanted.
        # The former `bias_regularizer` was dropped: with `use_bias=False`
        # there is no bias to regularize.
        tf.keras.layers.Dense(
            units=1,
            use_bias=False,
            activation='sigmoid'
        )
    ])


def get_bidirectional_gru_model(gru_feature_dim):
    ''' Build a bidirectional-GRU unit that extracts the hidden vectors.

    The hidden vectors are used to predict the attention weights of the model.
    The GRU returns its output for every step of the input sequence.

    Args:
        gru_feature_dim (int): The output dimension of the GRU layer.

    Returns:
        (Model): The bidirectional-GRU unit to predict the hidden vectors.
    '''
    recurrent_layer = tf.keras.layers.GRU(gru_feature_dim, return_sequences=True)
    bidirectional_layer = tf.keras.layers.Bidirectional(recurrent_layer)
    return tf.keras.models.Sequential([bidirectional_layer])

def get_merge_model(class_cnt, eta):
    ''' Build the model that merges attention weights and instance predictions.

    The two inputs are combined with a dot product over axis 1 (a weighted
    sum), the result is flattened and then passed through a softmax Dense
    layer.

    Args:
        class_cnt (int): Number of classes in the classification.
        eta (float): The multiplier of the L2 regularizer term.

    Returns:
        (Model): A keras model taking the list `[attentions, predictions]`.
    '''
    l2 = tf.keras.regularizers.l2
    weighted_sum = tf.keras.layers.Dot(axes=1)
    flatten = tf.keras.layers.Flatten()
    output_layer = tf.keras.layers.Dense(
        units=class_cnt,
        activation='softmax',
        kernel_regularizer=l2(eta),
        bias_regularizer=l2(eta)
    )
    return tf.keras.models.Sequential([weighted_sum, flatten, output_layer])
    

def performance_judge(model, generator, class_cnt):
    ''' Print the accuracy and the per-class precision, recall and F1 score.

    All batches are accumulated into one contingency table (rows are true
    classes, columns are predicted classes) and the metrics are computed once
    from that table. Averaging per-batch metrics instead would count a class
    that is absent from a batch as a score of 0 and would weight batches of
    different sizes equally.

    Args:
        model (Compiled Model): A tf.keras model for prediction; only its
            `predict` method is used.
        generator (generator): A generator of `(features, labels)` batches of
            test data; the class id is read from the first column of `labels`.
        class_cnt (int): Number of classes of the label.

    Returns:
        None. The metrics are printed, rounded to 3 decimals.
    '''
    # Tiny constant that keeps the divisions defined for classes never seen.
    eps = np.finfo(float).eps
    contingency_table = np.zeros((class_cnt, class_cnt))
    for features, labels in generator:
        sample_cnt = features.shape[0]
        predicted = np.asarray(model.predict(features)).reshape(sample_cnt, -1)
        predicted_classes = predicted.argmax(axis=1)
        true_classes = np.asarray(labels).reshape(sample_cnt, -1)[:, 0].astype(int)
        # np.add.at accumulates correctly when the same cell occurs repeatedly.
        np.add.at(contingency_table, (true_classes, predicted_classes), 1)

    total = contingency_table.sum()
    if total == 0:
        # An empty generator used to end in a ZeroDivisionError.
        print('No samples to evaluate.')
        return

    hits = np.diag(contingency_table)
    precisions = hits / (contingency_table.sum(axis=0) + eps)
    recalls = hits / (contingency_table.sum(axis=1) + eps)
    f1s = 2 * precisions * recalls / (precisions + recalls + eps)

    print('Accuracy:', round(float(hits.sum() / total), 3))
    for index in range(class_cnt):
        print('_____ Class', index, '_____')
        print('Precision\t', round(float(precisions[index]), 3))
        print('Recall\t\t', round(float(recalls[index]), 3))
        print('F1 Score\t', round(float(f1s[index]), 3))

### Model Construction and Compile

In [6]:
# Build the full network: embedding -> per-segment CNN encoder -> two heads
# (bidirectional GRU + attention, and per-segment classifier) -> merge.
print('Constructing Model ...', end='')

# One document per sample: `max_seg` segments of `max_word` word indices.
model_input = tf.keras.Input((max_seg, max_word))

# Frozen embedding initialised from the preloaded word-vector matrix.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=w2v.shape[0], 
    output_dim=w2v_len, 
    weights=[w2v], 
    input_length=max_word, 
    trainable=False
)(model_input)

# Encode every segment with the same multi-kernel CNN (sliced along axis 1).
encoding_model = get_branch_model(
    input_shape=(max_seg, max_word, w2v_len), 
    branch_index=1, 
    output_shape=(max_seg, len(kernel_heights) * hidden_feature_dim), 
    submodel=__get_multi_kernel_encode_unit, 
    args={
        'input_shape': (max_word, w2v_len), 
        'hidden_feature_dim': hidden_feature_dim,
        'kernel_heights': kernel_heights, 
        'eta': eta
    }
)(embedding_layer)

# Bidirectional GRU over the sequence of segment encodings
# (the variable is spelled `biglu_model`).
biglu_model = get_bidirectional_gru_model(
    gru_feature_dim=gru_feature_dim
)(encoding_model)

# Attention head: one weight per segment, computed from the GRU outputs.
attention_model = get_branch_model(
    input_shape=(max_seg, 2 * gru_feature_dim), 
    branch_index=1, 
    output_shape=(max_seg, 1), 
    submodel=__get_attention_unit,
    args={
        'attention_key_dim': attention_key_dim,
        'dropout_rate': dropout_rate,
        'eta': eta
    }
)(biglu_model)

# Classification head: one class distribution per segment, computed directly
# from the CNN encodings (not from the GRU outputs).
classification_model = get_branch_model(
    input_shape=(max_seg, len(kernel_heights) * hidden_feature_dim), 
    branch_index=1, 
    output_shape=(max_seg, level_class_cnt), 
    submodel=__get_seg_classifier_unit,
    args={
        'class_cnt': level_class_cnt,
        'dropout_rate': dropout_rate,
        'eta': eta
    }
)(encoding_model)

# Combine the attention weights with the per-segment predictions into the
# document-level prediction.
merge_model = get_merge_model(
    class_cnt=level_class_cnt,
    eta=eta
)([attention_model, classification_model])

model = tf.keras.Model(model_input, merge_model)

print('\rModel Constructed. Compiling ...', end='')

# Sparse loss: labels are integer class ids, not one-hot vectors.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

print('\rModel Compiled.')

model.summary()

W0711 18:54:15.965303 140647093032832 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Constructing Model ...

W0711 18:54:19.315662 140647093032832 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0711 18:54:23.511056 140647093032832 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0711 18:54:23.512325 140647093032832 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with

Model Compiled.
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 10, 18)]     0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 18, 300)  56619900    input_1[0][0]                    
__________________________________________________________________________________________________
model_1 (Model)                 (None, 10, 300)      361500      embedding[0][0]                  
__________________________________________________________________________________________________
sequential_13 (Sequential)      (None, 10, 100)      105300      model_1[1][0]                    
____________________________________________________________________________

### Training and Evaluation

In [7]:
# TensorBoard logs go to a fresh, timestamped directory for every run.
logdir = os.path.join('logs', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

# The step count is cast to int: the float multiplication makes the floor
# division return a float, while Keras expects an integer number of steps.
# One step is subtracted so the generator is never asked for more batches
# than it can produce.
model.fit_generator(
    data_generator(train_batches, use_balance=False),
    steps_per_epoch=int(sample_amount * (1 - test_percentage) // batch_size) - 1,
    epochs=epochs,
    callbacks=[tensorboard_callback]
)

# model.save(model_out_path)

# Evaluate with a single pass over each split (balanced batches by default).
print('########## Training Error ##########')
performance_judge(model, data_generator(train_batches, epochs=1), level_class_cnt)

print('############ Test Error ############')
performance_judge(model, data_generator(test_batches, epochs=1), level_class_cnt)

Epoch 1/10


W0711 18:55:20.470489 140647093032832 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


   1/1279 [..............................] - ETA: 4:59:01 - loss: 2.7637 - acc: 0.3945

W0711 18:55:33.770370 140647093032832 callbacks.py:257] Method (on_train_batch_end) is slow compared to the batch update (0.291299). Check your callbacks.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 171/1279 [===>..........................] - ETA: 2:55 - loss: 1.0557 - acc: 0.4157

KeyboardInterrupt: ignored