In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
import prepare_data
from prepare_data import tokenizer, tokenize_if_small_enough
import read
from keras.utils import to_categorical
# Create the TensorFlow session that is later handed to initialize_vars().
sess = tf.Session()

# The BERT hub path and sequence length used below come from prepare_data
# (BERT_MODEL_HUB, MAX_SEQ_LENGTH); the commented-out values here are not used.
# bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# max_seq_length = 256

W0514 01:01:49.830568 140307046811392 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


Instructions for updating:
Colocations handled automatically by placer.


W0514 01:01:51.452780 140307046811392 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0514 01:01:53.491857 140307046811392 saver.py:1483] Saver not created because there are no variables in the graph to restore
Using TensorFlow backend.


# Data

First, we load the data and split it into training, validation, and test sets.

In [2]:
# Flags forwarded to tokenize_if_small_enough() when the features are built:
# `sentence` and `no_context` are passed positionally, `neeg_dataset` is passed
# as `is_neeg`, and `conceptnet` is passed as `conceptnet`.
sentence=True
no_context=False
neeg_dataset=False
conceptnet=True
# NOTE(review): this value is not read by the loading cell, which passes the
# literal input_size=100 instead — confirm which value is intended.
input_size=10000
# Directory that receives the per-epoch checkpoints and the TensorBoard logs.
logs = './logs'

In [3]:
# Read the raw extractions and turn them into tokenized feature objects.
# NOTE(review): input_size is the literal 100 here, not the `input_size`
# setting from the configuration cell — confirm which one is intended.
train_dataset = read.read_data_iterator('dataset/gw_extractions_enriched.pickle')
features = list(
    tokenize_if_small_enough(
        train_dataset,
        sentence,
        no_context,
        is_neeg=neeg_dataset,
        conceptnet=conceptnet,
        input_size=100
    )
)

# 80 / 10 / 10 split, taken in order; the test split absorbs rounding leftovers.
sample_size = len(features)
training_pct, val_pct, test_pct = 0.8, 0.1, 0.1
train_set_size = int(sample_size * training_pct)
val_set_size = int(sample_size * val_pct)
test_set_size = sample_size - train_set_size - val_set_size

val_start = train_set_size
test_start = val_start + val_set_size
train_features = features[:val_start]
val_features = features[val_start:test_start]
test_features = features[test_start:test_start + test_set_size]

# Tokenize

Next, collect the tokenized features into arrays: `input_ids`, `input_masks`, and `segment_ids`, plus the zero-padded word- and sentence-level vectors.

In [6]:
def pad_to_max_of_max(ls_ls):
    """Zero-pad a list of lists of 2-D arrays into one dense 4-D array.

    Every inner 2-D array is padded along its first axis to the longest first
    axis found anywhere in ``ls_ls``; every group is then padded along its
    own first axis to the size of the largest group.

    Args:
        ls_ls: Sequence of groups, each group a sequence of 2-D array-likes
            that share the same second-axis length.

    Returns:
        np.ndarray of shape (len(ls_ls), max group size, max rows, columns).
    """
    groups = [[np.array(item) for item in group] for group in ls_ls]
    longest_rows = max(max(arr.shape[0] for arr in group) for group in groups)

    padded_groups = []
    for group in groups:
        rows_padded = [
            np.pad(arr, ((0, longest_rows - arr.shape[0]), (0, 0)), mode='constant')
            for arr in group
        ]
        padded_groups.append(np.array(rows_padded))

    largest_group = max(block.shape[0] for block in padded_groups)
    return np.array([
        np.pad(block, ((0, largest_group - block.shape[0]), (0, 0), (0, 0)), mode='constant')
        for block in padded_groups
    ])
    
def pad_to_max(ls):
    """Zero-pad 2-D arrays along their first axis and stack them.

    Args:
        ls: Sequence of 2-D array-likes sharing the same second-axis length.

    Returns:
        np.ndarray of shape (len(ls), longest first axis, columns).
    """
    arrays = [np.array(item) for item in ls]
    longest = max(arr.shape[0] for arr in arrays)
    padded = []
    for arr in arrays:
        missing_rows = longest - arr.shape[0]
        padded.append(np.pad(arr, ((0, missing_rows), (0, 0)), mode='constant'))
    return np.array(padded)

In [7]:
def build_feature_arrays(feature_list, num_labels=5):
    """Turn a list of tokenized features into the arrays the model consumes.

    Args:
        feature_list: Feature objects exposing input_ids, input_mask,
            segment_ids, label_id, event_concept_vectors,
            candidate_concept_vectors, event_sentence_pos and
            event_sentence_dep.
        num_labels: Number of classes for the one-hot labels and size of the
            candidate axis of the sentence-candidate placeholder.

    Returns:
        Tuple ``(input_ids, input_masks, segment_ids, labels, word_events,
        word_candidates, sent_events, sent_candidates)``.
    """
    input_ids = np.array([f.input_ids for f in feature_list])
    input_masks = np.array([f.input_mask for f in feature_list])
    segment_ids = np.array([f.segment_ids for f in feature_list])
    # label_id is shifted down by one before one-hot encoding
    # (presumably the stored label ids are 1-based — confirm).
    labels = np.array([to_categorical(f.label_id - 1, num_classes=num_labels)
                       for f in feature_list])
    word_events = pad_to_max_of_max([f.event_concept_vectors for f in feature_list])
    word_candidates = pad_to_max_of_max([f.candidate_concept_vectors for f in feature_list])
    # One vector per event sentence: its POS vector followed by its dependency vector.
    sent_events = pad_to_max([[np.concatenate([f.event_sentence_pos[i], f.event_sentence_dep[i]])
                               for i in range(len(f.event_sentence_pos))]
                              for f in feature_list])
    # Candidate sentence vectors are an all-zero placeholder with the same
    # feature width as the event sentence vectors.
    sent_candidates = np.zeros((sent_events.shape[0], num_labels, 1, sent_events.shape[-1]))
    return (input_ids, input_masks, segment_ids, labels,
            word_events, word_candidates, sent_events, sent_candidates)


(train_input_ids, train_input_masks, train_segment_ids, train_labels,
 train_word_events, train_word_candidates,
 train_sent_events, train_sent_candidates) = build_feature_arrays(train_features)

(val_input_ids, val_input_masks, val_segment_ids, val_labels,
 val_word_events, val_word_candidates,
 val_sent_events, val_sent_candidates) = build_feature_arrays(val_features)

# Model Definition

In [8]:
class BertLayer(tf.layers.Layer):
    """Layer that wraps a TF-Hub BERT module and returns its pooled output.

    Args:
        n_fine_tune_layers: How many of the module's trailing variables (after
            the "/cls/" variables are dropped) are registered as trainable
            weights. ``0`` (or a negative value) registers none.
        **kwargs: Forwarded to the base layer constructor.
    """

    def __init__(self, n_fine_tune_layers=10, **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        # Width reported by compute_output_shape for the pooled output
        # (presumably the hub module's hidden size — confirm against the module).
        self.output_size = 768
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the hub module and sort its variables into trainable / frozen."""
        self.bert = hub.Module(
            prepare_data.BERT_MODEL_HUB,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )

        # Remove unused layers
        candidate_vars = [var for var in self.bert.variables if "/cls/" not in var.name]

        # Select how many layers to fine tune. A bare `[-n:]` slice would pick
        # *every* variable when n == 0 (because -0 == 0), which silently
        # unfreezes the whole module, so the zero case is handled explicitly.
        if self.n_fine_tune_layers > 0:
            trainable_vars = candidate_vars[-self.n_fine_tune_layers:]
        else:
            trainable_vars = []

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Everything else the module owns is tracked as frozen weights.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)
        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run the module on (input_ids, input_mask, segment_ids) and return "pooled_output".

        Args:
            inputs: Iterable of exactly three tensors, in the order input ids,
                input mask, segment ids. Each is cast to int32 first.
        """
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
            "pooled_output"
        ]
        return result

    def compute_output_shape(self, input_shape):
        """Return (input_shape[0], output_size)."""
        return (input_shape[0], self.output_size)

In [9]:
class MultiBertLayer(BertLayer):
    """BertLayer variant that runs the shared module once per parallel input group."""

    def call(self, inputs):
        """Apply BertLayer.call to each group produced by ``zip(*inputs)``.

        Args:
            inputs: Sequence of parallel sequences; position ``k`` of every
                inner sequence is gathered into one group and passed to
                BertLayer.call.

        Returns:
            List with one BertLayer.call result per group.
        """
        pooled_outputs = []
        for group in zip(*inputs):
            pooled_outputs.append(super().call(group))
        return pooled_outputs

    def compute_output_shape(self, input_shape):
        # Reports (input_shape[0], input_shape[1], output_size).
        # NOTE(review): call() returns a list of tensors, not one 3-D tensor —
        # confirm this shape is what the framework expects here.
        leading, second = input_shape[0], input_shape[1]
        return (leading, second, self.output_size)

In [10]:
# Build model
def build_model(max_seq_length, num_labels, word_vec_len=0, sentence_vec_len=0, num_sents=5):
    """Build and compile the candidate classifier.

    Each of the ``num_labels`` candidates is encoded by a shared
    MultiBertLayer. Optionally, LSTM encodings of word-level and
    sentence-level feature vectors are concatenated onto each candidate's
    BERT vector. All candidate vectors are then concatenated and fed through
    a 256-unit ReLU layer and a softmax over the candidates.

    Args:
        max_seq_length: Length of every BERT token sequence.
        num_labels: Number of candidates per example (softmax width).
        word_vec_len: Width of the per-word vectors; ``0`` disables the
            word-level inputs.
        sentence_vec_len: Width of the per-sentence vectors; ``0`` disables
            the sentence-level inputs.
        num_sents: Number of event sentences per example.

    Returns:
        The compiled Keras model. Its inputs are, in order: input_ids,
        input_masks, segment_ids, then (if ``word_vec_len``)
        input_candidates_words and input_event_words, then (if
        ``sentence_vec_len``) input_candidates_sentences and
        input_events_sentences.
    """

    def split_along_axis1(tensor, count):
        # The index is handed to the Lambda through `arguments` instead of
        # being captured from the enclosing loop, so every layer keeps its own
        # index even if its function is invoked again later.
        return [
            layers.Lambda(lambda x, index: x[:, index], arguments={'index': position})(tensor)
            for position in range(count)
        ]

    def add_step_axis(tensor):
        # (batch, features) -> (batch, 1, features) so vectors can be chained on axis 1.
        return layers.Lambda(lambda x: K.expand_dims(x, 1))(tensor)

    #Inputs
    in_ids = layers.Input(shape=(num_labels, max_seq_length), name="input_ids")
    in_masks = layers.Input(shape=(num_labels, max_seq_length), name="input_masks")
    in_segments = layers.Input(shape=(num_labels, max_seq_length), name="segment_ids")
    inputs = [in_ids, in_masks, in_segments]

    if word_vec_len:
        in_candidates_words = layers.Input(shape=(num_labels, None, word_vec_len), name="input_candidates_words")
        in_events_words = layers.Input(shape=(num_sents, None, word_vec_len), name="input_event_words")
        inputs.extend([in_candidates_words, in_events_words])

    if sentence_vec_len:
        in_candidates_sentences = layers.Input(shape=(num_labels, 1, sentence_vec_len), name="input_candidates_sentences")
        in_events_sentences = layers.Input(shape=(num_sents, sentence_vec_len), name="input_events_sentences")
        inputs.extend([in_candidates_sentences, in_events_sentences])

    #Split inputs if they should be operated on individually
    split_in_ids = split_along_axis1(in_ids, num_labels)
    split_in_masks = split_along_axis1(in_masks, num_labels)
    split_in_segments = split_along_axis1(in_segments, num_labels)

    if word_vec_len:
        #Split by candidate
        split_in_candidates_words = split_along_axis1(in_candidates_words, num_labels)
        #Split by sentence
        split_in_events_words = split_along_axis1(in_events_words, num_sents)

    if sentence_vec_len:
        split_in_candidates_sentences = split_along_axis1(in_candidates_sentences, num_labels)

    #Bert
    bert_outputs = MultiBertLayer(n_fine_tune_layers=0)([split_in_ids, split_in_masks, split_in_segments])

    #Recurrent encoders that turn word and sentence embeddings into fixed-size vectors.
    #The word-level encoder emits sentence_vec_len units (300 when no sentence
    #vectors are supplied) so its outputs can share the chain encoder below.
    word_to_sentence_autoencoder = layers.LSTM(sentence_vec_len if sentence_vec_len else 300,
                                               name="word_to_sword_to_sentence_autoencoder")
    sentences_to_vec_autoencoder = layers.LSTM(300, name='sentences_to_vec_autoencoder')

    if word_vec_len:
        # The LSTM is applied as a regular layer (not from inside a Lambda),
        # the same way for events and candidates.
        encoded_event_sentences = [add_step_axis(word_to_sentence_autoencoder(event_words))
                                   for event_words in split_in_events_words]
        encoded_candidates = [add_step_axis(word_to_sentence_autoencoder(candidate_words))
                              for candidate_words in split_in_candidates_words]
        # One chain per candidate: all event sentences followed by that candidate.
        word_chains = [layers.Concatenate(axis=1)(encoded_event_sentences + [candidate])
                       for candidate in encoded_candidates]
        chains_from_words = [sentences_to_vec_autoencoder(chain) for chain in word_chains]

    if sentence_vec_len:
        sentence_chains = [layers.Concatenate(axis=1)([in_events_sentences, candidate])
                           for candidate in split_in_candidates_sentences]
        chains_from_sents = [sentences_to_vec_autoencoder(chain) for chain in sentence_chains]

    if word_vec_len and sentence_vec_len:
        enhancing_vectors = [layers.Concatenate(axis=1)([from_words, from_sents])
                             for (from_words, from_sents) in zip(chains_from_words, chains_from_sents)]
    elif word_vec_len:
        enhancing_vectors = chains_from_words
    elif sentence_vec_len:
        enhancing_vectors = chains_from_sents

    #Combine Bert and encoder embeddings (if provided)
    if word_vec_len or sentence_vec_len:
        augmented_outputs = [layers.Concatenate(axis=1)([bert_vec, extra_vec])
                             for (bert_vec, extra_vec) in zip(bert_outputs, enhancing_vectors)]
    else:
        augmented_outputs = bert_outputs
    concat_output = layers.Concatenate(axis=1)(augmented_outputs)

    #Single Hidden Layer and classification
    dense = layers.Dense(256, activation='relu')(concat_output)
    pred = layers.Dense(num_labels, activation='softmax')(dense)

    model = models.Model(inputs=inputs, outputs=pred)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

def initialize_vars(sess):
    """Run the local, global and table initializers in ``sess`` and register it with Keras.

    Args:
        sess: The TensorFlow session to initialize and hand to the Keras backend.
    """
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)

In [11]:
# Class count and feature widths are read from the prepared training arrays so
# the model's input layers always match what is passed to fit().
model = build_model(
    prepare_data.MAX_SEQ_LENGTH,
    num_labels=train_labels.shape[-1],
    word_vec_len=train_word_events.shape[-1],
    sentence_vec_len=train_sent_events.shape[-1],
    num_sents=train_word_events.shape[1]
)

# Instantiate variables


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0514 01:02:40.342494 140307046811392 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0514 01:02:42.025304 140307046811392 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0514 01:02:42.832443 140307046811392 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0514 01:02:43.687577 140307046811392 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0514 01:02:44.592120 140307046811392 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0514 01:02:45.334447 140307046811392 saver.py:1483] Saver not created because there are no variables in the graph to restore


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_candidates_words (InputLa (None, 5, None, 300) 0                                            
__________________________________________________________________________________________________
input_event_words (InputLayer)  (None, 5, None, 300) 0                                            
__________________________________________________________________________________________________
lambda_15 (Lambda)              (None, None, 300)    0           input_candidates_words[0][0]     
__________________________________________________________________________________________________
lambda_16 (Lambda)              (None, None, 300)    0           input_candidates_words[0][0]     
__________________________________________________________________________________________________
lambda_17 

# Train

In [None]:
initialize_vars(sess)

# The checkpoint callback writes into `logs`; create it up front rather than
# relying on another callback to have created the directory first.
os.makedirs(logs, exist_ok=True)

callbacks = [
    keras.callbacks.ModelCheckpoint(os.path.join(logs, '{epoch:02d}.h5')),
    keras.callbacks.TensorBoard(log_dir=logs, update_freq=1000)
]

# Order must match the model's inputs: BERT ids / masks / segments, then
# candidate and event word vectors, then candidate and event sentence vectors.
train_inputs = [train_input_ids, train_input_masks, train_segment_ids,
                train_word_candidates, train_word_events,
                train_sent_candidates, train_sent_events]
val_inputs = [val_input_ids, val_input_masks, val_segment_ids,
              val_word_candidates, val_word_events,
              val_sent_candidates, val_sent_events]

model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=3,
    batch_size=1,
    callbacks=callbacks
)

Train on 36 samples, validate on 4 samples
Instructions for updating:
Use tf.cast instead.


W0514 01:02:54.206941 140307046811392 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3


# Predict

In [None]:
# Assemble the held-out test inputs the same way as the train / validation ones.
# NOTE(review): the arrays are padded to the test split's own maxima; the model
# was built with num_sents taken from the training arrays — confirm they agree.
test_input_ids = np.array([f.input_ids for f in test_features])
test_input_masks = np.array([f.input_mask for f in test_features])
test_segment_ids = np.array([f.segment_ids for f in test_features])
test_word_events = pad_to_max_of_max([f.event_concept_vectors for f in test_features])
test_word_candidates = pad_to_max_of_max([f.candidate_concept_vectors for f in test_features])
test_sent_events = pad_to_max([[np.concatenate([f.event_sentence_pos[i], f.event_sentence_dep[i]])
                                for i in range(len(f.event_sentence_pos))]
                               for f in test_features])
test_sent_candidates = np.zeros((test_sent_events.shape[0], 5, 1, test_sent_events.shape[-1]))

# The model has seven inputs; all of them must be supplied, in this order.
test_inputs = [arr[0:100] for arr in (
    test_input_ids, test_input_masks, test_segment_ids,
    test_word_candidates, test_word_events,
    test_sent_candidates, test_sent_events)]

model.save('BertModel.h5')
pre_save_preds = model.predict(test_inputs)  # predictions before we clear and reload model

# Clear and load model (rebuilt with the same arguments as the trained one)
model = None
model = build_model(prepare_data.MAX_SEQ_LENGTH, num_labels=5, word_vec_len=300,
                    sentence_vec_len=169, num_sents=train_word_events.shape[1])
initialize_vars(sess)
model.load_weights('BertModel.h5')

post_save_preds = model.predict(test_inputs)  # predictions after we clear and reload model

# Are they the same? The builtin all() on a 2-D comparison raises an
# "ambiguous truth value" error, so compare the arrays with numpy instead.
np.allclose(pre_save_preds, post_save_preds)