In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install -q tensorflow-text
!pip install -q tf-models-official
!pip install tensorflow-determinism

[K     |████████████████████████████████| 3.4MB 11.1MB/s 
[K     |████████████████████████████████| 1.1MB 12.1MB/s 
[K     |████████████████████████████████| 645kB 41.9MB/s 
[K     |████████████████████████████████| 37.6MB 78kB/s 
[K     |████████████████████████████████| 358kB 37.9MB/s 
[K     |████████████████████████████████| 706kB 41.1MB/s 
[K     |████████████████████████████████| 1.2MB 36.5MB/s 
[K     |████████████████████████████████| 174kB 44.9MB/s 
[K     |████████████████████████████████| 102kB 10.3MB/s 
[K     |████████████████████████████████| 51kB 5.2MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
Collecting tensorflow-determinism
  Downloading https://files.pythonhosted.org/packages/76/56/79d74f25b326d8719753172496abc524980fa67d1d98bb247021376e370a/tensorflow-determinism-0.3.0.tar.gz
Building wheels for collected packages: tensorflow-determinism
  Building wheel for tensorf

In [3]:
general_settings = {
    "seed": 2021,
    "batch_size": 32,
    "validation_split": 0.15,
    "destination_path": "/content/gdrive/MyDrive/Colab Notebooks/output",
    "threshold": 0.55,
    "embedding_size": 0,
    "lstm_hidden_size": 128
}

import os
from typing import List, Dict, Any, Tuple
import json
import shutil
from sklearn.metrics import accuracy_score, classification_report
import random
import numpy as np

"""# Set seed to prevent non-determinism
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC']='1'
os.environ['PYTHONHASHSEED']=str(general_settings['seed'])
random.seed(general_settings['seed'])
np.random.seed(general_settings['seed'])"""

import tensorflow as tf
tf.random.set_seed(general_settings['seed'])
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer
from tensorflow.data import Dataset
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras import layers

import matplotlib.pyplot as plt

"""from fwd9m.tensorflow import enable_determinism
enable_determinism()"""

tf.get_logger().setLevel('ERROR')



def get_test_lists_from_elements(elements_list: List[Dict[str, Any]]) -> Tuple[List[str], List[int], tf.Tensor, tf.Tensor, tf.Tensor, List[int], List[int]]:
    questions = []
    labels = []
    templates_list = []
    embeddings_list = []
    num_embeddings_list = []
    # Save labels in a specific list, otherwise we cannot know the true label of questions without DeepPavlov candidate queries
    labels_list_per_question = []
    # Append every candidate query as an element to predict and save the number of candidate queries for each question. After model evaluation execution, for every
    # question the answer with the maximum probability will be the final prediction. Labels are saved for each candidate query for tensors size consistency, although
    # they will be used once per question
    for element in elements_list:
        num_embeddings_list.append(len(element['deeppavlov_embeddings']))
        labels_list_per_question.append(element['answerable'])
        for query_embeddings in element['deeppavlov_embeddings']:
            labels.append(element['answerable'])
            final_embedding = []
            for embedding in query_embeddings:
                final_embedding.extend(embedding)
            embeddings_list.append(final_embedding)
            templates_list.append(element['template_conv_encoding'])
            # If template encoding length is not saved yet into general_settings, calculate and save it. Do the same for embedding length if the saved value is smaller
            if not "template_encoding_length" in general_settings:
                general_settings['template_encoding_length'] = len(element['template_conv_encoding'])
            if general_settings['embedding_size'] < len(final_embedding):
                general_settings['embedding_size'] = len(final_embedding)
            if element['question']:
                questions.append(element['question'])
            else:
                questions.append(element['NNQT_question'])
    # Add temporarily a list with the maximum length of zeros, because the test set might not have any example with the maximum possible embeddings length, and so
    # the input dimension would be wrong. Doing so "pad_sequence" should pad train and test set uniformly. Before returning the embedding list the last element is dropped
    embeddings_list.append([0] * general_settings['embedding_size'])
    embeddings_list = tf.keras.preprocessing.sequence.pad_sequences(embeddings_list, padding="post")
    embeddings_list = embeddings_list[:-1]
    embeddings_tensor = tf.convert_to_tensor(embeddings_list, dtype=tf.float32)
    masking_layer = layers.Masking()
    masked_embeddings_tensor = masking_layer(embeddings_tensor)
    templates_tensor = tf.convert_to_tensor(templates_list, dtype=tf.float32)
    return questions, labels, templates_tensor, embeddings_tensor, masked_embeddings_tensor, num_embeddings_list, labels_list_per_question

# Support function to avoid code repetition, get questions and labels lists
def get_lists_from_elements(elements_list: List[Dict[str, Any]]) -> Tuple[List[str], List[int], tf.Tensor, tf.Tensor, tf.Tensor]:
    questions = []
    labels = []
    embeddings_list = []
    templates_list = []
    for element in elements_list:
        labels.append(element['answerable'])
        final_embedding = []
        for embedding in element['embeddings']:
            final_embedding.extend(embedding)
        embeddings_list.append(final_embedding)
        templates_list.append(element['template_encoding'])
        # If template encoding length is not saved yet into general_settings, calculate and save it. Do the same for embedding length if the saved value is smaller
        if not "template_encoding_length" in general_settings:
            general_settings['template_encoding_length'] = len(element['template_encoding'])
        if general_settings['embedding_size'] < len(final_embedding):
            general_settings['embedding_size'] = len(final_embedding)
        if element['question']:
            questions.append(element['question'])
        else:
            questions.append(element['NNQT_question'])
    embeddings_list = tf.keras.preprocessing.sequence.pad_sequences(embeddings_list, padding="post")
    embeddings_tensor = tf.convert_to_tensor(embeddings_list, dtype=tf.float32)
    masking_layer = layers.Masking()
    masked_embeddings_tensor = masking_layer(embeddings_tensor)
    templates_tensor = tf.convert_to_tensor(templates_list, dtype=tf.uint8)
    return questions, labels, templates_tensor, embeddings_tensor, masked_embeddings_tensor

# Load dataset data
with open("/content/gdrive/MyDrive/Colab Notebooks/data/LC_QuAD_2_train_balanced_with_embeddings_no_dp.json", "r") as json_file:
    train_data = json.load(json_file)
    train_questions, train_labels, train_templates, train_embeddings, train_masked_embeddings = get_lists_from_elements(train_data)
with open("/content/gdrive/MyDrive/Colab Notebooks/data/LC_QuAD_2_valid_balanced_with_embeddings_no_dp.json", "r") as json_file:
    valid_data = json.load(json_file)
    valid_questions, valid_labels, valid_templates, valid_embeddings, valid_masked_embeddings = get_lists_from_elements(valid_data)
with open("/content/gdrive/MyDrive/Colab Notebooks/data/LC_QuAD_2_test_balanced_with_embeddings.json", "r") as json_file:
    test_data = json.load(json_file)
    test_questions, test_labels, test_templates, test_embeddings, test_masked_embeddings, \
    test_num_embeddings_list, test_labels_list_per_question = get_test_lists_from_elements(test_data)

# Create datasets
AUTOTUNE = tf.data.AUTOTUNE

"""train_set_length = len(train_questions)
val_set_length = int(train_set_length * general_settings['validation_split'])"""
train_set = Dataset.from_tensor_slices(({"questions": train_questions, "templates": train_templates, "embeddings": train_embeddings, \
                                         "masked_embeddings": train_masked_embeddings}, train_labels))

val_set = Dataset.from_tensor_slices(({"questions": valid_questions, "templates": valid_templates, "embeddings": valid_embeddings, \
                                       "masked_embeddings": valid_masked_embeddings}, valid_labels))

"""train_set = train_set.shuffle(train_set_length, seed=general_settings['seed'])
val_set = train_set.take(val_set_length)
train_set = train_set.skip(val_set_length)"""
train_set = train_set.batch(general_settings['batch_size'])
val_set = val_set.batch(general_settings['batch_size'])
train_set = train_set.cache().prefetch(buffer_size=AUTOTUNE)
val_set = val_set.cache().prefetch(buffer_size=AUTOTUNE)

test_set = Dataset.from_tensor_slices(({"questions": test_questions, "templates": test_templates, "embeddings": test_embeddings, \
                                        "masked_embeddings": test_masked_embeddings}, test_labels))
test_set = test_set.batch(general_settings['batch_size'])
test_set = test_set.cache().prefetch(buffer_size=AUTOTUNE)

# Build model
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
"""tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3'"""

def build_classifier_model():
  questions_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='questions')
  templates_input = tf.keras.layers.Input(shape=(general_settings['template_encoding_length']), dtype=tf.float32, name='templates')
  embeddings_input = tf.keras.layers.Input(shape=(general_settings['embedding_size']), dtype=tf.float32, name='embeddings')
  masked_embeddings_input = tf.keras.layers.Input(shape=(general_settings['embedding_size']), dtype=tf.string, name='masked_embeddings')
  # Preprocess text input
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(questions_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  # 'pooled_output' is the [CLS] token
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.25)(net)

  # Process embeddings
  embeddings_input_exp = tf.expand_dims(embeddings_input, axis=1)
  emb_net = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(general_settings['lstm_hidden_size']))(embeddings_input_exp)
  emb_net = tf.keras.layers.Dropout(0.25)(emb_net)
  emb_net = tf.keras.layers.Dense(general_settings['lstm_hidden_size'], activation='relu')(emb_net)

  # Process the concatenation of template, BERT result and LSTM result
  net = tf.keras.layers.Dense(512 + general_settings['lstm_hidden_size'], activation=None)(tf.concat([templates_input, net, emb_net], 1))
  net = tf.keras.layers.Dense(512 + general_settings['lstm_hidden_size'], activation=None)(net)
  net = tf.keras.layers.Dense(512 + general_settings['lstm_hidden_size'], activation=None)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model([questions_input, templates_input, embeddings_input, masked_embeddings_input], net)

# Define BCE loss function and accuracy metric
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy(threshold=general_settings['threshold'])

epochs = 70
steps_per_epoch = tf.data.experimental.cardinality(train_set).numpy()
num_train_steps = steps_per_epoch * epochs
# Number of steps (10%) of fixed learning rate before linear decay
num_warmup_steps = int(0.1*num_train_steps)

# For BERT fine-tuning is recommended a low learning rate (between 2e-5 and 5e-5)
init_lr = 1e-6
# Optimizer is AdamW, a version of Adam that uses weights decay instead of moments
"""optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')"""
optimizer = Adam(init_lr)
# Build model and load information
classifier_model = build_classifier_model()
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)
callbacks = [
    ModelCheckpoint(
        # Path where to save the model. The two parameters below mean that we will overwrite
        # the current checkpoint if and only if the `val_loss` score has improved.
        # The saved model name will include the current epoch.
        filepath=general_settings['destination_path'] + "/bert_question_embeddings",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_loss",
        verbose=1,
    ),
  EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
]

# Freeze BERT training
"""for w in classifier_model.get_layer('BERT_encoder').weights:
    w._trainable = False"""

# Train
print(f'Training model with {tfhub_handle_encoder}')
"""history = classifier_model.fit(x=train_set,
                               validation_data=val_set,
                               shuffle=False,
                               epochs=epochs,
                               callbacks=callbacks)"""

# Load model
classifier_model = load_model(general_settings['destination_path'] + "/bert_question_embeddings")

# Test
y_pred = classifier_model.predict(test_set).tolist()

# Get the label majority class
zero_labels = 0
one_labels = 0
for question_label in test_labels_list_per_question:
    if question_label == 0:
        zero_labels += 1
    else:
        one_labels += 1
# With a perfectly balanced test set, the chosen majority class is 0
if one_labels > zero_labels:
    majority_class = 1
else:
    majority_class = 0
# Save predictions for each question, choosing the candidate query with the max probability
real_y_pred = []
y_index = 0
for num_embeddings in test_num_embeddings_list:
    if num_embeddings > 0:
        for index in range(num_embeddings):
            if index == 0:
                best_output = y_pred[y_index]
                best_index = 0
            elif y_pred[y_index + index] > best_output:
                best_index = index
        real_y_pred.append(int(tf.sigmoid(y_pred[best_index]) > general_settings['threshold']))
        y_index += num_embeddings
    else:
        # Question without candidate queries, answer with the majority class
        real_y_pred.append(majority_class)

# Metrics print
print('Classification Report:')
print(classification_report(test_labels_list_per_question, real_y_pred, labels=[1,0], digits=4))

# Write predictions to file
with open(general_settings['destination_path'] + "/bert_question_embeddings/model_predictions.txt", "w") as answers_file:
    answers_file.write(str(real_y_pred))

Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1


Exception ignored in: <function CapturableResourceDeleter.__del__ at 0x7fefa3f0f560>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/training/tracking/tracking.py", line 208, in __del__
    self._destroy_resource()
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 828, in __call__
    result = self._call(*args, **kwds)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 871, in _call
    self._initialize(args, kwds, add_initializers_to=initializers)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 726, in _initialize
    *args, **kwds))
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 2969, in _get_concrete_function_internal_garbage_collected
    graph_function, _ = self._maybe_define_function(args, kwargs)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/p

Classification Report:
              precision    recall  f1-score   support

           1     0.5529    0.6552    0.5997       319
           0     0.5436    0.4367    0.4843       300

    accuracy                         0.5493       619
   macro avg     0.5482    0.5459    0.5420       619
weighted avg     0.5484    0.5493    0.5438       619

