In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install -q tensorflow-text
!pip install -q tf-models-official
!pip install tensorflow-determinism

[K     |████████████████████████████████| 3.4MB 8.3MB/s 
[K     |████████████████████████████████| 1.1MB 8.6MB/s 
[K     |████████████████████████████████| 706kB 27.4MB/s 
[K     |████████████████████████████████| 174kB 50.9MB/s 
[K     |████████████████████████████████| 37.6MB 80kB/s 
[K     |████████████████████████████████| 1.2MB 56.0MB/s 
[K     |████████████████████████████████| 51kB 8.5MB/s 
[K     |████████████████████████████████| 358kB 52.0MB/s 
[K     |████████████████████████████████| 102kB 15.5MB/s 
[K     |████████████████████████████████| 645kB 50.9MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
Collecting tensorflow-determinism
  Downloading https://files.pythonhosted.org/packages/76/56/79d74f25b326d8719753172496abc524980fa67d1d98bb247021376e370a/tensorflow-determinism-0.3.0.tar.gz
Building wheels for collected packages: tensorflow-determinism
  Building wheel for tensorflo

In [4]:
general_settings = {
    "seed": 2021,
    "batch_size": 32,
    "validation_split": 0.15,
    "destination_path": "/content/gdrive/MyDrive/Colab Notebooks/output",
    "threshold": 0.5
}

import os
from typing import List, Dict, Any, Tuple
import json
import shutil
from sklearn.metrics import accuracy_score, classification_report
import random
import numpy as np

"""# Set seed to prevent non-determinism
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC']='1'
os.environ['PYTHONHASHSEED']=str(general_settings['seed'])
random.seed(general_settings['seed'])
np.random.seed(general_settings['seed'])"""

import tensorflow as tf
tf.random.set_seed(general_settings['seed'])
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer
from tensorflow.data import Dataset
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import load_model

import matplotlib.pyplot as plt

"""from fwd9m.tensorflow import enable_determinism
enable_determinism()"""

tf.get_logger().setLevel('ERROR')



# Support function to avoid code repetition, get questions_answers and labels lists
def get_lists_from_elements(elements_list: List[Dict[str, Any]]) -> Tuple[List[str], List[int]]:
    questions_answers = []
    labels = []
    for element in elements_list:
        answer = element['deeppavlov_answer']
        if element['question']:
            questions_answers.append(element['question'] + " [SEP] " + answer)
        else:
            questions_answers.append(element['NNQT_question'] + " [SEP] " + answer)
        if 'has_answer' in element and element['has_answer'] == False:
            labels.append(0)
        else:
            labels.append(1)
    return questions_answers, labels

# Load dataset data
with open("/content/gdrive/MyDrive/Colab Notebooks/data/LC_QuAD_2_train_balanced_with_embeddings_no_dp.json", "r") as json_file:
    train_data = json.load(json_file)
    train_questions_answers, train_labels = get_lists_from_elements(train_data)
with open("/content/gdrive/MyDrive/Colab Notebooks/data/LC_QuAD_2_valid_balanced_with_embeddings_no_dp.json", "r") as json_file:
    valid_data = json.load(json_file)
    valid_questions_answers, valid_labels = get_lists_from_elements(valid_data)
with open("/content/gdrive/MyDrive/Colab Notebooks/data/LC_QuAD_2_test_balanced_with_embeddings.json", "r") as json_file:
    test_data = json.load(json_file)
    test_questions_answers, test_labels = get_lists_from_elements(test_data)

# Create datasets
AUTOTUNE = tf.data.AUTOTUNE

"""train_set_length = len(train_questions)
val_set_length = int(train_set_length * general_settings['validation_split'])"""
train_set = Dataset.from_tensor_slices((train_questions_answers, train_labels))

val_set = Dataset.from_tensor_slices((valid_questions_answers, valid_labels))

"""train_set = train_set.shuffle(train_set_length, seed=general_settings['seed'])
val_set = train_set.take(val_set_length)
train_set = train_set.skip(val_set_length)"""
train_set = train_set.batch(general_settings['batch_size'])
val_set = val_set.batch(general_settings['batch_size'])
train_set = train_set.cache().prefetch(buffer_size=AUTOTUNE)
val_set = val_set.cache().prefetch(buffer_size=AUTOTUNE)

test_set = Dataset.from_tensor_slices((test_questions_answers, test_labels))
test_set = test_set.batch(general_settings['batch_size'])
test_set = test_set.cache().prefetch(buffer_size=AUTOTUNE)

# Build model
"""tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'"""
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  # Preprocess text input
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  # 'pooled_output' is the [CLS] token
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.25)(net)
  net = tf.keras.layers.Dense(768, activation=None)(net)
  net = tf.keras.layers.Dense(768, activation=None)(net)
  net = tf.keras.layers.Dense(768, activation=None)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

# Define BCE loss function and accuracy metric
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy(threshold=general_settings['threshold'])

epochs = 70
steps_per_epoch = tf.data.experimental.cardinality(train_set).numpy()
num_train_steps = steps_per_epoch * epochs
# Number of steps (10%) of fixed learning rate before linear decay
num_warmup_steps = int(0.1*num_train_steps)

# For BERT fine-tuning is recommended a low learning rate (between 2e-5 and 5e-5)
init_lr = 4e-6
# Optimizer is AdamW, a version of Adam that uses weights decay instead of moments
"""optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')"""
optimizer = Adam(init_lr)
# Build model and load information
classifier_model = build_classifier_model()
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)
callbacks = [
    ModelCheckpoint(
        # Path where to save the model. The two parameters below mean that we will overwrite
        # the current checkpoint if and only if the `val_loss` score has improved.
        # The saved model name will include the current epoch.
        filepath=general_settings['destination_path'] + "/bert_question_answer",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_loss",
        verbose=1,
    ),
  EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
]

# Freeze BERT training
"""for w in classifier_model.get_layer('BERT_encoder').weights:
    w._trainable = False"""

# Train
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=train_set,
                               validation_data=val_set,
                               shuffle=False,
                               epochs=epochs,
                               callbacks=callbacks)

# Load model
classifier_model = load_model(general_settings['destination_path'] + "/bert_question_answer")

# Test
y_pred = classifier_model.predict(test_set).tolist()
# Get answer label from y_pred, using a Sigmoid to normalize the result
y_final_pred = []
for pred in y_pred:
  normalized_pred = tf.sigmoid(pred)
  if normalized_pred > general_settings['threshold']:
    y_final_pred.append(1)
  else:
    y_final_pred.append(0)

# Print final metrics
print('Classification Report:')
print(classification_report(test_labels, y_final_pred, labels=[1,0], digits=4))

# Write predictions to file
with open(general_settings['destination_path'] + "/bert_question_answer/model_predictions.txt", "w") as answers_file:
    answers_file.write(str(y_final_pred))

Training model with https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
Classification Report:
              precision    recall  f1-score   support

           1     0.5727    0.7774    0.6596       319
           0     0.6183    0.3833    0.4733       300

    accuracy                         0.5864       619
   macro avg     0.5955    0.5804    0.5664       619
weighted avg     0.5948    0.5864    0.5693       619

