In [1]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import GlobalMaxPooling1D, Dense, Bidirectional, LSTM, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, concatenate
from tensorflow.python.client import device_lib
from tensorflow.keras import backend as K

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [2]:
# Load the train / TB-test / TM-test corpora with TextPreprocessOld, then drop from the
# training set every (source, target, label) triple that also occurs in either test set.
from preprocessing import TextPreprocessOld

# Language pair and vocabulary / length limits handed to TextPreprocessOld.
srcLang = "eng"
tgtLang = "fra"
src_vocab_size = 20000
src_len = 150
tgt_vocab_size = 20000
tgt_len = 150
train_data_dir = "/linguistics/ethan/DL_Prototype/datasets/TQA/train"
test_tb_data_dir = "/linguistics/ethan/DL_Prototype/datasets/TQA/test/TB_test"
test_tm_data_dir = "/linguistics/ethan/DL_Prototype/datasets/TQA/test/TM_test"

# Class index assigned to each label name; "good" is the positive class (1).
label_class_map = {"good": 1, "bad": 0}
tp = TextPreprocessOld(srcLang, tgtLang, src_vocab_size=src_vocab_size, src_len=src_len,
                    tgt_vocab_size=tgt_vocab_size, tgt_len=tgt_len)
# NOTE(review): despite the `*_integers` names these are presumably raw sentences, since they
# are later fed to models with string inputs -- confirm against read_dataset_from_directory.
train_src_integers, train_tgt_integers, train_labels = tp.read_dataset_from_directory(train_data_dir, label_class_map)
test_tb_src_integers, test_tb_tgt_integers, test_tb_labels = tp.read_dataset_from_directory(test_tb_data_dir, label_class_map)
test_tm_src_integers, test_tm_tgt_integers, test_tm_labels = tp.read_dataset_from_directory(test_tm_data_dir, label_class_map)

# Remove training samples that also appear in the test data. Only exact
# (source, target, label) matches are removed. Going through a set also drops duplicate
# training triples and leaves the remaining samples in arbitrary (set iteration) order.
test_data = set(zip(test_tm_src_integers, test_tm_tgt_integers, test_tm_labels)).union(
            set(zip(test_tb_src_integers, test_tb_tgt_integers, test_tb_labels)))
train_data = set(zip(train_src_integers, train_tgt_integers, train_labels))
train_data = train_data.difference(test_data)

# Unpack the surviving triples back into three parallel numpy arrays.
train_src_integers = np.array([td[0] for td in train_data])
train_tgt_integers = np.array([td[1] for td in train_data])
train_labels = np.array([td[2] for td in train_data])

print("Final number of training samples: {}".format(train_src_integers.shape[0]))
# Free the intermediate sets; only the arrays are needed from here on.
del train_data, test_data


Importing Data
	5 pairs of good English-French files found.
	9 pairs of bad English-French files found.
Importing Data Complete.
	352872 good entries
	417208 bad entries

Importing Data
	1 pairs of good English-French files found.
	1 pairs of bad English-French files found.
Importing Data Complete.
	400 good entries
	400 bad entries

Importing Data
	1 pairs of good English-French files found.
	1 pairs of bad English-French files found.
Importing Data Complete.
	400 good entries
	400 bad entries
Final number of training samples: 765175


In [3]:
def _one_hot_labels(labels, num_classes=2):
    """One-hot encode class-index labels to shape (n, num_classes).

    Labels that are already a 2-D array with `num_classes` columns are returned
    unchanged, so re-running this cell does not encode them a second time.
    """
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[-1] == num_classes:
        return labels
    return tf.keras.utils.to_categorical(labels, num_classes=num_classes)


# The softmax output head expects one-hot targets of shape (n, 2).
train_labels = _one_hot_labels(train_labels)
test_tm_labels = _one_hot_labels(test_tm_labels)
test_tb_labels = _one_hot_labels(test_tb_labels)

In [None]:
# Export up to 100 training pairs labelled "bad" to a spreadsheet for manual inspection.
output = "/linguistics/ethan/DL_Prototype/datasets/train_sample/bad_tm.sample.xlsx"
sample_size = 100
bad_class = label_class_map["bad"]

bad_tm = []
for src, tgt, label in zip(train_src_integers, train_tgt_integers, train_labels):
    if len(bad_tm) == sample_size:
        break
    # Labels may already be one-hot rows, for which `label == 0` is an element-wise
    # comparison that cannot be used in `if`. Reduce to a class index first; scalar
    # labels pass through unchanged.
    class_id = int(np.argmax(label)) if np.ndim(label) else int(label)
    if class_id == bad_class:
        bad_tm.append((src, tgt))

df = pd.DataFrame(bad_tm, columns=["English", "French"])
df.to_excel(output, header=True, index=False)

In [None]:
# check if test data has TM which also exists in training data
# test_data = set(zip(test_tm_src_integers, test_tm_tgt_integers, test_tm_labels))
# train_data = set(zip(train_src_integers, train_tgt_integers, train_labels))
# train_data = train_data.difference(test_data)
# next(iter(train_data))

In [None]:
# intersect_sources = [p[0] for p in intersect_data]
# train_src_integers, train_tgt_integers, train_labels
# np.where(test_tm_src_integers in intersect_sources, True, False)
# a = np.array(["1","2","3","4"])
# b = np.array(["1","2","5","4", "6"])
# (np.in1d(a, b))
# sum(np.in1d(test_tm_src_integers, train_src_integers))
# new_train_src_integers, new_train_tgt_integers, new_train_labels = [], [], []
# for s, t, l in zip(train_src_integers, train_tgt_integers, train_labels):
#     if (s not in test_tm_src_integers) and (t not in test_tm_tgt_integers):
#         new_train_src_integers.append(s)
#         new_train_tgt_integers.append(t)
#         new_train_labels.append(l)
# len(set(zip(train_src_integers, train_tgt_integers, train_labels)))
# train_src_integers.shape[0]

In [4]:
# Local directories of the pretrained TF-Hub modules loaded with hub.KerasLayer:
# the multilingual text preprocessor and the LaBSE sentence encoder.
preprocessor_dir = "/linguistics/ethan/DL_Prototype/models/universal-sentence-encoder-cmlm_multilingual-preprocess_2"
LaBSE_dir = "/linguistics/ethan/DL_Prototype/models/LaBSE2_encoder"

In [None]:
# Text vectorization using the pretrained preprocessor (pinned to CPU).
# The module-level `preprocessor` is also used by `classify1`.
with tf.device("/cpu:0"):
    preprocessor = hub.KerasLayer(preprocessor_dir, trainable=False)

#     train_src_integers = preprocessor(train_src_integers)
#     train_tgt_integers = preprocessor(train_tgt_integers)

    # The original cell referenced `test_src_integers` / `test_tgt_integers`, which are
    # never defined. Preprocess both test sets explicitly, into new names, so the raw
    # sentences stay available for the models that take string inputs.
    test_tm_src_inputs = preprocessor(test_tm_src_integers)
    test_tm_tgt_inputs = preprocessor(test_tm_tgt_integers)
    test_tb_src_inputs = preprocessor(test_tb_src_integers)
    test_tb_tgt_inputs = preprocessor(test_tb_tgt_integers)

In [None]:
# train_src_integers["input_type_ids"].shape

In [None]:
# Frozen (trainable=False) LaBSE encoder; `build_model` reads this module-level `encoder`.
encoder = hub.KerasLayer(LaBSE_dir, trainable=False)

In [5]:
"""Define TQC models with and without preprocessor layer as part of model."""

def normalize(embeds, epsilon=1e-12):
    """Scale every row of `embeds` to unit L2 norm.

    Args:
        embeds: 2-D array-like of shape (n, dim).
        epsilon: lower bound on the squared norm, the same convention as the
            `epsilon` of `tf.math.l2_normalize` used by the model builders. Rows
            whose norm is below sqrt(epsilon) are divided by sqrt(epsilon), so an
            all-zero row stays zero instead of turning into NaN.

    Returns:
        `embeds` divided row-wise by its clamped L2 norm.
    """
    l2_norm = np.linalg.norm(embeds, 2, axis=1, keepdims=True)
    # A plain Python float keeps the dtype of `l2_norm` (e.g. float32) unchanged.
    return embeds / np.maximum(l2_norm, epsilon ** 0.5)

def build_model(max_seq_len):
    """Build the pair classifier that consumes already-tokenised inputs.

    Six int32 inputs of length `max_seq_len` are created: word ids, mask and type ids
    for the source sentence, then the same three for the target sentence. Both sides
    go through the module-level `encoder`; the "default" outputs are L2-normalised,
    concatenated and passed through Dense(256) -> Dense(64) -> Dense(1, sigmoid).

    Args:
        max_seq_len: length of each token-level input.

    Returns:
        A Keras Model with inputs ordered [src word, src mask, src type,
        tgt word, tgt mask, tgt type] and a single sigmoid output.
    """

    def token_inputs():
        # Dict literals evaluate in order: word ids, then mask, then type ids.
        return {
            "input_word_ids": Input((max_seq_len,), dtype=tf.int32),
            "input_mask": Input((max_seq_len,), dtype=tf.int32),
            "input_type_ids": Input((max_seq_len,), dtype=tf.int32),
        }

    src_inputs = token_inputs()
    tgt_inputs = token_inputs()

    # Encode both sides first, then normalise both.
    embeddings = [encoder(features)["default"] for features in (src_inputs, tgt_inputs)]
    unit_vectors = [
        tf.math.l2_normalize(embedding, axis=1, epsilon=1e-12, name=None)
        for embedding in embeddings
    ]

    hidden = tf.concat(unit_vectors, axis=1)
    for units in (256, 64):
        hidden = Dense(units, activation='relu')(hidden)
    score = Dense(1, activation='sigmoid')(hidden)

    return Model([*src_inputs.values(), *tgt_inputs.values()], score)

def build_model_with_preprocessor(max_seq_len, preprocessor_dir, LaBSE_dir):
    """Build the pair classifier that takes raw source / target strings.

    The preprocessor and the frozen encoder are loaded from the given directories and
    become part of the model. The encoder's "default" outputs for both sides are
    L2-normalised, concatenated and passed through Dense(128) -> Dense(8) ->
    Dense(1, sigmoid).

    Args:
        max_seq_len: accepted for signature parity with the other builders; not used here.
        preprocessor_dir: path of the TF-Hub preprocessing module.
        LaBSE_dir: path of the TF-Hub encoder module.

    Returns:
        A Keras Model with inputs [input_src_text, input_tgt_text] and a sigmoid output.
    """
    text_inputs = [
        tf.keras.layers.Input(shape=(), dtype=tf.string, name="input_src_text"),
        tf.keras.layers.Input(shape=(), dtype=tf.string, name="input_tgt_text"),
    ]

    preprocessor = hub.KerasLayer(preprocessor_dir, trainable=False)
    encoder = hub.KerasLayer(LaBSE_dir, trainable=False)

    # Same call order as before: preprocess both, encode both, normalise both.
    tokenised = [preprocessor(texts) for texts in text_inputs]
    embeddings = [encoder(features)["default"] for features in tokenised]
    unit_vectors = [
        tf.math.l2_normalize(embedding, axis=1, epsilon=1e-12, name=None)
        for embedding in embeddings
    ]

    hidden = tf.concat(unit_vectors, axis=1)
    for units in (128, 8):
        hidden = Dense(units, activation='relu')(hidden)
    score = Dense(1, activation='sigmoid')(hidden)

    return Model(text_inputs, score)


def build_model_with_preprocessor_and_lstm(max_seq_len, preprocessor_dir, LaBSE_dir, softmax=False):
    """Build the string-input pair classifier with a Bi-LSTM on top of the frozen encoder.

    The encoder's "sequence_output" for source and target is concatenated, fed to a
    bidirectional LSTM, pooled with both average and max pooling, regularised with
    dropout and passed through Dense(2048) -> Dense(512) -> Dense(8) before the head.

    Args:
        max_seq_len: accepted for signature parity with the other builders; not used here.
        preprocessor_dir: path of the TF-Hub preprocessing module.
        LaBSE_dir: path of the TF-Hub encoder module.
        softmax: when True the head is Dense(2, softmax) and the labels must be one-hot
            encoded to shape (n, num_classes); otherwise the head is Dense(1, sigmoid).

    Returns:
        A Keras Model with inputs [input_src_text, input_tgt_text].
    """
    text_inputs = [
        tf.keras.layers.Input(shape=(), dtype=tf.string, name="input_src_text"),
        tf.keras.layers.Input(shape=(), dtype=tf.string, name="input_tgt_text"),
    ]

    preprocessor = hub.KerasLayer(preprocessor_dir, trainable=False)
    encoder = hub.KerasLayer(LaBSE_dir, trainable=False)

    # Preprocess both sides, then take the per-token encoder outputs of both.
    tokenised = [preprocessor(texts) for texts in text_inputs]
    token_states = [encoder(features)["sequence_output"] for features in tokenised]
    paired_states = concatenate(token_states)

    # Trainable layers on top of the frozen encoder adapt its features to this task.
    recurrent = Bidirectional(LSTM(768, return_sequences=True))(paired_states)
    # Hybrid pooling: average and max over the Bi-LSTM sequence output, side by side.
    pooled = concatenate([GlobalAveragePooling1D()(recurrent),
                          GlobalMaxPooling1D()(recurrent)])
    hidden = Dropout(0.3)(pooled)

    for units in (2048, 512, 8):
        hidden = Dense(units, activation="relu")(hidden)

    head = Dense(2, activation='softmax') if softmax else Dense(1, activation='sigmoid')

    return Model(text_inputs, head(hidden))

In [None]:
# embeds = tf.constant([[1,2,3,4], [1,2, 3, 4]], dtype=tf.float32)
# embeds / np.linalg.norm(embeds, 2, axis=1, keepdims=True)
# # embeds
# tf.math.l2_normalize(
#     embeds, axis=1, epsilon=1e-12, name=None
# )


In [None]:
# Scratch check of a 2-unit softmax head on a 128-dimensional input.
# Note: this cell binds the module-level names `x`, `output` and `model`.
x = Input((128,))
output = Dense(2, activation='softmax')(x)
# output = Dense(1, activation='sigmoid')(x)
model = Model(x, output)

model.summary()
# `x` is rebound here from the symbolic Input to a concrete random batch of one sample.
x = tf.random.uniform((1,128))
model.predict(x)

In [None]:
# 1 / np.sqrt(30)
# tf.keras.layers.Lambda(
#       lambda x: tf.nn.l2_normalize(x, axis=1))(embeds)
# tf.nn.l2_normalize(embeds, axis=1)

In [6]:
"""Specify training hyperparameters"""

def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Names of the GPUs visible to TensorFlow on this machine.
gpu_devices = get_available_gpus()
epochs = 10
# Passed as `max_seq_len` to the model builders below.
max_seq_len = 128
batch_size = 128
num_samples = len(train_labels)
# Number of full batches per epoch; int() truncates, so a trailing partial batch is not counted.
steps_per_epoch = int(num_samples / batch_size)

In [None]:
# model = build_model(max_seq_len)
# model2 = build_model_with_preprocessor(max_seq_len, preprocessor_dir, LaBSE_dir)
# Build the Bi-LSTM classifier with the 2-unit softmax head (softmax=True); per the builder's
# docstring this requires one-hot encoded labels of shape (n, num_classes).
model3 = build_model_with_preprocessor_and_lstm(max_seq_len, preprocessor_dir, LaBSE_dir, softmax=True)
# del model3
# model2.summary()

In [None]:
# model2.summary()
# mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
# with mirrored_strategy.scope():
# for d in ["/gpu:0", "/gpu:1"]:
    
#     with tf.device(d):

#         model = build_model(max_seq_len)
#         model.compile(optimizer="adam",
#                       loss="binary_crossentropy",
#                       metrics=["accuracy"])
#         model.fit(x=[train_src_integers["input_word_ids"], train_src_integers["input_mask"], train_src_integers["input_type_ids"],
#                      train_tgt_integers["input_word_ids"], train_tgt_integers["input_mask"], train_tgt_integers["input_type_ids"]],
#                   y=train_labels,
#                   batch_size=batch_size,
#                   epochs=epochs,
#                   steps_per_epoch=steps_per_epoch,
#                   validation_split=0.1)

In [14]:
# train_src_integers.shape
# Sanity check: number of training labels.
len(train_labels)

765175

In [None]:
# continue training from checkpoint
# checkpoint_path = "/linguistics/ethan/DL_Prototype/models/TQA_models/Multilingual-LaBSE-Bidirectional-LSTM_ckpts/tqc-0002.ckpt"
# new_model = tf.keras.models.load_model(checkpoint_path)
# assert_allclose(model.predict(x_train),
#                 new_model.predict(x_train),
#                 1e-5)

# fit the model
# checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]
# new_model.fit(x=[test_tm_src_integers, test_tm_tgt_integers],
#                y=test_tm_labels, epochs=3, batch_size=16, callbacks=callbacks_list)

In [None]:
# Ad-hoc inspection. Only the last expression (`model3`) is displayed as cell output;
# the two `type(...)` results are computed and discarded.
type(train_labels)
type(train_src_integers)
model3

In [None]:
"""Evaluation on Test data."""

In [None]:
# Get predictions from the trained model on the TM and TB test sentence pairs.
tm_predictions = model3.predict([test_tm_src_integers, test_tm_tgt_integers])
tb_predictions = model3.predict([test_tb_src_integers, test_tb_tgt_integers])


def _predicted_classes(predictions, threshold=0.5):
    """Turn model outputs into a list of 0/1 class ids.

    Handles both output heads: for a 2-column softmax output, column 1 (the
    probability of class 1) is thresholded; a single sigmoid column is
    thresholded directly.
    """
    scores = np.asarray(predictions)
    if scores.ndim == 2 and scores.shape[1] == 2:
        scores = scores[:, 1]
    return (scores.reshape(-1) > threshold).astype(int).tolist()


def _true_classes(labels):
    """Return class ids for labels given either as class ids or as one-hot rows."""
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[1] > 1:
        return labels.argmax(axis=1)
    return labels.reshape(-1).astype(int)


tm_pred = _predicted_classes(tm_predictions)
tb_pred = _predicted_classes(tb_predictions)

# confusion_matrix needs 1-D class ids on both sides; one-hot labels are reduced first.
# labels=[1, 0] puts class 1 in the first row / column.
print(confusion_matrix(_true_classes(test_tm_labels), tm_pred, labels=[1,0]))
print(confusion_matrix(_true_classes(test_tb_labels), tb_pred, labels=[1,0]))

In [None]:
# threshold = 0.3
def _positive_class_scores(predictions):
    """Return one score per sample: column 1 of a 2-column softmax output, else the flattened values."""
    scores = np.asarray(predictions)
    if scores.ndim == 2 and scores.shape[1] == 2:
        return scores[:, 1]
    return scores.reshape(-1)


def _as_class_ids(labels):
    """Return 1-D class ids for labels given either as class ids or as one-hot rows."""
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[1] > 1:
        return labels.argmax(axis=1)
    return labels.reshape(-1).astype(int)


def evaluate(predictions, true_labels):
    """Score predictions against the true labels at thresholds 0.1, 0.2, ..., 0.9.

    Args:
        predictions: model outputs, either a single sigmoid column or a 2-column
            softmax output (column 1 is used as the score of class 1).
        true_labels: class ids, or one-hot rows of shape (n, 2).

    Returns:
        A DataFrame with one row per threshold and the columns
        Threshold, Accuracy, Recall, Precision and F1.
    """
    # Reduce both inputs to 1-D once; the metric functions cannot mix one-hot
    # targets with 1-D predictions, and 2-element rows cannot be compared in `if`.
    scores = _positive_class_scores(predictions)
    y_true = _as_class_ids(true_labels)

    records = []
    for threshold in np.linspace(0.1, 0.9, num=9):
        pred = (scores > threshold).astype(int)
        acc = accuracy_score(y_true, pred)
        rec = recall_score(y_true, pred, labels=[1, 0])
        pre = precision_score(y_true, pred, labels=[1, 0])
        f1 = f1_score(y_true, pred, labels=[1, 0])
        records.append((threshold, acc, rec, pre, f1))

    df = pd.DataFrame(records, columns=["Threshold", "Accuracy", "Recall", "Precision", "F1"])
    return df

# Threshold sweep on the TM test set; the returned DataFrame is shown as the cell output.
evaluate(tm_predictions, test_tm_labels)
# evaluate(tb_predictions, test_tb_labels)

In [None]:
# Save the complete classifier. The notebook's classifier is `model3`; the name `model`
# is only ever bound to the 128-dim toy softmax model from the scratch cell above.
cls_model_dir = "/linguistics/ethan/DL_Prototype/models/LaBSE2_based_tqc"
model3.save(cls_model_dir)

In [None]:
"""Load complete trained model."""

# Load trained weights from a checkpoint into the already-built `model3`
# (the architecture must match the one the checkpoint was written from).
cls_weights_dir = "/linguistics/ethan/DL_Prototype/models/TQA_models/Multilingual-LaBSE-Bidirectional-LSTM_ckpts/training_job_7/tqc-0005.ckpt"
model3.load_weights(cls_weights_dir)

def recall_m(y_true, y_pred):
    """Recall over a batch: rounded true positives divided by rounded actual positives.

    Products and labels are clipped to [0, 1] and rounded before counting;
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over a batch: rounded true positives divided by rounded predicted positives.

    Products and predictions are clipped to [0, 1] and rounded before counting;
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

# The saved model refers to the custom metrics by name, so they are supplied on load.
custom_objects={"recall_m": recall_m, "precision_m": precision_m}
cls_model_dir = "/linguistics/ethan/DL_Prototype/models/TQA_models/Multilingual-LaBSE-Bidirectional-LSTM_ckpts/tqc-0009.ckpt"
# Rebinds `model3` to the model restored from this checkpoint, replacing the instance
# that had weights loaded into it earlier in this cell.
model3 = tf.keras.models.load_model(cls_model_dir, custom_objects)

In [None]:
# prediction = model3.predict()
# output = 
# new_model3 = tf.keras.Model(model3.inputs,
# dir(model3)
# input_array = np.array([[1,2], [3,4]])
# print(input_array.shape)
# dense_output = Dense(1, activation='sigmoid')(input_array)
# print(dense_output) 
# tf.where(dense_output > 0.5, 1, 0)
# dense_output = np.array([[0.2], [0.4], [0.6]])
# np.where(dense_output > 0.5, 1, 0)

In [None]:
# Scratch check of binary cross-entropy and binary accuracy on a tiny 2-sample, 2-column example.
# With from_logits=True the loss treats the values in `logits` as raw logits.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
accuracy_fn = tf.keras.metrics.BinaryAccuracy()
logits = np.array([[0.4, 0.6], [0.2, 0.8]])
targets = np.array([[1, 0], [0, 1]])
# The third positional argument is the sample weight (None = unweighted).
print(loss_fn(targets, logits, None))
print(accuracy_fn(targets, logits, None))
# Softmax of the same values, printed for comparison.
print(tf.nn.softmax(logits))

In [None]:
def classify1(text_pairs, threshold=0.5):
    """Classify (source, target) text pairs with the token-input model.

    Uses the module-level `preprocessor` to tokenise each side and the module-level
    `model`, which is fed six inputs: word ids, mask and type ids of the source,
    then of the target.

    Args:
        text_pairs: sequence of (source_text, target_text) pairs.
        threshold: a score strictly above this value yields label 1.

    Returns:
        A list with one 0/1 label per pair.
    """
    feature_keys = ("input_word_ids", "input_mask", "input_type_ids")

    encoded_src = preprocessor([pair[0] for pair in text_pairs])
    encoded_tgt = preprocessor([pair[1] for pair in text_pairs])

    # Source features first, then target features, each in `feature_keys` order.
    model_inputs = [encoded[key] for encoded in (encoded_src, encoded_tgt) for key in feature_keys]

    labels = []
    for score in model.predict(model_inputs):
        labels.append(1 if score > threshold else 0)
    return labels

def classify2(text_pairs, threshold=0.5):
    """Classify sentence pairs with the string-input model `model3`.

    Works with both output heads of `model3`: for a 2-column softmax output, column 1
    (the probability of class 1) is compared with `threshold`; a single sigmoid column
    is compared directly.

    Args:
        text_pairs: the inputs expected by `model3.predict`, i.e. [src_texts, tgt_texts].
        threshold: a score strictly above this value yields label 1.

    Returns:
        A list with one 0/1 label per sample.
    """
    scores = np.asarray(model3.predict(text_pairs))
    if scores.ndim == 2 and scores.shape[1] == 2:
        # Comparing a whole 2-element softmax row with a scalar cannot be used in `if`.
        scores = scores[:, 1]
    return [1 if score > threshold else 0 for score in scores.reshape(-1)]

In [None]:
# Example (English, French) sentence pairs for a quick manual check.
text_pairs = [("Poly Property Group Company Limited", 
               "Poly Property Group Company Limited"),
              
              ("Amazon also indicated that it has moved its AI plans from hype to reality.", 
               "Amazon a aussi indiqué que ses projets d’IA, jusqu’alors des mythes, étaient devenus réalité."),
              
              ("A regulation may not be made before the earliest of", 
               "Le règlement ne peut être pris avant le premier en date des jours suivants"),
             
              ("Nancy J. Kyle is a vice chairman and director of CGTC.",
               "Nancy J. Kyle est vice-présidente du conseil d’administration et")]

# `classify` is not defined anywhere in this notebook. `classify1` is the variant that
# takes a list of (source, target) tuples; it relies on the module-level `preprocessor`
# and on `model` being the token-input classifier from `build_model`.
classify1(text_pairs, threshold=0.5)

In [None]:
# Parallel string tensors: src_texts[i] is the English side of tgt_texts[i].
src_texts = tf.constant([ "Poly Property Group Company Limited",
                          "Amazon also indicated that it has moved its AI plans from hype to reality.", 
                          "A regulation may not be made before the earliest of",
                          "Nancy J. Kyle is a vice chairman and director of CGTC.",
                          "You can copy-paste"])

tgt_texts = tf.constant(["Poly Property Group Company Limited", 
                         "Amazon a aussi indiqué que ses projets d’IA, jusqu’alors des mythes, étaient devenus réalité.",
                         "Le règlement ne peut être pris avant le premier en date des jours suivants",
                         "Nancy J. Kyle est vice-présidente du conseil d’administration et",
                         "Il est possible de copier-coller"])

# Classify the pairs with the string-input model; the result is the cell output.
classify2([src_texts, tgt_texts])

In [None]:
# model2.predict([src_texts, tgt_texts])
# Print the layer-by-layer summary of the classifier.
model3.summary()

In [None]:
# Write a diagram of model3's layer graph, including tensor shapes, to a PNG file.
tf.keras.utils.plot_model(model3, "/linguistics/ethan/DL_Prototype/models/TQA_models/Multilingual-LaBSE-Bidirectional-LSTM_ckpts/training_job_7/LaBSE_bi-LSTM_based.softmax.png", show_shapes=True)