In [1]:
import tensorflow as tf
import numpy as np
from collections import Counter
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re, string, os
import random

# Force Keras to use CPU.
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

def read_text(file):
    with open(file, "r") as f:
        lines = f.read().splitlines()

    return lines


class TextPreprocess(tf.keras.layers.Layer):

    def __init__(self, srcLang="eng", tgtLang="fra",
                       src_vocab_size=20000, src_len=200,
                       tgt_vocab_size=20000, tgt_len=200):
        super(TextPreprocess, self).__init__()
        # self.batch_size = batch_size
        self.srcLang = srcLang
        self.tgtLang = tgtLang
        self.src_vocab_size = src_vocab_size
        self.src_len = src_len
        self.tgt_vocab_size = tgt_vocab_size
        self.tgt_len = tgt_len
        self.src_text_vectorizer = TextVectorization(standardize=self.custom_standardization,
                                                     max_tokens=self.src_vocab_size,
                                                     output_mode="int",
                                                     output_sequence_length=self.src_len)
        self.tgt_text_vectorizer = TextVectorization(standardize=self.custom_standardization,
                                                     max_tokens=self.tgt_vocab_size,
                                                     output_mode="int",
                                                     output_sequence_length=self.tgt_len)

    def suffle_data(self, src_lines, tgt_lines, labels, random_state=1):
        """Shuffle data mainly for training data."""
        random.seed(random_state)
        random.shuffle(src_lines)
        random.seed(random_state)
        random.shuffle(tgt_lines)
        random.seed(random_state)
        random.shuffle(labels)

        return src_lines, tgt_lines, labels

    def read_dataset_from_directory(self, data_dir, label_class_map, shuffle=True):
        """Read TQA data from directory where the label is indicated in file name."""
        print("\nImporting Data")
        files = os.listdir(data_dir)
        good_src_prefix = [file.replace(".good." + self.srcLang, "") for file in files if file.endswith(".good." + self.srcLang)]
        good_tgt_prefix = [file.replace(".good." + self.tgtLang, "") for file in files if file.endswith(".good." + self.tgtLang)]
        bad_src_prefix = [file.replace(".bad." + self.srcLang, "") for file in files if file.endswith(".bad." + self.srcLang)]
        bad_tgt_prefix = [file.replace(".bad." + self.tgtLang, "") for file in files if file.endswith(".bad." + self.tgtLang)]

        assert set(good_src_prefix) == set(good_tgt_prefix), \
            "The number of good English and French file pairs not equal."

        assert set(bad_src_prefix) == set(bad_tgt_prefix), \
            "The number of bad English and French file pairs not equal."

        print("\t{} pairs of good English-French files found.".format(len(good_src_prefix)))
        print("\t{} pairs of bad English-French files found.".format(len(bad_src_prefix)))

        all_prefix_by_class = [prefix + ".good" for prefix in good_src_prefix] + \
                              [prefix + ".bad" for prefix in bad_src_prefix]
        src_lines = []
        tgt_lines = []
        labels = []

        for prefix in all_prefix_by_class:

            label = prefix.split(".")[-1]
            en_path = os.path.join(data_dir, prefix + "." + self.srcLang)
            fr_path = os.path.join(data_dir, prefix + "." + self.tgtLang)
            g_en_lines = read_text(en_path)
            g_fr_lines = read_text(fr_path)

            if len(g_en_lines) == len(g_fr_lines):
                class_num = label_class_map.get(label)
                src_lines += g_en_lines
                tgt_lines += g_fr_lines
                labels += [class_num] * len(g_en_lines)

        if shuffle:
            src_lines, tgt_lines, labels = self.suffle_data(src_lines, tgt_lines, labels)
        counter = Counter(labels)
        print("Importing Data Complete.")
        print("\t{} good entries".format(counter[label_class_map["good"]]))
        print("\t{} bad entries".format(counter[label_class_map["bad"]]))

        src_lines = np.array(src_lines).reshape(len(src_lines), 1)
        tgt_lines = np.array(tgt_lines).reshape(len(tgt_lines), 1)
        labels = np.array(labels).reshape(len(labels), 1)

        return src_lines, tgt_lines, labels

    def custom_standardization(self, input_data):
        """Customized manipulations on raw text."""
        lowercase = tf.strings.lower(input_data)
        stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
        return tf.strings.regex_replace(
            stripped_html, "[%s]" % re.escape(string.punctuation), ""
        )

    def create_integer_ds(self, src_text, tgt_text):
        # src_text, tgt_text = samples

        src_int_samples = self.src_text_vectorizer(src_text)
        tgt_int_samples = self.tgt_text_vectorizer(tgt_text)

        return src_int_samples, tgt_int_samples

    def create_datasets(self, data_dir, label_class_map, mode="train", batch_size=32):
        """Create datasets used for training and testing from local files"""

        src_lines, tgt_lines, labels = self.read_dataset_from_directory(data_dir, label_class_map, shuffle=True)

        # labels = tf.data.Dataset.from_tensor_slices((labels)).batch(batch_size)
        if mode == "train":

            # train_src_text = dataset.map(lambda src, tgt: src)
            # train_tgt_text = dataset.map(lambda src, tgt: tgt)
            # print("Creating vocabulary for training source texts...")
            # self.src_text_vectorizer.adapt(train_src_text)
            # print("Creating vocabulary for training target texts...")
            # self.tgt_text_vectorizer.adapt(train_tgt_text)
            # dataset = dataset.map(self.create_integer_ds)
            print("Creating vocabulary for training source and target texts...")
            self.src_text_vectorizer.adapt(src_lines)
            self.tgt_text_vectorizer.adapt(tgt_lines)
            print("Mapping texts into integer repsentations...")
            src_integers, tgt_integers = self.create_integer_ds(src_lines, tgt_lines)
            # print(src_integers.shape)
            # print(labels.shape)
            # dataset = tf.data.Dataset.from_tensor_slices(({"input_1": src_integers, "input_2": tgt_integers},
            #                                               labels)).batch(batch_size)

        elif mode == "test":

            print("Mapping texts into integer repsentations...")
            src_integers, tgt_integers = self.create_integer_ds(src_lines, tgt_lines)
            # dataset = tf.data.Dataset.from_tensor_slices(({"input_1": src_integers, "input_2": tgt_integers},
            #                                               labels)).batch(batch_size)
            # test_ds = tf.data.Dataset.from_tensor_slices(([src_lines, tgt_lines], labels)).batch(batch_size)
            # dataset = dataset.map(self.create_integer_ds)

        else:
            raise ValueError("Please select mode between 'train' and 'test'.")

        return src_integers, tgt_integers, labels

In [2]:
import os
import numpy as np
from model import TQC_Model
# from preprocessing import TextPreprocess


def train(model, train_ds, train_labels, epochs, optimizer="adam", **hparams):

    model.compile(loss="binary_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])
    model.fit(train_ds, train_labels, epochs=epochs)

    return model


# hyper-parameters
srcLang = "eng"
tgtLang = "fra"
src_vocab_size = 20000
src_len = 150
tgt_vocab_size = 20000
tgt_len = 150

num_layers = 6  # the number of encoder layer for both source and target
d_model = 128   # dimension of word for both source and target
num_heads = 8   # the number of heads for both source and target
dff = 2048
maximum_position_encoding = 10000

batch_size = 128
epochs = 3
optimizer = "adam"

label_class_map = {"good": 1, "bad": 0}

rootpath = os.path.abspath("..")
train_data_dir = os.path.join(rootpath, "tqa/train")
test_data_dir = os.path.join(rootpath, "tqa/test")

In [3]:
# get data ready
print("------------------------------------------------------------")
print("Reading and preprocessing data.")

# src_lines = np.array(src_lines)
# tgt_lines = np.array(tgt_lines)
# labels = np.array(labels)

tp = TextPreprocess(src_vocab_size=src_vocab_size, src_len=src_len,
                    tgt_vocab_size=tgt_vocab_size, tgt_len=tgt_len)
src_integers, tgt_integers, labels = tp.create_datasets(train_data_dir, label_class_map, mode='train')
test_src_integers, test_tgt_integers, test_labels = tp.create_datasets(test_data_dir, label_class_map, mode='test')
labels = np.array(labels)
test_labels = np.array(test_labels)

------------------------------------------------------------
Reading and preprocessing data.

Importing Data
	1 pairs of good English-French files found.
	1 pairs of bad English-French files found.
Importing Data Complete.
	335073 good entries
	128875 bad entries


TypeError: 'tuple' object is not callable

In [3]:

print(src_integers.shape)
print(tgt_integers.shape)
print(labels.shape)

(463948, 150)
(463948, 150)
(463948,)


In [None]:
# get model and start training
print("------------------------------------------------------------")
print("Initializing and training model.")

strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

with strategy.scope():
    
    # Everything that creates variables should be under the strategy scope.
    # In general this is only model construction & `compile()`.
    model = TQC_Model((src_len), (tgt_len),
                  num_layers, d_model, num_heads, dff,
                  src_vocab_size, tgt_vocab_size, maximum_position_encoding)
    
    model.compile(loss="binary_crossentropy",
              optimizer=optimizer,
              metrics=["accuracy"])


steps_per_epoch = int(src_integers.shape[0] / batch_size)

model.fit(x=[src_integers, tgt_integers], 
          y=labels, 
          validation_split=0.1, 
          epochs=epochs,
          batch_size=batch_size,
          steps_per_epoch=steps_per_epoch)

------------------------------------------------------------
Initializing and training model.
Epoch 1/3
   1/3624 [..............................] - ETA: 25:44:33 - loss: 2.1272 - accuracy: 0.3125

In [9]:
# import tensorflow as tf
# c = []
# gpus = tf.config.experimental.list_logical_devices('GPU')

# with tf.device("/gpu:0"):
#     a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
#     b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2])
#     c.append(tf.matmul(a, b))
# with tf.device('/cpu:0'):
#     sum = tf.add_n(c)
# # Creates a session with log_device_placement set to True.
# sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# # Runs the op.
# print(sess.run(sum))

# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Number of devices: 1
