In [1]:
import os
import numpy as np
from model import TQC_Model
from preprocessing import TextPreprocess


def train(model, train_ds, train_labels, epochs, optimizer="adam", **hparams):

    model.compile(loss="binary_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])
    model.fit(train_ds, train_labels, epochs=epochs)

    return model


# hyper-parameters
srcLang = "eng"
tgtLang = "fra"
src_vocab_size = 20000
src_len = 150
tgt_vocab_size = 20000
tgt_len = 150

num_layers = 6  # the number of encoder layer for both source and target
d_model = 128   # dimension of word for both source and target
num_heads = 8   # the number of heads for both source and target
dff = 2048
maximum_position_encoding = 10000

batch_size = 128
epochs = 3
optimizer = "adam"

label_class_map = {"good": 1, "bad": 0}

rootpath = os.path.abspath("..")
train_data_dir = os.path.join(rootpath, "datasets/tqa/train")
test_data_dir = os.path.join(rootpath, "datasets/tqa/test")

In [2]:
# get data ready
print("------------------------------------------------------------")
print("Reading and preprocessing data.")
tp = TextPreprocess(src_vocab_size=src_vocab_size, src_len=src_len,
                    tgt_vocab_size=tgt_vocab_size, tgt_len=tgt_len)
src_integers, tgt_integers, labels = tp.create_datasets(train_data_dir, label_class_map, mode='train')
test_src_integers, test_tgt_integers, test_labels = tp.create_datasets(test_data_dir, label_class_map, mode='test')

------------------------------------------------------------
Reading and preprocessing data.

Importing Data
	1 pairs of good English-French files found.
	1 pairs of bad English-French files found.
Importing Data Complete.
	335073 good entries
	128875 bad entries
Creating vocabulary for training source and target texts...
Mapping texts into integer repsentations...

Importing Data
	1 pairs of good English-French files found.
	1 pairs of bad English-French files found.
Importing Data Complete.
	400 good entries
	400 bad entries
Mapping texts into integer repsentations...


In [3]:
labels = np.array(labels)
print(src_integers.shape)
print(tgt_integers.shape)
print(labels.shape)

(463948, 150)
(463948, 150)
(463948,)


In [None]:
# get model and start training
print("------------------------------------------------------------")
print("Initializing and training model.")
model = TQC_Model((src_len), (tgt_len),
                  num_layers, d_model, num_heads, dff,
                  src_vocab_size, tgt_vocab_size, maximum_position_encoding)

steps_per_epoch = int(src_integers.shape[0] / batch_size)

model.compile(loss="binary_crossentropy",
              optimizer=optimizer,
              metrics=["accuracy"])

model.fit(x=[src_integers, tgt_integers], 
          y=labels, 
          validation_split=0.1, 
          epochs=epochs,
          batch_size=batch_size,
          steps_per_epoch=steps_per_epoch)

------------------------------------------------------------
Initializing and training model.
Epoch 1/3
  85/3624 [..............................] - ETA: 4:34:34 - loss: 0.3511 - accuracy: 0.8643