In [18]:
!pip install tensorflowjs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
import tensorflow as tf
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np
import json
import zipfile

from google.colab import drive, files #if use colab
from tensorflow.nn import relu, tanh, softmax
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# if use colab
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
#if use colab
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/bangkit-team/IOH-chat-app.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [22]:
filedir1 = "/content/IOH-chat-app/MachineLearning/datasets/translation/result/eng-ind.csv" # #if use colab
# filedir2 = "/content/IOH-chat-app/MachineLearning/datasets/spam/emails.csv" # #if use colab
# filedir = "../../datasets/translate sentence/result/eng-ind.csv" #if use local env
# filedir = "../../datasets/spam/emails.csv" #if use local env

In [23]:
df = pd.read_csv(filedir1)
df

Unnamed: 0,English,Indonesia
0,Run!,Lari!
1,Who?,Siapa?
2,Wow!,Wow!
3,Help!,Tolong!
4,Jump!,Lompat!
...,...,...
8814,Every student who has graduated from our unive...,Semua mahasiswa yang telah menyelesaikan studi...
8815,"If you don't want to put on sunscreen, that's ...","Kalau kamu tidak mau pakai tabir surya, ya, te..."
8816,"When she was finished ironing, Mary switched o...","Ketika dia sudah selesai menyetrika, Mary mema..."
8817,"Irene Pepperberg, a researcher at Northwestern...","Irene Pepperberg, seorang peneliti di Universi..."


In [24]:
# df2 = pd.read_csv(filedir2)
# df2 = df2.rename(columns={"text": "English", "teks": "Indonesia"})
# df2 = df2.drop("spam", axis=1)
# df2

In [25]:
# df2_len = len(df2)
# df = pd.concat([df1, df2[:df2_len//2]])
# df

In [26]:
start_mark = '<start>'
end_mark = '<end>'

In [27]:
class TranslatorDataset:
  def __init__(self, dataframe):
    self.dataframe = dataframe
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    df = self.dataframe

    input_lang = df.English.values
    target_lang = df.Indonesia.values

    return input_lang, target_lang

  def _normalize_and_preprocess(self, text, use_mark=False):
    if use_mark:
      text = text.lower().strip()
      text = " ".join([start_mark, text, end_mark])
    else:
      text = text.lower().strip()

    return text

  def _tokenize(self, sentences, num_words, maxlen):
    punctuation = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

    tokenizer = Tokenizer(num_words=num_words, filters=punctuation, lower=False)
    tokenizer.fit_on_texts(sentences)

    sequences = tokenizer.texts_to_sequences(sentences)
    sequences = pad_sequences(sequences, maxlen=maxlen, padding="post")

    return sequences, tokenizer

  def _create_dataset(self):
    input_lang, target_lang = self._load_data_from_file()

    input_sentence = np.array(
        list(map(lambda x: self._normalize_and_preprocess(x, False), input_lang)))
    
    target_sentence = np.array(
        list(map(lambda y: self._normalize_and_preprocess(y, True), target_lang)))
    
    return input_sentence, target_sentence

  def _load_dataset(self, num_words):
    input_lang, target_lang = self._create_dataset()

    self.maxlen = max([len(i)for i in input_lang]) // 5
    self.buffer_size = len(input_lang)

    input_sequences, input_tokenizer = self._tokenize(
        input_lang, num_words, self.maxlen)
    
    target_sequences, target_tokenizer = self._tokenize(
        target_lang, num_words, self.maxlen,)

    return (input_sequences, input_tokenizer), (target_sequences, target_tokenizer)
  
  def get(self, num_words, batch_size):
    input, target = self._load_dataset(num_words)

    input_sequences, self.input_tokenizer = input
    target_sequences, self.target_tokenizer = target

    dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
    dataset = dataset.shuffle(self.buffer_size).batch(batch_size, drop_remainder=True)

    return self.input_tokenizer, self.target_tokenizer, dataset

In [28]:
num_words = 8000
batch_size = 64

In [29]:
translator_dataset = TranslatorDataset(df)
input_tokenizer, target_tokenizer, dataset = translator_dataset.get(num_words, batch_size)

In [30]:
input_batch, target_batch = next(iter(dataset))

In [31]:
input_batch.shape, target_batch.shape

(TensorShape([64, 32]), TensorShape([64, 32]))

In [32]:
input_vocab_size = len(input_tokenizer.index_word) + 1
target_vocab_size = len(target_tokenizer.index_word) + 1
input_maxlen = input_batch.shape[1]
target_maxlen = target_batch.shape[1]

input_maxlen, target_maxlen, input_vocab_size, target_vocab_size

(32, 32, 4091, 4876)

In [33]:
input_wi_json = "input_word_index.json"

with open(input_wi_json, 'w') as f:
    json.dump(input_tokenizer.word_index, f)

files.download(input_wi_json)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
target_iw_json = "target_index_word.json"

with open(target_iw_json, 'w') as f:
    json.dump(target_tokenizer.index_word, f)

files.download(target_iw_json)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
input_example = input_batch[-1]
input_example

<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([  52,   22,    1, 2467,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int32)>

In [36]:
target_example = target_batch[-1]
target_example

<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([  1,   7, 378,  18,  37,   2,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0], dtype=int32)>

In [37]:
input_sentence = input_tokenizer.sequences_to_texts([input_example.numpy()])
input_sentence

['where are you bound']

In [38]:
target_sentence = target_tokenizer.sequences_to_texts([target_example.numpy()])
target_sentence

['<start> kamu menuju ke mana <end>']

In [39]:
embed_dims = 128
units = 512

In [119]:
class Encoder(layers.Layer):
  def __init__(self, input_vocab_size, embedding_dims, units):
    super(Encoder, self).__init__()
    self.units = units
    self.input_vocab_size = input_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.input_vocab_size, self.embedding_dims)
    self.lstm_layer = layers.LSTM(self.units,
                                 return_sequences=True,
                                 return_state=True,
                                 recurrent_initializer='glorot_uniform')

  def call(self, inputs):
    embedding = self.embedding(inputs)
    encoder = self.lstm_layer(embedding, initial_state=None)

    return encoder

In [120]:
# class BahdanauAttention(layers.Layer):
#   def __init__(self, units):
#     super(BahdanauAttention, self).__init__()
#     self.w1 = layers.Dense(units, use_bias=True) 
#     self.w2 = layers.Dense(units, use_bias=True) 
#     self.fd = layers.Dense(1)

#   def call(self, query, values):
#     query_with_time_axis = tf.expand_dims(query, 1)
    
#     score = self.fd(tf.nn.tanh(
#         self.w1(query_with_time_axis) + self.w2(values)))

#     attention_weights = softmax(score, axis=1)

#     context_vector = attention_weights * values
#     context_vector = tf.reduce_sum(context_vector, axis=1)

#     return context_vector, attention_weights

In [133]:
class Decoder(layers.Layer):
  def __init__(self, output_vocab_size, embedding_dims, units):
    super(Decoder, self).__init__()
    self.units = units
    self.output_vocab_size = output_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.output_vocab_size, self.embedding_dims)
    self.lstm_layer = layers.LSTM(self.units,
                                 return_sequences=True,
                                 return_state=True,
                                 recurrent_initializer='glorot_uniform')
    self.attention = layers.AdditiveAttention()
    self.dense1 = layers.Dense(self.units, activation=tanh)
    self.dense2 = layers.Dense(self.output_vocab_size)

  def call(self, inputs, en_outputs, state):
    embedding = self.embedding(inputs)
    dec_outputs, dec_h_state, dec_c_state = self.lstm_layer(embedding, initial_state=state)
    query_value_attention_seq = self.attention([dec_outputs, en_outputs])

    attention_vector = self.dense1(query_value_attention_seq)
    outputs = self.dense2(attention_vector)

    return outputs

In [134]:
encoder = Encoder(input_vocab_size, embed_dims, units)
en_outputs, en_h_state, en_c_state = encoder(input_batch)

en_outputs.shape, en_h_state.shape, en_c_state.shape

(TensorShape([64, 32, 512]), TensorShape([64, 512]), TensorShape([64, 512]))

In [135]:
decoder = Decoder(target_vocab_size, embed_dims, units)
dec_outputs= decoder(target_batch, en_outputs, [en_h_state, en_c_state])

dec_outputs.shape

TensorShape([64, 32, 4876])

In [136]:
lr = 1e-3
epochs = 30

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [137]:
class TranslatorModel:
  def __init__(self, input_vocab_size, 
               target_vocab_size, 
               embed_dims, 
               units):
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.embed_dims = embed_dims
    self.units = units

    self.encoder = Encoder(self.input_vocab_size, self.embed_dims, self.units)
    self.decoder = Decoder(self.target_vocab_size, self.embed_dims, self.units)

  def get_encoder(self):
    return self.encoder

  def get_decoder(self):
    return self.decoder
  
  def build_model(self):
    en_inputs = layers.Input(shape=(None,))
    en_output, en_h_state, en_c_state = self.encoder.call(en_inputs)

    dec_outputs = self.decoder.call(en_inputs, en_output, [en_h_state, en_c_state])

    model = Model(inputs=[en_inputs], 
                  outputs=[dec_outputs])
    return model

In [138]:
translator_model = TranslatorModel(
    input_vocab_size,
    target_vocab_size,
    embed_dims,
    units,
)
model = translator_model.build_model()

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"]
)

In [139]:
checkpoint_path = "/content/drive/MyDrive/translate/checkpoint/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3, 
    verbose=1)

callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    monitor='loss', 
    verbose=1, 
    save_freq=10,
    save_weights_only=True, 
    save_best_only=True)

callbacks = [callback_early_stopping,
             callback_checkpoint]

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"]
)

model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_34 (Embedding)       (None, None, 128)    523648      ['input_7[0][0]']                
                                                                                                  
 embedding_35 (Embedding)       (None, None, 128)    624128      ['input_7[0][0]']                
                                                                                                  
 lstm_34 (LSTM)                 [(None, None, 512),  1312768     ['embedding_34[0][0]']           
                                 (None, 512),                                               

In [None]:
model.fit(dataset,
          epochs=epochs,
          callbacks=callbacks,
          verbose=1)

Epoch 1/30
  8/137 [>.............................] - ETA: 6s - loss: 5.4103 - accuracy: 0.6783
Epoch 1: loss improved from inf to 4.75472, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0001.ckpt
 19/137 [===>..........................] - ETA: 7s - loss: 3.4325 - accuracy: 0.7303
Epoch 1: loss improved from 4.75472 to 3.35112, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0001.ckpt
 29/137 [=====>........................] - ETA: 7s - loss: 2.8611 - accuracy: 0.7454
Epoch 1: loss improved from 3.35112 to 2.82394, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0001.ckpt
Epoch 1: loss improved from 2.82394 to 2.51358, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0001.ckpt
Epoch 1: loss improved from 2.51358 to 2.30361, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0001.ckpt
Epoch 1: loss improved from 2.30361 to 2.15988, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0001.ckpt
Epoch 1: los

In [None]:
# if use colab
saved_model_path = "/content/drive/MyDrive/translate/saved_model/translator.h5"

# if use local env
# saved_model_path = "code/translate sentence/saved_model"
saved_model_dir = os.path.dirname(saved_model_path)

if os.path.exists(saved_model_dir):
  shutil.rmtree(saved_model_dir)
model.save(saved_model_path)

In [None]:
class Translator:
  def __init__(self, model_path, input_tokenizer, target_tokenizer, maxlen):
    self.input_tokenizer = input_tokenizer
    self.target_tokenizer = target_tokenizer
    self.maxlen = maxlen
    self.model_path = model_path

    self._load_model()

  def _load_model(self):
    self.model = tf.keras.models.load_model(self.model_path)

  def _normalize_and_preprocess(self, text):
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    
    text = text.lower().strip()
    text = text.replace(punctuation, "")

    return text
    
  def translate(self, sentence):
    normalize_sentence = self._normalize_and_preprocess(sentence)

    sequences = input_tokenizer.texts_to_sequences([normalize_sentence])
    sequences = pad_sequences(sequences, maxlen=self.maxlen, padding="post")
    
    predictions = self.model.predict(sequences)

    index_prediction = list()

    for i in predictions[0]:
      index_prediction.append(np.argmax(i))

    marks = [start_mark, end_mark]
    result = target_tokenizer.sequences_to_texts([index_prediction])[0]
    result = " ".join([word for word in result.split(" ") if word not in marks])

    return result

In [None]:
translator = Translator(
    saved_model_path,
    input_tokenizer, 
    target_tokenizer,
    input_maxlen,
)

translate = translator.translate("how are you?")
translate

In [116]:
tfjs_path = "/content/IOH-chat-app/MachineLearning/code/translation/tfjs_model"

!tensorflowjs_converter \
  --input_format=keras \
  {saved_model_path} \
  {tfjs_path}

In [117]:
shutil.make_archive("tfjs_model", 'zip', tfjs_path)

'/content/tfjs_model.zip'

In [118]:
files.download("tfjs_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>