In [1]:
import tensorflow as tf
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np
import json
import re
import pathlib

from textblob import TextBlob
from sklearn.model_selection import train_test_split
from google.colab import drive, files #if use colab
from tensorflow.nn import relu, tanh, softmax
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.lite.python import interpreter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# # if use colab
# drive.mount('/content/drive')

In [3]:
#if use colab
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/bangkit-team/IOH-chat-app.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [4]:
filedir1 = "/content/IOH-chat-app/MachineLearning/datasets/translation/result/eng-ind.csv" # #if use colab
# filedir1 = "../../datasets/translate sentence/result/eng-ind.csv" #if use local env

In [5]:
df = pd.read_csv(filedir1)
df

Unnamed: 0,English,Indonesia
0,Run!,Lari!
1,Who?,Siapa?
2,Wow!,Wow!
3,Help!,Tolong!
4,Jump!,Lompat!
...,...,...
15359,Limitation of this capability causes opportuni...,Keterbatasan kemampuan ini menyebabkan tertutu...
15360,Subjective approach evaluates poverty based on...,Pendekatan subyektif menilai kemiskinan berdas...
15361,"Limited sufficiency and food quality , seen fr...","terbatasnya kecukupan dan mutu pangan , diliha..."
15362,Around 20 percents people with the lowest inco...,Sekitar 20 persen penduduk dengan tingkat pend...


In [6]:
start_mark = '<start>'
end_mark = '<end>'

In [7]:
class TranslatorDataset:
  def __init__(self, dataframe):
    self.dataframe = dataframe
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    df = self.dataframe

    input_lang = df.English.values
    target_lang = df.Indonesia.values

    return input_lang, target_lang

  def _normalize_and_preprocess(self, text, use_mark=False):
    if use_mark:
      text = text.lower().strip()
      text = " ".join([start_mark, text, end_mark])
    else:
      text = text.lower().strip()

    return text

  def _tokenize(self, sentences, num_words, maxlen):
    punctuation = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

    tokenizer = Tokenizer(num_words=num_words, filters=punctuation, lower=False)
    tokenizer.fit_on_texts(sentences)

    sequences = tokenizer.texts_to_sequences(sentences)
    sequences = pad_sequences(
      sequences, maxlen=maxlen, padding="post", truncating="post")

    return sequences, tokenizer

  def _create_dataset(self):
    input_lang, target_lang = self._load_data_from_file()

    input_sentence = np.array(
        list(map(lambda x: self._normalize_and_preprocess(x, False), input_lang)))
    
    target_sentence = np.array(
        list(map(lambda y: self._normalize_and_preprocess(y, True), target_lang)))
    
    return input_sentence, target_sentence

  def _load_dataset(self, num_words):
    input_lang, target_lang = self._create_dataset()

    self.maxlen = max([len(i)for i in input_lang]) // 5
    self.buffer_size = len(input_lang)

    input_sequences, input_tokenizer = self._tokenize(
        input_lang, num_words, self.maxlen)
    
    target_sequences, target_tokenizer = self._tokenize(
        target_lang, num_words, self.maxlen,)

    return (input_sequences, input_tokenizer), (target_sequences, target_tokenizer)
  
  def get(self, num_words, batch_size):
    input, target = self._load_dataset(num_words)

    input_sequences, self.input_tokenizer = input
    target_sequences, self.target_tokenizer = target

    dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
    dataset = dataset.shuffle(self.buffer_size).batch(batch_size, drop_remainder=True)
    dataset = dataset.cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return self.input_tokenizer, self.target_tokenizer, dataset

In [8]:
num_words = 15000
batch_size = 64

In [9]:
translator_dataset = TranslatorDataset(df)

input_tokenizer, target_tokenizer, dataset = translator_dataset.get(num_words, batch_size)

In [10]:
input_batch, target_batch = next(iter(dataset))

In [11]:
input_batch.shape, target_batch.shape

(TensorShape([64, 112]), TensorShape([64, 112]))

In [12]:
input_vocab_size = len(input_tokenizer.index_word) + 1
target_vocab_size = len(target_tokenizer.index_word) + 1
input_maxlen = input_batch.shape[1]
target_maxlen = target_batch.shape[1]

input_maxlen, target_maxlen, input_vocab_size, target_vocab_size

(112, 112, 12035, 13498)

In [13]:
input_wi_json = "input_word_index.json"

with open(input_wi_json, 'w') as f:
    json.dump(input_tokenizer.word_index, f)

# files.download(input_wi_json)

In [14]:
target_wi_json = "target_index_word.json"

with open(target_wi_json, 'w') as f:
    json.dump(target_tokenizer.index_word, f)

# files.download(target_wi_json)

In [15]:
input_example = input_batch[-1]
input_example

<tf.Tensor: shape=(112,), dtype=int32, numpy=
array([  74, 1240,    3, 7767,  465,  579,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int32)>

In [16]:
target_example = target_batch[-1]
target_example

<tf.Tensor: shape=(112,), dtype=int32, numpy=
array([   1,  108,  108,    7,   14, 1986,    9,  303,    2,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int32)>

In [17]:
input_sentence = input_tokenizer.sequences_to_texts([input_example.numpy()])
input_sentence

["i'm sick of conferences these days"]

In [18]:
target_sentence = target_tokenizer.sequences_to_texts([target_example.numpy()])
target_sentence

['<start> akhir akhir ini aku bosan dengan rapat <end>']

In [19]:
embed_dims = 256
units = 1024

In [20]:
class Encoder():
  def __init__(self, input_vocab_size, embedding_dims, units, batch_size):
    self.units = units
    self.batch_size = batch_size
    self.input_vocab_size = input_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.input_vocab_size, self.embedding_dims)
    self.lstm_layer = layers.LSTM(self.units,
                                 return_sequences=True,
                                 return_state=True,
                                 recurrent_initializer='glorot_uniform')
    
  def initialize_hidden_state(self):
    return [tf.zeros((self.batch_size, self.units)), tf.zeros((self.batch_size, self.units))]

  def call(self, inputs, hidden):
    embedding = self.embedding(inputs)
    encoder = self.lstm_layer(embedding, initial_state=hidden)

    return encoder

In [21]:
class BahdanauAttention(layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.w1 = layers.Dense(units, use_bias=True) 
    self.w2 = layers.Dense(units, use_bias=True) 
    self.fd = layers.Dense(1)

  def call(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    
    score = self.fd(tf.nn.tanh(
        self.w1(query_with_time_axis) + self.w2(values)))

    attention_weights = softmax(score, axis=1)

    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [22]:
class Decoder:
  def __init__(self, output_vocab_size, embedding_dims, units):
    self.units = units
    self.output_vocab_size = output_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.output_vocab_size, self.embedding_dims)
    self.lstm_layer = layers.LSTM(self.units,
                                  return_sequences=True,
                                  return_state=True,
                                  recurrent_initializer='glorot_uniform')
    self.attention = BahdanauAttention(self.units)
    self.dense1 = layers.Dense(self.units, activation=tanh)
    self.dropout = layers.Dropout(.5)
    self.dense2 = layers.Dense(self.output_vocab_size)

  def call(self, inputs, en_outputs, state):
    embedding = self.embedding(inputs)
    dec_outputs, dec_h_state, dec_c_state = self.lstm_layer(
        embedding, initial_state=state)
    
    context_vector, attention_weights = self.attention(
        query=dec_outputs, values=en_outputs)
    
    context_and_rnn_output = tf.concat([context_vector, dec_outputs], axis=-1)

    attention_vector = self.dense1(context_and_rnn_output)
    outputs = self.dropout(attention_vector)
    outputs = self.dense2(outputs)

    return outputs

In [23]:
encoder = Encoder(input_vocab_size, embed_dims, units, batch_size)

sample_en_hidden = encoder.initialize_hidden_state()
en_outputs, en_h_state, en_c_state = encoder.call(input_batch, sample_en_hidden)

en_outputs.shape, en_h_state.shape, en_c_state.shape

(TensorShape([64, 112, 1024]),
 TensorShape([64, 1024]),
 TensorShape([64, 1024]))

In [24]:
decoder = Decoder(target_vocab_size, embed_dims, units)
dec_outputs= decoder.call(target_batch, en_outputs, [en_h_state, en_c_state])

dec_outputs.shape

TensorShape([64, 112, 13498])

In [25]:
lr = 0.001
epochs = 30

optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [26]:
class TranslatorModel:
  def __init__(self, input_vocab_size, 
               target_vocab_size, 
               embed_dims, 
               units, 
               batch_size):
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.embed_dims = embed_dims
    self.units = units
    self.batch_size = batch_size

    self.encoder = Encoder(
        self.input_vocab_size, self.embed_dims, self.units, self.batch_size)
    
    self.decoder = Decoder(
        self.target_vocab_size, self.embed_dims, self.units)
  
  def build_model(self):
    en_inputs = layers.Input(shape=(None,))

    en_hidden = self.encoder.initialize_hidden_state()
    en_output, en_h_state, en_c_state = self.encoder.call(en_inputs, en_hidden)

    dec_outputs = self.decoder.call(en_inputs, en_output, [en_h_state, en_c_state])

    model = Model(inputs=[en_inputs], 
                  outputs=[dec_outputs])
    return model

In [27]:
translator_model = TranslatorModel(
    input_vocab_size,
    target_vocab_size,
    embed_dims,
    units,
    batch_size
)
model = translator_model.build_model()

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"]
)

In [28]:
checkpoint_path = "checkpoint/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3, 
    verbose=1)

callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    monitor='loss', 
    verbose=1, 
    save_weights_only=True, 
    save_best_only=True)

callbacks = [callback_early_stopping,
             callback_checkpoint]

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 256)    3080960     ['input_1[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 256)    3455488     ['input_1[0][0]']                
                                                                                                  
 lstm_2 (LSTM)                  [(64, None, 1024),   5246976     ['embedding_2[0][0]']            
                                 (64, 1024),                                                  

In [None]:
model.fit(dataset,
          epochs=epochs,
          callbacks=callbacks,
          verbose=1)

Epoch 1/30
Epoch 1: loss improved from inf to 1.07971, saving model to checkpoint/cp.ckpt
Epoch 2/30
Epoch 2: loss improved from 1.07971 to 0.90412, saving model to checkpoint/cp.ckpt
Epoch 3/30
Epoch 3: loss improved from 0.90412 to 0.85695, saving model to checkpoint/cp.ckpt
Epoch 4/30
Epoch 4: loss improved from 0.85695 to 0.82206, saving model to checkpoint/cp.ckpt
Epoch 5/30
Epoch 5: loss improved from 0.82206 to 0.79225, saving model to checkpoint/cp.ckpt
Epoch 6/30
Epoch 6: loss improved from 0.79225 to 0.76591, saving model to checkpoint/cp.ckpt
Epoch 7/30
Epoch 7: loss improved from 0.76591 to 0.73932, saving model to checkpoint/cp.ckpt
Epoch 8/30
Epoch 8: loss improved from 0.73932 to 0.71330, saving model to checkpoint/cp.ckpt
Epoch 9/30
Epoch 9: loss improved from 0.71330 to 0.68990, saving model to checkpoint/cp.ckpt
Epoch 10/30
Epoch 10: loss improved from 0.68990 to 0.66357, saving model to checkpoint/cp.ckpt
Epoch 11/30
Epoch 11: loss improved from 0.66357 to 0.63584, s

In [None]:
# if use colab
saved_model_path = "saved_model"

# if use local env
# saved_model_path = "code/translate sentence/saved_model"
saved_model_dir = os.path.dirname(saved_model_path)

if os.path.exists(saved_model_dir):
  shutil.rmtree(saved_model_dir)
  
model.save(saved_model_path)

files.download(saved_model_path)

In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
  tf.lite.OpsSet.TFLITE_BUILTINS,
  tf.lite.OpsSet.SELECT_TF_OPS
]
converter.experimental_new_converter = True
converter.allow_custom_ops = True

tflite_model = converter.convert()

tflite_model_file = pathlib.Path('translation.tflite')
tflite_model_file.write_bytes(tflite_model)

In [None]:
interpreter = tf.lite.Interpreter("translation.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

input_details, output_details

In [None]:
class Translator:
  def __init__(self, model_path, input_word_index, target_index_word, maxlen):
    self.input_word_index = input_word_index
    self.target_index_word = target_index_word
    self.maxlen = maxlen
    self.model_path = model_path

    self._load_model()
    self._load_vocab()

  def _load_model(self):
    self.model = tf.keras.models.load_model(self.model_path, compile=True)

  def _load_vocab(self):
    with open(self.input_word_index) as f:
      self.input_vocab = json.load(f)
    
    with open(self.target_index_word) as f:
      vocab = json.load(f)
      self.target_vocab = {int(k):v for k,v in vocab.items()}
      
  def _normalize_and_preprocess(self, text):
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    
    text = text.lower().strip()
    text = "".join((filter(lambda x: x not in punctuation, text)))

    return text

  def _texts_to_sequences(self, text):
    words = text.split(" ")
    sequences = list()

    for word in words:
      if word in self.input_vocab.keys():
        token = self.input_vocab[word]
        sequences.append(token)

    return sequences

  def _sequences_to_texts(self, sequences):
    words = list()

    for token in sequences:
      if token in self.target_vocab.keys():
        word = self.target_vocab[token]
        words.append(word)    

    return words  

  def lang_detector(self, sentence):
    lang = TextBlob(sentence)
    return lang
    
  def translate(self, sentence):
    index_prediction = list()
    normalize_sentence = self._normalize_and_preprocess(sentence)
    
    sequences = self._texts_to_sequences(normalize_sentence)
    sequences = pad_sequences(
        [sequences], maxlen=self.maxlen, padding="post", truncating="post")
    
    predictions = self.model.predict(sequences)

    for i in predictions[0]:
      index_prediction.append(np.argmax(i))

    marks = [start_mark, end_mark]
    result = self._sequences_to_texts(index_prediction)

    result = " ".join([word for word in result if word not in marks])

    return result

In [None]:
input_wi = "/content/input_word_index.json"
target_iw = "/content/target_index_word.json"

translator = Translator(
    encoder, decoder, input_tokenizer, target_tokenizer)

In [None]:
text_input = "i'm looking for"
lang_detector = translator.lang_detector(text_input)

if lang_detector == "en":
  translate = translator.translate(text_input)
  print(translate)
else:
  print("Bahasa tidak dikenali")