In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

!pip install --ignore-installed --user --upgrade tensorflow

In [2]:
tf.__version__

'2.10.1'

In [3]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        print("Found a GPU with the name:", gpu.name)
else:
    print("Failed to detect a GPU.")

Found a GPU with the name: /physical_device:GPU:0


In [4]:
data= pd.read_csv(r"C:\Users\Arvind Vasa\Downloads\jpn-eng\jpn.txt", delimiter='\t', header=None)

In [5]:
data= data.iloc[:,:-1]

In [6]:
data.head()

Unnamed: 0,0,1
0,Go.,行け。
1,Go.,行きなさい。
2,Hi.,こんにちは。
3,Hi.,もしもし。
4,Hi.,やっほー。


In [7]:
data.shape

(106516, 2)

In [8]:
data.rename(columns= {0:'English', 1:'Japanese'}, inplace= True)

In [9]:
data.head()

Unnamed: 0,English,Japanese
0,Go.,行け。
1,Go.,行きなさい。
2,Hi.,こんにちは。
3,Hi.,もしもし。
4,Hi.,やっほー。


In [10]:
data['English']= data['English'].astype(str)
data['Japanese']= data['Japanese'].astype(str)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106516 entries, 0 to 106515
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   English   106516 non-null  object
 1   Japanese  106516 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


In [12]:
from string import punctuation
import re

def clean_string(string):
    # Replace no-break space with space
    string = string.replace("\u202f"," ")
    # Converts all uppercase characters into lowercase characters
    string = string.lower()

    # Delete the punctuation and the numbers
    for p in punctuation + "«»" + "0123456789":
        string = string.replace(p," ")

    # Eliminate duplicate whitespaces using wildcards
    string = re.sub("\s+"," ", string)
    # Remove spaces at the beginning and at the end of the string
    string = string.strip()

    return string

In [13]:
data['English'] = data['English'].apply(lambda x: clean_string(x))
data['Japanese'] = data['Japanese'].apply(lambda x: clean_string(x))

In [14]:
# data= data.iloc[:15000, :]

In [15]:
data.shape

(106516, 2)

In [16]:
data.head()

Unnamed: 0,English,Japanese
0,go,行け。
1,go,行きなさい。
2,hi,こんにちは。
3,hi,もしもし。
4,hi,やっほー。


In [128]:
source_sentences= data['English'].values
target_sentences= data['Japanese'].values

In [134]:
train_data_split= int(source_sentences.shape[0] * 0.1 )
train_data_split

10651

In [18]:
source_sentences, source_val_sentences= source_sentences[train_data_split:], source_sentences[:train_data_split]
target_sentences, target_val_sentences= target_sentences[train_data_split:], target_sentences[:train_data_split]

In [19]:
def tag_target_sentences(sentences):
  tagged_sentences = map(lambda s: (' ').join(['<sos>', s, '<eos>']), sentences)
  return list(tagged_sentences)

In [20]:
target_sentences= tag_target_sentences(target_sentences)

In [21]:
target_sentences[:5]

['<sos> 紙が必要だ。 <eos>',
 '<sos> 家に帰らなくちゃ。 <eos>',
 '<sos> 家に帰らなきゃ。 <eos>',
 '<sos> 温かい水が欲しい。 <eos>',
 '<sos> 黙ってうなずきました。 <eos>']

In [22]:
len(target_sentences)

96516

### Tokenization

In [23]:
# SOURCE Tokenization
source_tokenizer= tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>", filters="#$%&()*+,-./:;=@[\\]^_`{|}~\t\n")
source_tokenizer.fit_on_texts(source_sentences)

In [24]:
train_encoder_inputs = source_tokenizer.texts_to_sequences(source_sentences)

In [25]:
source_vocab_size= len(source_tokenizer.word_index) + 1
source_vocab_size

11191

In [26]:
print(train_encoder_inputs[60:65])
print(source_tokenizer.sequences_to_texts(train_encoder_inputs[60:65]))

[[2, 169, 45, 93], [2, 169, 73, 99], [2, 231, 4302], [2, 231, 4302], [2, 231, 6, 195, 111]]
['i thought as much', 'i thought so too', 'i took highway', 'i took highway', 'i took a week off']


In [27]:
# TARGET Tokenization
target_tokenizer= tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>", filters="#$%&()*+,-./:;=@[\\]^_`{|}~\t\n")
target_tokenizer.fit_on_texts(target_sentences)

In [28]:
target_vocab_size= len(target_tokenizer.word_index) + 1
target_vocab_size

82357

In [29]:
def generate_decoder_inputs_targets(sequences, tokenizer):
    seqs= tokenizer.texts_to_sequences(sequences)
    decoder_inputs= [s[:-1] for s in seqs] # Drops last token in sequence; eg = <sos> Hi I am Arvind
    decoder_outputs= [s[1:] for s in seqs] # Drops first token in sequence; eg = Hi I am Arvind <eos>

    return decoder_inputs, decoder_outputs

In [30]:
train_decoder_inputs, train_decoder_outputs= generate_decoder_inputs_targets(target_sentences, target_tokenizer)

In [31]:
# decoder inputs 
target_tokenizer.sequences_to_texts(train_decoder_inputs[:5])

['<sos> 紙が必要だ。',
 '<sos> 家に帰らなくちゃ。',
 '<sos> 家に帰らなきゃ。',
 '<sos> 温かい水が欲しい。',
 '<sos> 黙ってうなずきました。']

In [32]:
# decoder outputs
target_tokenizer.sequences_to_texts(train_decoder_outputs[:5])

['紙が必要だ。 <eos>',
 '家に帰らなくちゃ。 <eos>',
 '家に帰らなきゃ。 <eos>',
 '温かい水が欲しい。 <eos>',
 '黙ってうなずきました。 <eos>']

In [33]:
max_encoding_len= len(max(train_encoder_inputs, key= len))
max_encoding_len

59

In [34]:
max_decoding_len= len(max(train_decoder_inputs, key= len))
max_decoding_len

9

### Padding

In [35]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_train_encoder_inputs= pad_sequences(train_encoder_inputs, maxlen= max_encoding_len, padding= "post", truncating= "post")
padded_train_decoder_inputs= pad_sequences(train_decoder_inputs, maxlen= max_decoding_len, padding= "post", truncating= "post")
padded_train_decoder_outputs= pad_sequences(train_decoder_outputs, maxlen= max_decoding_len, padding= "post", truncating= "post")

In [36]:
print(padded_train_encoder_inputs[10])
print(padded_train_decoder_inputs[10])
print(padded_train_decoder_outputs[10])

[   2 2375   68  560    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0]
[    2 11393     0     0     0     0     0     0     0]
[11393     3     0     0     0     0     0     0     0]


In [37]:
# 0 is considered as OOV and given <unk> value
source_tokenizer.sequences_to_texts([padded_train_encoder_inputs[80000]])

['don t count your chickens before they hatch <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>']

### validation data preparation

In [38]:
def process_dataset(preprocessed_input, preprocessed_output):
    
    tagged_preprocessed_output = tag_target_sentences(preprocessed_output)
    # Vectorize encoder source sentences.
    encoder_inputs = source_tokenizer.texts_to_sequences(preprocessed_input)
    # Vectorize and create decoder input and target sentences.
    decoder_inputs, decoder_targets = generate_decoder_inputs_targets(tagged_preprocessed_output, 
                                                                    target_tokenizer)
  
    # Pad all collections.
    padded_encoder_inputs = pad_sequences(encoder_inputs, max_encoding_len, padding='post', truncating='post')
    padded_decoder_inputs = pad_sequences(decoder_inputs, max_decoding_len, padding='post', truncating='post')
    padded_decoder_targets = pad_sequences(decoder_targets, max_decoding_len, padding='post', truncating='post')

    return padded_encoder_inputs, padded_decoder_inputs, padded_decoder_targets


In [39]:
# Process validation dataset
padded_val_encoder_inputs, padded_val_decoder_inputs, padded_val_decoder_targets= process_dataset(source_val_sentences, target_val_sentences)

In [41]:
target_tokenizer.sequences_to_texts([padded_val_decoder_inputs[1780]])

['<sos> 始めていい？ <unk> <unk> <unk> <unk> <unk> <unk> <unk>']

### Building Translation Model

In [42]:
embedding_dim= 128
hidden_dim= 256
default_dropout= 0.2
batch_size= 32
epochs= 30

In [76]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense

encoder_inputs= Input(shape= [None], name= 'encoder_inputs')
#  mask_zero: Boolean, whether or not the input value 0 is a special
encoder_embeddings= Embedding(input_dim= source_vocab_size, output_dim= embedding_dim, mask_zero= True, name= "encoder_embeddings")
encoder_embedding_outputs= encoder_embeddings(encoder_inputs)

encoder_lstm= LSTM(units= hidden_dim, return_state= True, dropout= default_dropout, name= "encoder_lstm")
# since return sequences is false: the value of encoder_outputs is same as state_h, 
# if true, all y_hat values of every time stamp is returned to encoder_outputs
encoder_outputs, state_h, state_c= encoder_lstm(encoder_embedding_outputs)

encoder_states= (state_h, state_c)


In [44]:
decoder_inputs= Input(shape= [None], name= "decoder_inputs")
decoder_embeddings= Embedding(input_dim= target_vocab_size, output_dim= embedding_dim, mask_zero= True, name= "decoder_embeddings")
decoder_embedding_outputs= decoder_embeddings(decoder_inputs)

decoder_lstm= LSTM(units= hidden_dim, return_sequences=True ,return_state= True, dropout= default_dropout, name= "decoder_lstm")

decoder_outputs, _, _= decoder_lstm(decoder_embedding_outputs, initial_state= encoder_states)

decoder_dense= Dense(target_vocab_size, activation='softmax', name="decoder_dense")

y_proba= decoder_dense(decoder_outputs)

In [45]:
# Note how the model is taking two inputs in an array.
model = tf.keras.Model([encoder_inputs, decoder_inputs], y_proba, name='eng_jap_seq2seq_nmt_no_attention')

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',  metrics=['accuracy'])
model.summary()

Model: "eng_jap_seq2seq_nmt_no_attention"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder_embeddings (Embedding)  (None, None, 128)   1432448     ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_embeddings (Embedding)  (None, None, 128)   10541696    ['decoder_inputs[0][0]']         
                                                                   

In [46]:
from keras.utils.vis_utils import plot_model

plot_model(model, to_file='eng_jap_seq2seq_nmt_no_attention.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [47]:
# Saving this to a folder on my local machine.
filepath="./HunEngNMTNoAttention/training1/cp.ckpt"

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                                 save_weights_only=True,
                                                 verbose=1)

In [48]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

history = model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs], padded_train_decoder_outputs,
                     batch_size=batch_size,
                     epochs=3,
                     validation_data=([padded_val_encoder_inputs, padded_val_decoder_inputs], padded_val_decoder_targets),
                     callbacks=[cp_callback, es_callback])

Epoch 1/3
Epoch 1: saving model to ./HunEngNMTNoAttention/training1\cp.ckpt
Epoch 2/3
Epoch 2: saving model to ./HunEngNMTNoAttention/training1\cp.ckpt
Epoch 3/3
Epoch 3: saving model to ./HunEngNMTNoAttention/training1\cp.ckpt


### Saving Model and Tokenizers

In [84]:
# Saving Model
from keras.models import load_model

# Save model to HDF5 format
model.save('artifacts/eng_jap_seq2seq_nmt_no_attention.h5')

# Load model from HDF5 format
loaded_model = load_model('artifacts/eng_jap_seq2seq_nmt_no_attention.h5')


In [87]:
# loaded_model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs], padded_train_decoder_outputs,
#                      batch_size=batch_size,
#                      epochs=3,
#                      validation_data=([padded_val_encoder_inputs, padded_val_decoder_inputs], padded_val_decoder_targets),
#                      callbacks=[cp_callback, es_callback])

In [79]:
# Saving Tokenizer
import io
import json


##### Save the tokenizers as JSON files. The resulting files can be downloaded by left-clicking on them.
source_tokenizer_json = source_tokenizer.to_json()
with io.open('artifacts/source_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(source_tokenizer_json, ensure_ascii=False))

target_tokenizer_json = target_tokenizer.to_json()
with io.open('artifacts/target_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(target_tokenizer_json, ensure_ascii=False))

In [82]:
# Reading Tokenizer
with io.open('artifacts/source_tokenizer.json', 'r', encoding='utf-8') as f:
    # Read the JSON data
    st= json.load(f)
    st = tf.keras.preprocessing.text.tokenizer_from_json(st)

### Model Evaluation

In [50]:
# Evaluate the model on the test set.
model.evaluate([padded_val_encoder_inputs, padded_val_decoder_inputs], padded_val_decoder_targets)



[1.5448741912841797, 0.4994009733200073]

In [96]:
[layer.name for layer in model.layers]

['encoder_inputs',
 'decoder_inputs',
 'encoder_embeddings',
 'decoder_embeddings',
 'encoder_lstm',
 'decoder_lstm',
 'decoder_dense']

In [97]:
encoder_inputs = model.get_layer('encoder_inputs').input

encoder_embedding_layer = model.get_layer('encoder_embeddings')
encoder_embeddings = encoder_embedding_layer(encoder_inputs)

encoder_lstm = model.get_layer('encoder_lstm')

_, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embeddings)

encoder_states = [encoder_state_h, encoder_state_c]

encoder_model_no_attention = tf.keras.Model(encoder_inputs, encoder_states)

In [98]:
decoder_inputs = model.get_layer('decoder_inputs').input

decoder_embedding_layer = model.get_layer('decoder_embeddings')
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Inputs to represent the decoder's LSTM hidden and cell states. We'll populate 
# these manually using the encoder's output for the initial state.
decoder_input_state_h = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_h')
decoder_input_state_c = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_c')
decoder_input_states = [decoder_input_state_h, decoder_input_state_c]

decoder_lstm = model.get_layer('decoder_lstm')

decoder_sequence_outputs, decoder_output_state_h, decoder_output_state_c = decoder_lstm(
    decoder_embeddings, initial_state=decoder_input_states
)

# Update hidden and cell states for the next time step.
decoder_output_states = [decoder_output_state_h, decoder_output_state_c]

decoder_dense = model.get_layer('decoder_dense')
y_proba = decoder_dense(decoder_sequence_outputs)

decoder_model_no_attention = tf.keras.Model(
    [decoder_inputs] + decoder_input_states, 
    [y_proba] + decoder_output_states
) 

In [99]:
def translate_without_attention(sentence: str, 
                                source_tokenizer, encoder,
                                target_tokenizer, decoder,
                                max_translated_len = 30):

  # Vectorize the source sentence and run it through the encoder.    
  input_seq = source_tokenizer.texts_to_sequences([sentence])

  # Get the tokenized sentence to see if there are any unknown tokens.
  tokenized_sentence = source_tokenizer.sequences_to_texts(input_seq)

  states = encoder.predict(input_seq)  

  current_word = '<sos>'
  decoded_sentence = []

  while len(decoded_sentence) < max_translated_len:
    
    # Set the next input word for the decoder.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_tokenizer.word_index[current_word]
    
    # Determine the next word.
    target_y_proba, h, c = decoder.predict([target_seq] + states)
    target_token_index = np.argmax(target_y_proba[0, -1, :])
    current_word = target_tokenizer.index_word[target_token_index]

    if (current_word == '<eos>'):
      break

    decoded_sentence.append(current_word)
    states = [h, c]
  
  return tokenized_sentence[0], ' '.join(decoded_sentence)


In [122]:
import random

source_sentences= source_val_sentences[600:650]
target_sentences= target_val_sentences[600:650]

In [125]:
def translate_sentences(source_sentences, target_sentences,translation_func, source_tokenizer, encoder,
                        target_tokenizer, decoder):
  translations = {'Tokenized Original': [], 'Reference': [], 'Translation': []}

  for s in source_sentences:
    tokenized_sentence, translated = translation_func(s, source_tokenizer, encoder,
                                                      target_tokenizer, decoder)

    translations['Tokenized Original'].append(tokenized_sentence)
    translations['Translation'].append(translated)

  for t in target_sentences:
      translations['Reference'].append(t)
  
  return translations

In [126]:
translations_no_attention = pd.DataFrame(translate_sentences(source_sentences, target_sentences, translate_without_attention,
                                                             source_tokenizer, encoder_model_no_attention,
                                                             target_tokenizer, decoder_model_no_attention))
translations_no_attention



Unnamed: 0,Tokenized Original,Reference,Translation
0,i promise,約束するよ。,トムは 時だよ。
1,i relaxed,僕はリラックスした。,トムは 時だよ。
2,i said no,ダメだって言ったでしょ。,トムは 時だよ。
3,i said no,違うってば。,トムは 時だよ。
4,i said so,私はそう言いました。,トムは 時だよ。
5,i said so,そう言っておいたはずだ。,トムは 時だよ。
6,i saw him,私は彼に会った。,トムは 時だよ。
7,i saw him,彼を見た。,トムは 時だよ。
8,i see tom,トムが見える。,トムは 時だよ。
9,i want it,これが欲しい。,トムは 時だよ。


In [127]:
translations_no_attention

Unnamed: 0,Tokenized Original,Reference,Translation
0,i promise,約束するよ。,トムは 時だよ。
1,i relaxed,僕はリラックスした。,トムは 時だよ。
2,i said no,ダメだって言ったでしょ。,トムは 時だよ。
3,i said no,違うってば。,トムは 時だよ。
4,i said so,私はそう言いました。,トムは 時だよ。
5,i said so,そう言っておいたはずだ。,トムは 時だよ。
6,i saw him,私は彼に会った。,トムは 時だよ。
7,i saw him,彼を見た。,トムは 時だよ。
8,i see tom,トムが見える。,トムは 時だよ。
9,i want it,これが欲しい。,トムは 時だよ。


In [51]:
target_sentences= target_tokenizer.texts_to_sequences(target_sentences)

In [52]:
target_vocab_size= len(target_tokenizer.word_index) + 1
target_vocab_size

82357

eng_train_sentences, eng_test_sentences= eng_sentences[:85000], eng_sentences[85000:]
jap_train_sentences, jap_test_sentences= jap_sentences[:85000], jap_sentences[85000:]

In [53]:
eng_train_sentences

NameError: name 'eng_train_sentences' is not defined

### Padding

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# SOURCE padding
eng_padded_sentences= pad_sequences(eng_train_sentences, padding= 'post')
jap_padded_sentences= pad_sequences(jap_train_sentences, padding= 'post')

In [None]:
eng_vocab_size= len(eng_tokenizer.word_index) + 1
jap_vocab_size= len(jap_tokenizer.word_index) + 1

In [None]:
print(eng_vocab_size)
print(jap_vocab_size)

In [None]:
max_eng_sent_length= len(eng_padded_sentences[0])
max_jap_sent_length= len(jap_padded_sentences[0])
max_eng_sent_length, max_jap_sent_length

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense

embedding_dim= 32
units= 256

# Define the encoder
encoder_input = Input(shape=(max_eng_sent_length,))
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim)(encoder_input)
encoder_lstm, state_h, state_c = LSTM(units, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Define the decoder
decoder_input = Input(shape=(max_jap_sent_length-1,))
decoder_embedding = Embedding(input_dim= jap_vocab_size, output_dim=embedding_dim)(decoder_input)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(jap_vocab_size, activation='softmax')
decoder_output = decoder_dense(decoder_output)

# Define the model
model = Model([encoder_input, decoder_input], decoder_output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
model.fit([eng_padded_sentences, jap_padded_sentences[:, :-1]], jap_padded_sentences[:, 1:], epochs=3, batch_size=64, validation_split=0.2)

import pickle 

with open('eng_jap_translator.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
def predict(sentence):
    inputs = tf.convert_to_tensor(sentence)
    result = ''
    inputs = tf.expand_dims(inputs, axis=0)
    hidden = [tf.zeros((1, units)), tf.zeros((1, units))]
    enc_out, state = encoder(inputs, hidden)
    hidden_state = state
    dec_input = tf.expand_dims([target_lang_tokenize.word_index['start_']], 0)
    for t in range(max_length_target):
        predictions, hidden_state = decoder(dec_input,
                                                             hidden_state,
                                                             enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += target_lang_tokenize.index_word[predicted_id] + ' '
        if target_lang_tokenize.index_word[predicted_id] == '_end' or len(result) > max_length_target:
            return result

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    return result

In [None]:
model.fit([eng_padded_sentences, jap_padded_sentences[:, :-1]], jap_padded_sentences[:, 1:], epochs=10, batch_size=64, validation_split=0.2)

<!-- from tensorflow.keras.utils import plot_model
import pydot
import graphviz 

# Your model definition code goes here

# Plot the model and save the image 
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True) -->