In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

!pip install --ignore-installed --user --upgrade tensorflow

In [2]:
tf.__version__

'2.10.1'

In [3]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        print("Found a GPU with the name:", gpu.name)
else:
    print("Failed to detect a GPU.")

Found a GPU with the name: /physical_device:GPU:0


In [4]:
data= pd.read_csv('data/jap-eng.csv', on_bad_lines='skip')

In [5]:
data= data.iloc[:,:-1]

In [6]:
data.head()

Unnamed: 0,日本語,英語
0,102世吉田日厚貫首,"the 102nd head priest, Nikko TOSHIDA"
1,1月15日：成人祭、新年祭,"15th January: Seijin-sai (Adult Festival), the..."
2,1月3日：家運隆盛、商売繁盛祈願祭,3rd January: Prayer Festival for the prosperit...
3,1月7日：七種粥神事,7th January: Nanakusa-gayu shinji (a divine se...
4,21世紀COEプログラム,The 21st Century Center Of Excellence Program


In [7]:
data.shape

(51971, 2)

In [8]:
data.rename(columns= {'日本語':'Japanese', '英語':'English'}, inplace= True)

In [9]:
data.head()

Unnamed: 0,Japanese,English
0,102世吉田日厚貫首,"the 102nd head priest, Nikko TOSHIDA"
1,1月15日：成人祭、新年祭,"15th January: Seijin-sai (Adult Festival), the..."
2,1月3日：家運隆盛、商売繁盛祈願祭,3rd January: Prayer Festival for the prosperit...
3,1月7日：七種粥神事,7th January: Nanakusa-gayu shinji (a divine se...
4,21世紀COEプログラム,The 21st Century Center Of Excellence Program


In [10]:
data['English']= data['English'].astype(str)
data['Japanese']= data['Japanese'].astype(str)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51971 entries, 0 to 51970
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Japanese  51971 non-null  object
 1   English   51971 non-null  object
dtypes: object(2)
memory usage: 812.2+ KB


In [12]:
from string import punctuation
import re

def clean_string(string):
    # Replace no-break space with space
    string = string.replace("\u202f"," ")
    # Converts all uppercase characters into lowercase characters
    string = string.lower()

    # Delete the punctuation and the numbers
    for p in punctuation + "«»" + "0123456789":
        string = string.replace(p," ")

    # Eliminate duplicate whitespaces using wildcards
    string = re.sub("\s+"," ", string)
    # Remove spaces at the beginning and at the end of the string
    string = string.strip()

    return string

In [13]:
data['English'] = data['English'].apply(lambda x: clean_string(x))
data['Japanese'] = data['Japanese'].apply(lambda x: clean_string(x))

In [14]:
# data= data.iloc[:15000, :]

In [15]:
data.shape

(51971, 2)

In [16]:
data.iloc[50:60,:]

Unnamed: 0,Japanese,English
50,drcハウス,drc house
51,etc,etc electronic toll collection
52,eu研究センター,eu research center
53,eu資料センター,eu information center
54,faz 輸入促進地域,faz foreign access zone
55,fifaワールドカップ,fifa world cup
56,ghq,ghq general headquarters of the allied forces
57,ghq民間情報教育局,civil information and education section ghq scap
58,ghq民生局,general section of ghq general headquarters of...
59,gk kyoto,gk kyoto


In [17]:
source_sentences= data['English'].values
target_sentences= data['Japanese'].values

In [18]:
train_data_split= int(source_sentences.shape[0] * 0.1 )
train_data_split

5197

In [19]:
source_sentences, source_val_sentences= source_sentences[train_data_split:], source_sentences[:train_data_split]
target_sentences, target_val_sentences= target_sentences[train_data_split:], target_sentences[:train_data_split]

In [20]:
def tag_target_sentences(sentences):
  tagged_sentences = map(lambda s: (' ').join(['<sos>', s, '<eos>']), sentences)
  return list(tagged_sentences)

In [21]:
target_sentences= tag_target_sentences(target_sentences)

In [22]:
target_sentences[:5]

['<sos> 閲兵式 <eos>',
 '<sos> 閲覧室 <eos>',
 '<sos> 榎原雅治 <eos>',
 '<sos> 榎本郷 <eos>',
 '<sos> 榎本軍 <eos>']

In [23]:
len(target_sentences)

46774

### Tokenization

In [24]:
# SOURCE Tokenization
source_tokenizer= tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>", filters="#$%&()*+,-./:;=@[\\]^_`{|}~\t\n")
source_tokenizer.fit_on_texts(source_sentences)

In [25]:
train_encoder_inputs = source_tokenizer.texts_to_sequences(source_sentences)

In [26]:
source_vocab_size= len(source_tokenizer.word_index) + 1
source_vocab_size

29403

In [27]:
print(train_encoder_inputs[60:65])
print(source_tokenizer.sequences_to_texts(train_encoder_inputs[60:65]))

[[4662, 12771], [4662, 241], [5963, 8, 6], [12772, 385, 5963, 385, 3, 81], [12773, 234]]
['enchin nittoguho', 'enchin ha', 'entsu ji temple', 'enzu daishi entsu daishi the kannon', 'daiendaio kokushi']


In [28]:
# TARGET Tokenization
target_tokenizer= tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>", filters="#$%&()*+,-./:;=@[\\]^_`{|}~\t\n")
target_tokenizer.fit_on_texts(target_sentences)

In [29]:
target_vocab_size= len(target_tokenizer.word_index) + 1
target_vocab_size

47066

In [30]:
def generate_decoder_inputs_targets(sequences, tokenizer):
    seqs= tokenizer.texts_to_sequences(sequences)
    decoder_inputs= [s[:-1] for s in seqs] # Drops last token in sequence; eg = <sos> Hi I am Arvind
    decoder_outputs= [s[1:] for s in seqs] # Drops first token in sequence; eg = Hi I am Arvind <eos>

    return decoder_inputs, decoder_outputs

In [31]:
train_decoder_inputs, train_decoder_outputs= generate_decoder_inputs_targets(target_sentences, target_tokenizer)

In [32]:
# decoder inputs 
target_tokenizer.sequences_to_texts(train_decoder_inputs[:5])

['<sos> 閲兵式', '<sos> 閲覧室', '<sos> 榎原雅治', '<sos> 榎本郷', '<sos> 榎本軍']

In [33]:
# decoder outputs
target_tokenizer.sequences_to_texts(train_decoder_outputs[:5])

['閲兵式 <eos>', '閲覧室 <eos>', '榎原雅治 <eos>', '榎本郷 <eos>', '榎本軍 <eos>']

In [34]:
max_encoding_len= len(max(train_encoder_inputs, key= len))
max_encoding_len

55

In [35]:
max_decoding_len= len(max(train_decoder_inputs, key= len))
max_decoding_len

8

### Padding

In [36]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_train_encoder_inputs= pad_sequences(train_encoder_inputs, maxlen= max_encoding_len, padding= "post", truncating= "post")
padded_train_decoder_inputs= pad_sequences(train_decoder_inputs, maxlen= max_decoding_len, padding= "post", truncating= "post")
padded_train_decoder_outputs= pad_sequences(train_decoder_outputs, maxlen= max_decoding_len, padding= "post", truncating= "post")

In [37]:
print(padded_train_encoder_inputs[10])
print(padded_train_decoder_inputs[10])
print(padded_train_decoder_outputs[10])

[8179    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]
[  2 454   0   0   0   0   0   0]
[454   3   0   0   0   0   0   0]


In [38]:
# 0 is considered as OOV and given <unk> value
source_tokenizer.sequences_to_texts([padded_train_encoder_inputs[800]])

['grace <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>']

### validation data preparation

In [39]:
def process_dataset(preprocessed_input, preprocessed_output):
    
    tagged_preprocessed_output = tag_target_sentences(preprocessed_output)
    # Vectorize encoder source sentences.
    encoder_inputs = source_tokenizer.texts_to_sequences(preprocessed_input)
    # Vectorize and create decoder input and target sentences.
    decoder_inputs, decoder_targets = generate_decoder_inputs_targets(tagged_preprocessed_output, 
                                                                    target_tokenizer)
  
    # Pad all collections.
    padded_encoder_inputs = pad_sequences(encoder_inputs, max_encoding_len, padding='post', truncating='post')
    padded_decoder_inputs = pad_sequences(decoder_inputs, max_decoding_len, padding='post', truncating='post')
    padded_decoder_targets = pad_sequences(decoder_targets, max_decoding_len, padding='post', truncating='post')

    return padded_encoder_inputs, padded_decoder_inputs, padded_decoder_targets


In [40]:
# Process validation dataset
padded_val_encoder_inputs, padded_val_decoder_inputs, padded_val_decoder_targets= process_dataset(source_val_sentences, target_val_sentences)

In [41]:
target_tokenizer.word_index

{'<unk>': 1,
 '<sos>': 2,
 '<eos>': 3,
 '駅': 4,
 '）': 5,
 '列車': 6,
 '第': 7,
 '」': 8,
 '京都府道': 9,
 '能': 10,
 '日本': 11,
 '前': 12,
 '京都市': 13,
 '系電車': 14,
 '京阪': 15,
 '家': 16,
 'りょう）': 17,
 '号': 18,
 '京都府': 19,
 '年': 20,
 'の': 21,
 '番': 22,
 '年（': 23,
 '王朝': 24,
 '年）': 25,
 '年間': 26,
 '中国': 27,
 '国鉄': 28,
 '宗': 29,
 '太刀': 30,
 '藤原': 31,
 '南朝': 32,
 'トンネル': 33,
 '京都バス': 34,
 '系統': 35,
 '京都市バス': 36,
 '京都市営地下鉄': 37,
 '近鉄': 38,
 '奈良県': 39,
 '僧侶': 40,
 '祭': 41,
 '公卿': 42,
 '系客車': 43,
 '石': 44,
 '大阪市': 45,
 '茶室': 46,
 '北野天神縁起': 47,
 '戸': 48,
 '三重県': 49,
 '加茂': 50,
 '山': 51,
 '式': 52,
 '株式会社': 53,
 '紋': 54,
 '京炎': 55,
 '京極': 56,
 '電気鉄道': 57,
 '形電車': 58,
 'dayチケット': 59,
 '京都学生祭典': 60,
 '舞鶴': 61,
 '京都市道': 62,
 '大津市': 63,
 '藩': 64,
 '堺市': 65,
 '東海道': 66,
 '源': 67,
 '源光': 68,
 '源信': 69,
 '五条駅': 70,
 '僧': 71,
 '弘安': 72,
 '高野山': 73,
 '条': 74,
 '阪急': 75,
 '阪急京都本線': 76,
 '阪神高速道路': 77,
 'ヶ条': 78,
 '四条駅': 79,
 '時代': 80,
 '滋賀県道': 81,
 '準急': 82,
 '書院': 83,
 '勝光寺': 84,
 '小倉駅': 85,
 '昭和': 86,
 '舞鶴市': 87,
 '真言

In [42]:
target_tokenizer.sequences_to_texts([padded_val_decoder_inputs[900]])

['<sos> <unk> <unk> <unk> <unk> <unk> <unk> <unk>']

### Building Translation Model

In [43]:
embedding_dim= 128
hidden_dim= 256
default_dropout= 0.2
batch_size= 32
epochs= 30

In [44]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense

encoder_inputs= Input(shape= [None], name= 'encoder_inputs')
#  mask_zero: Boolean, whether or not the input value 0 is a special
encoder_embeddings= Embedding(input_dim= source_vocab_size, output_dim= embedding_dim, mask_zero= True, name= "encoder_embeddings")
encoder_embedding_outputs= encoder_embeddings(encoder_inputs)

encoder_lstm= LSTM(units= hidden_dim, return_state= True, dropout= default_dropout, name= "encoder_lstm")
# since return sequences is false: the value of encoder_outputs is same as state_h, 
# if true, all y_hat values of every time stamp is returned to encoder_outputs
encoder_outputs, state_h, state_c= encoder_lstm(encoder_embedding_outputs)

encoder_states= (state_h, state_c)


In [45]:
decoder_inputs= Input(shape= [None], name= "decoder_inputs")
decoder_embeddings= Embedding(input_dim= target_vocab_size, output_dim= embedding_dim, mask_zero= True, name= "decoder_embeddings")
decoder_embedding_outputs= decoder_embeddings(decoder_inputs)

decoder_lstm= LSTM(units= hidden_dim, return_sequences=True ,return_state= True, dropout= default_dropout, name= "decoder_lstm")

decoder_outputs, _, _= decoder_lstm(decoder_embedding_outputs, initial_state= encoder_states)

decoder_dense= Dense(target_vocab_size, activation='softmax', name="decoder_dense")

y_proba= decoder_dense(decoder_outputs)

In [46]:
# Note how the model is taking two inputs in an array.
model = tf.keras.Model([encoder_inputs, decoder_inputs], y_proba, name='eng_jap_seq2seq_nmt_no_attention2')

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',  metrics=['accuracy'])
model.summary()

Model: "eng_jap_seq2seq_nmt_no_attention2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder_embeddings (Embedding)  (None, None, 128)   3763584     ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_embeddings (Embedding)  (None, None, 128)   6024448     ['decoder_inputs[0][0]']         
                                                                  

In [47]:
from keras.utils.vis_utils import plot_model

plot_model(model, to_file='eng_jap_seq2seq_nmt_no_attention2.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [48]:
import os

os.getcwd()

'F:\\AI-ML\\NLP\\Machine translation'

In [49]:
# Saving this to a folder on my local machine.
filepath="artifacts/Eng-Jap2/Eng-Jap.h5"

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                                 save_weights_only=True,
                                                 verbose=1)

In [50]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

history = model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs], padded_train_decoder_outputs,
                     batch_size=batch_size,
                     epochs=5,
                     validation_data=([padded_val_encoder_inputs, padded_val_decoder_inputs], padded_val_decoder_targets),
                     callbacks=[cp_callback, es_callback])

Epoch 1/5
Epoch 1: saving model to artifacts/Eng-Jap2\Eng-Jap.h5
Epoch 2/5
Epoch 2: saving model to artifacts/Eng-Jap2\Eng-Jap.h5
Epoch 3/5
Epoch 3: saving model to artifacts/Eng-Jap2\Eng-Jap.h5
Epoch 4/5
Epoch 4: saving model to artifacts/Eng-Jap2\Eng-Jap.h5


### Saving Model and Tokenizers

In [51]:
# Saving Model
from keras.models import load_model

# Save model to HDF5 format
model.save('artifacts/Eng-Jap2/Eng-Jap-model.h5')

# Load model from HDF5 format
loaded_model = load_model('artifacts/Eng-Jap2/Eng-Jap-model.h5')

In [52]:
# loaded_model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs], padded_train_decoder_outputs,
#                      batch_size=batch_size,
#                      epochs=3,
#                      validation_data=([padded_val_encoder_inputs, padded_val_decoder_inputs], padded_val_decoder_targets),
#                      callbacks=[cp_callback, es_callback])

In [53]:
# Saving Tokenizer
import io
import json


##### Save the tokenizers as JSON files. The resulting files can be downloaded by left-clicking on them.
source_tokenizer_json = source_tokenizer.to_json()
with io.open('artifacts/Eng-Jap2/source_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(source_tokenizer_json, ensure_ascii=False))

target_tokenizer_json = target_tokenizer.to_json()
with io.open('artifacts/Eng-Jap2/target_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(target_tokenizer_json, ensure_ascii=False))

In [54]:
# Reading Tokenizer
with io.open('artifacts/Eng-Jap2/source_tokenizer.json', 'r', encoding='utf-8') as f:
    # Read the JSON data
    st= json.load(f)
    st = tf.keras.preprocessing.text.tokenizer_from_json(st)

### Model Evaluation

In [55]:
# Evaluate the model on the test set.
model.evaluate([padded_val_encoder_inputs, padded_val_decoder_inputs], padded_val_decoder_targets)



[1.8068965673446655, 0.4639418423175812]

In [56]:
[layer.name for layer in model.layers]

['encoder_inputs',
 'decoder_inputs',
 'encoder_embeddings',
 'decoder_embeddings',
 'encoder_lstm',
 'decoder_lstm',
 'decoder_dense']

In [57]:
encoder_inputs = model.get_layer('encoder_inputs').input

encoder_embedding_layer = model.get_layer('encoder_embeddings')
encoder_embeddings = encoder_embedding_layer(encoder_inputs)

encoder_lstm = model.get_layer('encoder_lstm')

_, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embeddings)

encoder_states = [encoder_state_h, encoder_state_c]

encoder_model_no_attention = tf.keras.Model(encoder_inputs, encoder_states)

In [58]:
decoder_inputs = model.get_layer('decoder_inputs').input

decoder_embedding_layer = model.get_layer('decoder_embeddings')
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Inputs to represent the decoder's LSTM hidden and cell states. We'll populate 
# these manually using the encoder's output for the initial state.
decoder_input_state_h = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_h')
decoder_input_state_c = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_c')
decoder_input_states = [decoder_input_state_h, decoder_input_state_c]

decoder_lstm = model.get_layer('decoder_lstm')

decoder_sequence_outputs, decoder_output_state_h, decoder_output_state_c = decoder_lstm(
    decoder_embeddings, initial_state=decoder_input_states
)

# Update hidden and cell states for the next time step.
decoder_output_states = [decoder_output_state_h, decoder_output_state_c]

decoder_dense = model.get_layer('decoder_dense')
y_proba = decoder_dense(decoder_sequence_outputs)

decoder_model_no_attention = tf.keras.Model(
    [decoder_inputs] + decoder_input_states, 
    [y_proba] + decoder_output_states
) 

In [59]:
def translate_without_attention(sentence: str, 
                                source_tokenizer, encoder,
                                target_tokenizer, decoder,
                                max_translated_len = 30):

  # Vectorize the source sentence and run it through the encoder.    
  input_seq = source_tokenizer.texts_to_sequences([sentence])

  # Get the tokenized sentence to see if there are any unknown tokens.
  tokenized_sentence = source_tokenizer.sequences_to_texts(input_seq)

  states = encoder.predict(input_seq)  

  current_word = '<sos>'
  decoded_sentence = []

  while len(decoded_sentence) < max_translated_len:
    
    # Set the next input word for the decoder.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_tokenizer.word_index[current_word]
    
    # Determine the next word.
    target_y_proba, h, c = decoder.predict([target_seq] + states)
    target_token_index = np.argmax(target_y_proba[0, -1, :])
    current_word = target_tokenizer.index_word[target_token_index]

    if (current_word == '<eos>'):
      break

    decoded_sentence.append(current_word)
    states = [h, c]
  
  return tokenized_sentence[0], ' '.join(decoded_sentence)


In [60]:
import random

source_sentences= source_val_sentences[600:650]
target_sentences= target_val_sentences[600:650]

In [61]:
def translate_sentences(source_sentences, target_sentences,translation_func, source_tokenizer, encoder,
                        target_tokenizer, decoder):
  translations = {'Tokenized Original': [], 'Reference': [], 'Translation': []}

  for s in source_sentences:
    tokenized_sentence, translated = translation_func(s, source_tokenizer, encoder,
                                                      target_tokenizer, decoder)

    translations['Tokenized Original'].append(tokenized_sentence)
    translations['Translation'].append(translated)

  for t in target_sentences:
      translations['Reference'].append(t)
  
  return translations

In [62]:
translations_no_attention = pd.DataFrame(translate_sentences(source_sentences, target_sentences, translate_without_attention,
                                                             source_tokenizer, encoder_model_no_attention,
                                                             target_tokenizer, decoder_model_no_attention))
translations_no_attention



Unnamed: 0,Tokenized Original,Reference,Translation
0,japanese giant <unk>,オオサンショウウオ,
1,ocean arrow,オーシャンアロー,
2,envoy extraordinary to <unk>,オーストリア特命全権公使,大江定基
3,goshawk,オオタカ,第
4,rose crested <unk>,オオバタン,京都府道
5,open <unk>,オープンコースウェア,京都府道
6,omononushi,オオモノヌシ,駅
7,oyamatsumi,オオヤマツミ,
8,japanese cracker <unk>,おかき,京都府道 列車
9,<unk> mairi,おかげ参り,京都府道


In [63]:
translations_no_attention

Unnamed: 0,Tokenized Original,Reference,Translation
0,japanese giant <unk>,オオサンショウウオ,
1,ocean arrow,オーシャンアロー,
2,envoy extraordinary to <unk>,オーストリア特命全権公使,大江定基
3,goshawk,オオタカ,第
4,rose crested <unk>,オオバタン,京都府道
5,open <unk>,オープンコースウェア,京都府道
6,omononushi,オオモノヌシ,駅
7,oyamatsumi,オオヤマツミ,
8,japanese cracker <unk>,おかき,京都府道 列車
9,<unk> mairi,おかげ参り,京都府道


In [64]:
target_sentences= target_tokenizer.texts_to_sequences(target_sentences)

In [65]:
target_vocab_size= len(target_tokenizer.word_index) + 1
target_vocab_size

47066

eng_train_sentences, eng_test_sentences= eng_sentences[:85000], eng_sentences[85000:]
jap_train_sentences, jap_test_sentences= jap_sentences[:85000], jap_sentences[85000:]

<!-- from tensorflow.keras.utils import plot_model
import pydot
import graphviz 

# Your model definition code goes here

# Plot the model and save the image 
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True) -->