<a href="https://colab.research.google.com/github/Auckland68/NLP-Tensorflow-Projects/blob/main/Machine_Translation_with_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf 
import numpy as np
from nltk.corpus import stopwords
from tensorflow import keras 

In [23]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [24]:
train, test = train_test_split(pd.read_csv("gdrive/MyDrive/ita.csv", sep='\\t',header = None, nrows=100000) , test_size=.10) 

  """Entry point for launching an IPython kernel.


In [25]:
train.shape

(90000, 2)

In [26]:
test.shape

(10000, 2)

In [27]:
train.columns = ["English","Italian"]
train.head()

Unnamed: 0,English,Italian
8780,It's a shame.,È un peccato.
95381,Don't eat the oysters.,Non mangiate le ostriche.
38594,They're not good.,Non sono bravi.
79584,You weren't invited.,Voi non siete stati invitati.
69346,I am bored to death.,Io sono annoiato a morte.


# Preprocessing

In [28]:
train["lower"] = train["English"].str.lower()
train["punc"] = train["English"].str.replace('[^\w\s]','')

In [29]:
train['lower_it'] = train["Italian"].str.lower()
train['punc_it'] =  '_start_' + ' ' +train['lower_it'].str.replace('[^\w\s]','')+ ' ' +'_end_'

In [30]:
train.head()

Unnamed: 0,English,Italian,lower,punc,lower_it,punc_it
8780,It's a shame.,È un peccato.,it's a shame.,Its a shame,è un peccato.,_start_ è un peccato _end_
95381,Don't eat the oysters.,Non mangiate le ostriche.,don't eat the oysters.,Dont eat the oysters,non mangiate le ostriche.,_start_ non mangiate le ostriche _end_
38594,They're not good.,Non sono bravi.,they're not good.,Theyre not good,non sono bravi.,_start_ non sono bravi _end_
79584,You weren't invited.,Voi non siete stati invitati.,you weren't invited.,You werent invited,voi non siete stati invitati.,_start_ voi non siete stati invitati _end_
69346,I am bored to death.,Io sono annoiato a morte.,i am bored to death.,I am bored to death,io sono annoiato a morte.,_start_ io sono annoiato a morte _end_


In [31]:
# Set parameters
max_feat = 5000
maxlen = 100

max_feat2 = 5000
maxlen2 = 100

In [32]:
# Create word embeddings for english, tokenize, create sequences and pad to 20 characters
tok1 = keras.preprocessing.text.Tokenizer(num_words=max_feat) 
tok1.fit_on_texts(list(train['punc']))
tf_train_english =tok1.texts_to_sequences(list(train['punc']))
tf_train_english =keras.preprocessing.sequence.pad_sequences(tf_train_english, maxlen=maxlen) 

In [33]:
# Create word embeddings for italian. Padding after words
tok2 = keras.preprocessing.text.Tokenizer(num_words=max_feat2, filters = '*') 
tok2.fit_on_texts(list(train['punc_it'])) 
tf_train_italian = tok2.texts_to_sequences(list(train['punc_it']))
tf_train_italian = keras.preprocessing.sequence.pad_sequences(tf_train_italian, maxlen=maxlen2, padding ='post') 

# Model Architecture

In [34]:
# Decoder
vectorized_italian = tf_train_italian # For Decoder Input, you don't need the last word
decoder_input_data = vectorized_italian[:, :-1]
decoder_target_data = vectorized_italian[:, 1:] # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data 

print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')

# Encoder
vectorized_english = tf_train_english 
encoder_input_data = vectorized_english
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')


Shape of decoder input: (90000, 99)
Shape of decoder target: (90000, 99)
Shape of encoder input: (90000, 100)


In [35]:
# Set parameters
vocab_size_encoder = len(tok1.word_index) + 1 
vocab_size_decoder = len(tok1.word_index) + 1

print(vocab_size_encoder)
print(vocab_size_decoder)

latent_dim = 40

6187
6187


In [36]:
encoder_inputs = keras.Input(shape=(doc_length,), name='Encoder-Input')

# Word embeding for encoder (English text)
x = keras.layers.Embedding(vocab_size_encoder, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = tf.keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x) # to set distribution of inputs use batch_norm
_, state_h = tf.keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x) # fixed hidden state of input
encoder_model = keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model') 
seq2seq_encoder_out = encoder_model(encoder_inputs)

# Decoder
decoder_inputs = keras.Input(shape=(None,), name='Decoder-Input')  
dec_emb = keras.layers.Embedding(vocab_size_decoder, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
decoder_gru = keras.layers.GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out) #the decoder "decodes" the encoder output.
x = keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction
decoder_dense = keras.layers.Dense(vocab_size_decoder, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

# Seq2seq Model
seq2seq_Model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001), loss='sparse_categorical_crossentropy')

In [37]:
seq2seq_Model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 40)     247480      Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 40)     160         Decoder-Word-Embedding[0][0]     
____________________________________________________________________________________________

# Train Model


In [38]:
batch_size = 1200
epochs = 5
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size,  epochs=epochs,  validation_split=0.12) 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
