### Machine Translation Prototype

This is the prototype for the machine translation model we are going to build.

In [6]:
import tensorflow as tf
print(tf.__version__)

2.3.0


In [2]:
import string
import re
from numpy import array, argmax, random, take
import numpy as np
from numpy.random import shuffle
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt
% matplotlib inline
pd.set_option('display.max_colwidth', 200)
from pickle import dump
from unicodedata import normalize
from tensorflow.keras.models import load_model

In [7]:
# Defining the path to the raw data set
fileurl = '/content/drive/My Drive/Bayesian Quest/deu.txt'


In [8]:
# function to read raw text file
def read_text(filename):
    """Read a tab-separated text file into an array of split lines.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded text file to read.

    Returns
    -------
    numpy.ndarray
        One entry per line of the file, each line split on tab characters.
    """
    # The context manager closes the file even when reading raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        text = file.read()

    # Split the text into individual lines (outer whitespace is stripped first)
    lines = text.strip().split('\n')
    # Split each line on tab characters so every row becomes a list of fields
    return array([line.split('\t') for line in lines])

In [9]:
# Reading the data using the function
mtData = read_text(fileurl)
# Taking only 50000 rows of data
mtData = mtData[:50000,:2]
print(mtData.shape)
mtData[0:10]

(50000, 2)


array([['Go.', 'Geh.'],
       ['Hi.', 'Hallo!'],
       ['Hi.', 'Grüß Gott!'],
       ['Run!', 'Lauf!'],
       ['Run.', 'Lauf!'],
       ['Wow!', 'Potzdonner!'],
       ['Wow!', 'Donnerwetter!'],
       ['Fire!', 'Feuer!'],
       ['Help!', 'Hilfe!'],
       ['Help!', 'Zu Hülf!']], dtype='<U537')

Removing all unwanted characters

In [10]:
# Cleaning the document from all unwanted characters

def cleanDocs(lines):
    """Normalise every sentence of every document to lower-case ASCII words.

    Parameters
    ----------
    lines : iterable of iterables of str
        Each element is one document (e.g. a sentence pair); each inner
        element is one sentence.

    Returns
    -------
    numpy.ndarray
        Array with the same row/column layout as ``lines`` where each
        sentence has been ASCII-folded, stripped of punctuation, lower-cased
        and reduced to its purely alphabetic tokens joined by single spaces.
    """
    # Build the punctuation-stripping table once instead of once per token
    punctuationTable = str.maketrans('', '', string.punctuation)
    cleanArray = list()
    for docs in lines:
        cleanedSentences = list()
        for line in docs:
            # Decompose accented characters, then drop everything that is not
            # ASCII (characters with no ASCII base form, e.g. 'ß', disappear)
            line = normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8')
            # Tokenize on white space, strip punctuation, convert to lower case
            words = [word.translate(punctuationTable).lower() for word in line.split()]
            # Keep alphabetic tokens only: drops tokens containing digits and
            # tokens left empty after punctuation removal
            cleanedSentences.append(' '.join(word for word in words if word.isalpha()))
        cleanArray.append(cleanedSentences)
    return array(cleanArray)

In [11]:
# Cleaning the sentences
cleanMtDocs = cleanDocs(mtData)
cleanMtDocs[0:10]


array([['go', 'geh'],
       ['hi', 'hallo'],
       ['hi', 'gru gott'],
       ['run', 'lauf'],
       ['run', 'lauf'],
       ['wow', 'potzdonner'],
       ['wow', 'donnerwetter'],
       ['fire', 'feuer'],
       ['help', 'hilfe'],
       ['help', 'zu hulf']], dtype='<U117')

In [12]:
# The dimensions of the data set
len(cleanMtDocs)
print(cleanMtDocs.shape)

(50000, 2)


In [13]:
# Shuffling the data
# Seed NumPy's global generator first so the row order is identical on every
# run. The tokenizers below are fitted on the shuffled rows and the trained
# model is later reloaded from disk, so an unseeded shuffle can leave the
# word indices out of step with the saved weights after a kernel restart.
random.seed(123)
shuffle(cleanMtDocs)
cleanMtDocs[0:10]

array([['i admire your talent', 'ich bewundere dein talent'],
       ['i feel strong', 'ich fuhle mich stark'],
       ['i need toms help', 'ich brauche toms hilfe'],
       ['how is that spelled', 'wie wird das buchstabiert'],
       ['what is this for', 'wofur ist das'],
       ['tomll remember', 'tom wird sich daran erinnern'],
       ['give him time', 'gib ihm zeit'],
       ['did you see tom leave', 'haben sie tom gehen sehen'],
       ['well walk', 'wir werden zu fu gehen'],
       ['i wont pay this bill', 'ich bezahle diese rechnung nicht']],
      dtype='<U117')

### Starting the Neural Translation Model


In [22]:
from pickle import load
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,RepeatVector,TimeDistributed
# Use the tensorflow.keras callback: the model is built from tensorflow.keras
# layers, and mixing in the standalone `keras` package's callbacks is not supported
from tensorflow.keras.callbacks import ModelCheckpoint

In [14]:
# Creating the tokenizers
# Function for creating tokenizers
def createTokenizer(lines):
    """Return a Keras ``Tokenizer`` fitted on the given sentences.

    Parameters
    ----------
    lines : iterable of str
        Sentences used to fit the tokenizer.

    Returns
    -------
    Tokenizer
        The tokenizer after ``fit_on_texts(lines)`` has been called on it.
    """
    fittedTokenizer = Tokenizer()
    # Learn the vocabulary from the supplied sentences
    fittedTokenizer.fit_on_texts(lines)
    return fittedTokenizer

In [15]:
# Create English Tokenizer
eng_tokenizer = createTokenizer(cleanMtDocs[:,0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
print('Length of english vocabulary',eng_vocab_size)

Length of english vocabulary 6255


In [16]:
# Listing the first 10 items of the English tokenizer
list(eng_tokenizer.word_index.items())[0:10]

[('tom', 1),
 ('i', 2),
 ('you', 3),
 ('is', 4),
 ('a', 5),
 ('it', 6),
 ('the', 7),
 ('to', 8),
 ('me', 9),
 ('im', 10)]

In [17]:
# Create German tokenizer
ger_tokenizer = createTokenizer(cleanMtDocs[:,1])
# Defining German Vocabulary
ger_vocab_size = len(ger_tokenizer.word_index) + 1
print(ger_vocab_size)

10210


### Finding the optimum sequence length for the German and English sentences

In [18]:
# Number of words in every English sentence (column 0 of the cleaned data)
len_english = [len(line.split()) for line in cleanMtDocs[:,0]]
len_english[0:10]

[4, 3, 4, 4, 4, 2, 3, 5, 2, 5]

In [19]:
# Number of words in every German sentence (column 1 of the cleaned data)
len_German = [len(line.split()) for line in cleanMtDocs[:,1]]
len_German[0:10]

[4, 4, 4, 4, 3, 5, 3, 5, 5, 5]

##### Finding the optimum sequence lengths

In [20]:
# Find the quantile length
engLength = np.quantile(len_english, .975)
engLength

5.0

In [21]:
# Find the quantile length
gerLength = np.quantile(len_German, .975)
gerLength

6.0

### Encoding the sequences 

In this phase we will encode each sentence as a sequence of integers. We also need to make sure that all sequences have a standard length, which is why we calculated the 97.5th-percentile sentence length for each language above. We standardise the lengths by zero-padding the shorter sequences (longer sequences are truncated to that length).

In [22]:
# Function for encoding and padding sequences

def encode_sequences(tokenizer,length, lines):
    """Turn sentences into fixed-length integer sequences.

    Parameters
    ----------
    tokenizer : Tokenizer
        Fitted tokenizer used to map words to integer indices.
    length : int
        Number of entries every returned sequence is padded to (passed to
        ``pad_sequences`` as ``maxlen``).
    lines : iterable of str
        Sentences to encode.

    Returns
    -------
    The result of ``pad_sequences``: one zero-padded row per sentence, with
    the padding added after the words (``padding='post'``).
    """
    # NOTE(review): sequences longer than `length` are cut by pad_sequences
    # using its default truncating mode - confirm that is the side you want.
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [23]:
# Preparing the train and test splits
from sklearn.model_selection import train_test_split
# split data into train and test set
train, test = train_test_split(cleanMtDocs, test_size=0.1, random_state = 123)
print(train.shape)
print(test.shape)

(45000, 2)
(5000, 2)


In [24]:
# Creating the X variable for both train and test sets
trainX = encode_sequences(ger_tokenizer,int(gerLength),train[:,1])
testX = encode_sequences(ger_tokenizer,int(gerLength),test[:,1])
print(trainX.shape)
print(testX.shape)


(45000, 6)
(5000, 6)


In [25]:
# Displaying the first 5 rows of the training set
trainX[0:5]

array([[  82,   25,    1,  356,    0,    0],
       [  10,   19,  670,    0,    0,    0],
       [   5,  600, 1113,    0,    0,    0],
       [   2,  111,  523,    0,    0,    0],
       [  90,   29,   14,  413,  134,    0]], dtype=int32)

In [26]:
# Creating the Y variable both train and test
trainY = encode_sequences(eng_tokenizer,int(engLength),train[:,0])
testY = encode_sequences(eng_tokenizer,int(engLength),test[:,0])
print(trainY.shape)
print(testY.shape)

(45000, 5)
(5000, 5)


### Modelling 

In [39]:
def defineModel(src_vocab,tar_vocab,src_timesteps,tar_timesteps,n_units):
    """Build, compile and summarise the sequence-to-sequence translation model.

    Parameters
    ----------
    src_vocab : int
        Size of the source vocabulary (input dimension of the embedding).
    tar_vocab : int
        Size of the target vocabulary (width of the softmax output).
    src_timesteps : int
        Length of the padded source sequences.
    tar_timesteps : int
        Length of the target sequences to generate.
    n_units : int
        Embedding size and number of units in each LSTM layer.

    Returns
    -------
    Sequential
        The compiled model; its summary is printed as a side effect.
    """
    layerStack = [
        # Index 0 is treated as padding and masked out (mask_zero=True)
        Embedding(src_vocab,n_units,input_length=src_timesteps,mask_zero=True),
        # Reduces the source sequence to a single vector
        LSTM(n_units),
        # Repeats that vector once per target time step
        RepeatVector(tar_timesteps),
        # Emits one hidden state per target time step
        LSTM(n_units,return_sequences=True),
        # Softmax over the target vocabulary at every time step
        TimeDistributed(Dense(tar_vocab,activation='softmax')),
    ]
    model = Sequential(layerStack)
    # Targets are integer word indices, hence the sparse loss
    model.compile(optimizer = 'adam',loss='sparse_categorical_crossentropy')
    model.summary()

    return model

In [79]:
model = defineModel(ger_vocab_size,eng_vocab_size,int(gerLength),int(engLength),256)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6, 256)            2613760   
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 6255)           1607535   
Total params: 5,271,919
Trainable params: 5,271,919
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fitting the model
# Save the best checkpoint directly to the Drive folder that the load_model
# cell below reads from; writing to the working directory meant a fresh run
# could load a missing or stale file.
checkpoint = ModelCheckpoint('/content/drive/My Drive/Bayesian Quest/model1.h5',monitor='val_loss',verbose=1,save_best_only=True,mode='min')
model.fit(trainX,trainY,epochs=50,batch_size=64,validation_data=(testX,testY),callbacks=[checkpoint],verbose=2)


In [3]:
# loading the model from the best model saved
model = load_model('/content/drive/My Drive/Bayesian Quest/model1.h5')

#### Predictions with the model on the test set


In [27]:
# Generating the predictions
prediction = model.predict(testX,verbose=0)
prediction.shape

(5000, 5, 6255)

(5000, 5, 6255)

In [99]:
# Getting the prediction index along the last axis ( Vocabulary size axis)
predIndex = [argmax(vector,axis = -1) for vector in prediction]
predIndex[0:3]

[array([   5,  123,    4, 3052,    0]),
 array([  2,  14,  47,   7, 383]),
 array([  1, 476, 356,   0,   0])]

In [100]:
# Creating the reverse dictionary
reverse_eng = eng_tokenizer.index_word


In [101]:
# Converting the tokens to a sentence
preds = []
for pred in predIndex[0]:
  if pred == 0:
        continue 
  preds.append(reverse_eng[pred])  
print(' '.join(preds))

a dog is barking


In [102]:
# Looking at the target sentence
preds = []
for pred in testY[0]:
  if pred == 0:
        continue 
  preds.append(reverse_eng[pred])  
print(' '.join(preds))

a dog is barking


In [33]:
# Creating a function for converting sequences
def Convertsequence(tokenizer,source):
    """Translate a sequence of word indices back into a sentence.

    Parameters
    ----------
    tokenizer : object
        Object exposing an ``index_word`` mapping from integer index to word.
    source : iterable of int
        Word indices; entries equal to 0 are skipped.

    Returns
    -------
    str
        The words for the non-zero indices, joined by single spaces.
    """
    indexToWord = tokenizer.index_word
    # Zero entries carry no word and are left out of the sentence
    words = [indexToWord[int(token)] for token in source if token != 0]
    return ' '.join(words)

In [34]:
# Function to generate predictions from source data
def generatePredictions(model,tokenizer,data):
    """Predict a target sentence for every encoded source sequence.

    Parameters
    ----------
    model : object
        Model whose ``predict(data, verbose=0)`` returns per-time-step scores
        over the target vocabulary, indexed as (sample, time step, word).
    tokenizer : object
        Target-language tokenizer handed to ``Convertsequence``.
    data : array-like
        Encoded source sequences to translate.

    Returns
    -------
    list of str
        One predicted sentence per row of ``data``.
    """
    prediction = model.predict(data,verbose=0)
    # A single argmax over the vocabulary axis gives the most likely word
    # index at every time step for the whole batch at once
    predIndex = argmax(prediction, axis=-1)
    return [Convertsequence(tokenizer,sequence) for sequence in predIndex]

In [105]:
# Generate predictions
predSent = generatePredictions(model,eng_tokenizer,testX[0:20,:])

In [106]:
# Pair each of the first 20 target sequences with its prediction
for targetSeq, predicted in zip(testY[0:20], predSent):
    targetY = Convertsequence(eng_tokenizer,targetSeq)
    print("Original sentence : {} :: Prediction : {}".format([targetY],[predicted]))

Original sentence : ['a dog is barking'] :: Prediction : ['a dog is barking']
Original sentence : ['ive been to the mall'] :: Prediction : ['i was at the first']
Original sentence : ['tom sounds mad'] :: Prediction : ['tom sounds crazy']
Original sentence : ['he must be over sixty'] :: Prediction : ['he must be over sixty']
Original sentence : ['freeze'] :: Prediction : ['stop around']
Original sentence : ['they feel hungry'] :: Prediction : ['youre hungry']
Original sentence : ['tom wants an apple'] :: Prediction : ['tom wants an an']
Original sentence : ['im new'] :: Prediction : ['im new']
Original sentence : ['i woke you up'] :: Prediction : ['i woke her her']
Original sentence : ['are you watching me'] :: Prediction : ['do you understand me']
Original sentence : ['what does tom have on'] :: Prediction : ['what tom got']
Original sentence : ['tom was pretty bummed'] :: Prediction : ['i was almost']
Original sentence : ['see what i mean'] :: Prediction : ['do i order my leap']
Origi

### Predicting on your own sentences

In [28]:
def cleanInput(lines):
    """Clean one raw input sentence the same way the training data was cleaned.

    Parameters
    ----------
    lines : str
        A single raw sentence.

    Returns
    -------
    numpy.ndarray
        One-element array holding the cleaned sentence: ASCII-folded,
        punctuation removed, lower-cased, alphabetic tokens only.
    """
    punctuationTable = str.maketrans('', '', string.punctuation)
    cleanWords = list()
    for token in lines.split():
        # Decompose accented characters, then drop everything non-ASCII
        token = normalize('NFD', token).encode('ascii', 'ignore').decode('UTF-8')
        token = token.translate(punctuationTable).lower()
        # Same filter as the training-data cleaning: skip tokens that contain
        # digits or that are empty after punctuation removal, so no stray
        # tokens or doubled spaces end up in the sentence
        if token.isalpha():
            cleanWords.append(token)
    return array([' '.join(cleanWords)])

In [44]:
# Trying different input sentences
inputSentence = 'Es ist ein großartiger Tag' # It is a great day ?
#inputSentence ='Heute wird es regnen' #  it's going to rain Today
#inputSentence ='Ich habe im Radio gesprochen' # I spoke on the radio

In [45]:

# Clean the input sentence
cleanText = cleanInput(inputSentence)
cleanText

array(['es ist ein groartiger tag'], dtype='<U25')

In [46]:
# Encode the inputsentence as sequence of integers
seq1 = encode_sequences(ger_tokenizer,int(gerLength),cleanText)
seq1

array([[   7,    3,   12, 2314,  196,    0]], dtype=int32)

In [47]:
# Generate the prediction
predSent = generatePredictions(model,eng_tokenizer,seq1)

print("Original sentence : {} :: Prediction : {}".format([cleanText[0]],predSent))

Original sentence : ['es ist ein groartiger tag'] :: Prediction : ['its still ok']


In [39]:
inputSentence1 ='Heute wird es regnen' #  it's going to rain Today
inputSentence2 ='Ich habe im Radio gesprochen' # I spoke on the radio

for sentence in [inputSentence1,inputSentence2]:
  cleanText = cleanInput(sentence)
  seq1 = encode_sequences(ger_tokenizer,int(gerLength),cleanText)
  # Generate the prediction
  predSent = generatePredictions(model,eng_tokenizer,seq1)

  print("Original sentence : {} :: Prediction : {}".format([cleanText[0]],predSent))



Original sentence : ['heute wird es regnen'] :: Prediction : ['it be in today']
Original sentence : ['ich habe im radio gesprochen'] :: Prediction : ['i have your cards']
