In [155]:
#GLOBAL IMPORTS

import numpy as np
import os
import pandas as pd
import re
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

#LOCAL IMPORTS


#GLOBAL VARIABLES


In [164]:
def read_data(path_to_data = "dataset/deu.txt"):
    '''Load the tab-separated sentence-pair corpus into a DataFrame.

    Parameters
    ----------
    path_to_data : str, optional
        Location of the corpus file. Defaults to "dataset/deu.txt", the path
        this notebook has always used, so existing `read_data()` calls are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file. The columns are named 'source',
        'target' and 'comments', in the order they appear in the file.
    '''
    # The file has no header row, so the column names are supplied explicitly.
    lines = pd.read_table(path_to_data, names = ['source', 'target', 'comments'])
    return lines

# Load the corpus, then preview six randomly chosen rows as the cell output
lines = read_data()
lines.sample(n = 6)


Unnamed: 0,source,target,comments
94111,Why do you love me so much?,Warum liebst du mich so sehr?,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
14034,Tom was blinded.,Tom war geblendet.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
66644,"This is my sister, Mary.",Das ist meine Schwester Maria.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
161687,They helped each other with homework.,Sie halfen sich gegenseitig bei den Hausaufgaben.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
167984,We have more in common than I thought.,"Wir haben mehr Gemeinsamkeiten, als ich dachte.",CC-BY 2.0 (France) Attribution: tatoeba.org #3...
135725,I should just tell Tom the truth.,Ich sollte Tom einfach die Wahrheit sagen.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [165]:
# Cleaning the data (the same steps are applied to BOTH columns):
#   - convert to lowercase
#   - remove quotes
#   - remove punctuation (string.punctuation, i.e. ASCII punctuation only)
#   - remove digits (string.digits, i.e. ASCII digits only)
#   - trim the ends and collapse runs of spaces
# Afterwards every target sentence is wrapped in <sos> ... <eos>.

#A list of all punctuations
# Membership is tested one character at a time, so only single characters
# belong in this set (a multi-character entry such as '...' can never match).
punc = set(string.punctuation)

#Translation table that deletes every digit
num_digits = str.maketrans('','', digits)


def _clean_sentence(text):
    '''Normalise one sentence; see the step list at the top of this cell.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        Lower-cased sentence without quotes, punctuation or digits, with
        surrounding whitespace stripped and repeated spaces collapsed to one.
    '''
    text = text.lower()
    text = re.sub("'", '', text)
    text = ''.join(ch for ch in text if ch not in punc)
    text = text.translate(num_digits)
    # strip() first, so the collapse below only sees interior runs of spaces
    text = text.strip()
    return re.sub(" +", " ", text)


#Read the data
lines = read_data()
#Shuffle the data
lines = shuffle(lines)

#Clean both sides with the same pipeline
lines.source = lines.source.apply(_clean_sentence)
lines.target = lines.target.apply(_clean_sentence)

#adding start/end tags (target side only; the source stays untagged)
lines.target = lines.target.apply(lambda x : '<sos> '+ x + ' <eos>')
lines.sample(6)


Unnamed: 0,source,target,comments
36932,tom gladly accepted,<sos> tom stimmte bereitwillig zu <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
101388,tom was wearing a brown hat,<sos> tom trug einen braunen hut <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #8...
165656,is there a souvenir shop in the hotel,<sos> gibt es einen souvenirladen in dem hotel...,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
46147,youre such a coward,<sos> du bist so ein feigling <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
120641,i dont want to make you angry,<sos> ich möchte dich nicht ärgern <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
135551,i know that tom has been helpful,<sos> ich weiß dass tom hilfreich gewesen ist ...,CC-BY 2.0 (France) Attribution: tatoeba.org #6...


In [166]:
#The whole Source set length
# Every source token, repeats included, so the corpus size can be reported.
source_tokens = [token for line in lines.source for token in line.split(' ')]

# Sorted unique tokens; sorting keeps the word -> index mapping stable for a
# given corpus.
source_vocab = sorted(set(source_tokens))
target_vocab = sorted({token for line in lines.target for token in line.split(' ')})

print (f'The Whole Source count \t\t{len(source_tokens)}')
print (f'Uniques in the Source count \t{len(source_vocab)}')
print ()
print (f'Uniques in the Target count \t{len(target_vocab)}')
#NOTE: the target side has far more unique tokens than the source side

# Both word -> index dictionaries in this notebook start at 1, and the batch
# arrays are zero-initialised, so index 0 means "padding". An Embedding layer
# needs input_dim > largest index, hence the +1 on BOTH counts (the encoder
# count used to lack it, leaving the last source word out of range).
# Input tokens for encoder zero padded
num_encoder_tokens=len(source_vocab) +1
# Input tokens for decoder zero padded
num_decoder_tokens=len(target_vocab) +1


The Whole Source count 		1402031
Uniques in the Source count 	16309

Uniques in the Target count 	35922


In [167]:
# Sentence lengths, measured in space-separated tokens, and the longest one
# on each side of the corpus.
source_length_list = [len(sentence.split(' ')) for sentence in lines.source]
src_max_length = max(source_length_list)

target_length_list = [len(sentence.split(' ')) for sentence in lines.target]
trg_max_length = max(target_length_list)

print (f'Max length of Source lang: {src_max_length}')
print (f'Max length of Target lang: {trg_max_length}')

# word -> index lookups; numbering starts at 1
source_word2idx = {word: position + 1 for position, word in enumerate(source_vocab)}
target_word2idx = {word: position + 1 for position, word in enumerate(target_vocab)}

# index -> word lookups, obtained by inverting the dictionaries above
source_idx2word = {index: word for word, index in source_word2idx.items()}
target_idx2word = {index: word for word, index in target_word2idx.items()}

#Shuffle the data


Max length of Source lang: 101
Max length of Target lang: 77


In [168]:
#Splitting the data
#train_test_split from Sklearn lib

# Shuffle both arrays in ONE call so they receive the same permutation.
# Shuffling them in two separate calls would permute each side independently
# and pair every source sentence with an unrelated target sentence.
X, y = shuffle(np.array(lines.source), np.array(lines.target))

# Hold out 10% of the (still aligned) pairs for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
print (f'Source Training data shape: \t{X_train.shape}')
print (f'Target Training data shape: \t{y_train.shape}')
print ()
print (f'Source Test data shape: \t{X_test.shape}')
print (f'Target Test data shape: \t{y_test.shape}')


Source Training data shape: 	(199379,)
Target Training data shape: 	(199379,)

Source Test data shape: 	(22154,)
Target Test data shape: 	(22154,)


In [169]:
#generate_batch() will provide batches of data for fit_generator()

def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate batches of data, endlessly.

    Walks over X / y in slices of `batch_size` and starts again from the
    beginning once the end is reached, so the generator never terminates.

    Parameters
    ----------
    X : sequence of str
        Source sentences; every whitespace-separated word must be a key of
        `source_word2idx`.
    y : sequence of str
        Target sentences, position-aligned with X; every whitespace-separated
        word must be a key of `target_word2idx`.
    batch_size : int
        Maximum number of sentence pairs per batch.

    Yields
    ------
    ([encoder_input_data, decoder_input_data], decoder_target_data)
        With n = number of pairs in the current slice:
        encoder_input_data  -- (n, src_max_length) source word indices
        decoder_input_data  -- (n, trg_max_length) target word indices,
                               last word of each sentence left out
        decoder_target_data -- (n, trg_max_length, num_decoder_tokens)
                               one-hot targets, first word left out, i.e. the
                               decoder input shifted one step to the left
        Unused positions stay 0.

    Raises
    ------
    KeyError
        If a word is missing from the corresponding word -> index dictionary.
    '''
    while True:
        for start in range(0, len(X), batch_size):
            pairs = list(zip(X[start:start + batch_size], y[start:start + batch_size]))
            # The last slice can hold fewer than batch_size pairs. Size the
            # arrays to the real number of pairs so that no all-zero phantom
            # samples are handed to the model.
            n_rows = len(pairs)

            encoder_input_data = np.zeros((n_rows, src_max_length),dtype='float32')
            decoder_input_data = np.zeros((n_rows, trg_max_length),dtype='float32')
            # NOTE(review): this array holds n_rows * trg_max_length *
            # num_decoder_tokens float32 values. With the sizes recorded in
            # this notebook (128 x 77 x 35923) that is roughly 1.4 GB per
            # batch. Integer targets with a sparse loss would avoid it, but
            # need a matching change where the model is compiled.
            decoder_target_data = np.zeros((n_rows, trg_max_length, num_decoder_tokens),dtype='float32')

            for row, (input_text, target_text) in enumerate(pairs):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[row, t] = source_word2idx[word]

                # Split once per sentence instead of once per token
                target_words = target_text.split()
                last_position = len(target_words) - 1
                for t, word in enumerate(target_words):
                    if t < last_position:
                        # decoder input seq: everything except the final word
                        decoder_input_data[row, t] = target_word2idx[word]
                    if t > 0:
                        # decoder target sequence (one hot encoded):
                        # everything except the first word, offset by one
                        # timestep
                        decoder_target_data[row, t - 1, target_word2idx[word]] = 1.

            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [170]:
#MODEL
# Encoder-decoder (seq2seq) network: an embedding + LSTM encoder whose final
# states initialise an embedding + LSTM decoder, followed by a softmax over
# the target vocabulary.

#Essential parameters
train_samples = len(X_train)
test_samples = len(X_test)
batch_size = 128
epochs = 15
# Used both as the embedding size and as the LSTM state size
latent_dim=256

# Define an input sequence and process it.
# shape=(None,) leaves the sequence length unspecified.
encoder_inputs = Input(shape=(None,))
# mask_zero = True treats index 0 as padding.
# NOTE(review): Embedding needs input_dim greater than the largest index it
# receives, and source_word2idx is 1-based -- confirm num_encoder_tokens
# includes the padding slot.
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
# The embedding layer is kept in its own variable, separate from its output.
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# One probability per target-vocabulary index at every decoder timestep
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that takes encoder and decoder input 
# to output decoder_outputs
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

#optimizer: rmsprop (TODO: try adam)
#loss: categorical cross entropy, which expects one-hot encoded targets
#metric: accuracy, reported under the name 'acc'
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])


In [171]:
# Train from the batch generators. Model.fit accepts Python generators
# directly; fit_generator is its deprecated predecessor.
# Both generators loop forever, so the number of batches per epoch and per
# validation pass has to be given explicitly. Integer division means a final
# partial batch is not counted in either figure.
model.fit(generate_batch(X_train, y_train, batch_size = batch_size),
          steps_per_epoch = train_samples//batch_size,
          epochs=epochs,
          validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
          validation_steps = test_samples//batch_size)

Epoch 1/15


ResourceExhaustedError:  OOM when allocating tensor with shape[128,77,35923] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node categorical_crossentropy/Log (defined at <ipython-input-171-456f6b8ec204>:5) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_70269]

Function call stack:
train_function


In [None]:
# Persist the trained weights. The file name must be a plain-quoted string:
# typographic quotes (‘ ’) are not valid Python string delimiters.
# NOTE(review): the file name says "100epochs" while `epochs` is set to 15 in
# this notebook -- confirm which one is intended.
model.save_weights('nmt_weights_100epochs.h5')