In [57]:
from pickle import dump 
import re
import numpy as np
from unicodedata import normalize
import string

<h3>Read data</h3>
<p>Data from http://www.manythings.org/, German words and their translations</p>

In [17]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [18]:
doc = load_doc('data/deu-eng/deu.txt')

In [19]:
doc[:20]

'Hi.\tHallo!\nHi.\tGrüß '

<h3>Cleaning the data</h3>

In [20]:
def to_pairs(doc):
    lines = doc.split('\n')
    pairs = [x.split('\t') for x in lines]
    return pairs

In [21]:
pairs = to_pairs(doc)
pairs[:5]

[['Hi.', 'Hallo!'],
 ['Hi.', 'Grüß Gott!'],
 ['Run!', 'Lauf!'],
 ['Fire!', 'Feuer!'],
 ['Help!', 'Hilfe!']]


- Remove all non-printable characters.
- Remove all punctuation characters.
- Normalize all Unicode characters to ASCII (e.g. Latin characters).
- Normalize the case to lowercase.
- Remove any remaining tokens that are not alphabetic.


In [26]:
# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return np.array(cleaned)

The unicodedata module offers a .normalize() function, you want to normalize to the NFC form. NFC, or 'Normal Form Composed' returns composed characters, NFD, 'Normal Form Decomposed' gives you decomposed, combined characters.

In [27]:
cleaned_pairs = clean_pairs(pairs)
cleaned_pairs[:5]

array([list(['hi', 'hallo']), list(['hi', 'gru gott']),
       list(['run', 'lauf']), list(['fire', 'feuer']),
       list(['help', 'hilfe'])], dtype=object)

<h3>Save the cleansed data</h3>

In [32]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [38]:
save_clean_data(cleaned_pairs, "data/deu-eng/cleansed_deu.pkl")

Saved: data/deu-eng/cleansed_deu.pkl


<h3>Loading the processed data</h3>

In [4]:
from pickle import load
from numpy.random import rand
from numpy.random import shuffle

In [5]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [6]:
raw_dataset = load_clean_sentences('data/deu-eng/cleansed_deu.pkl')

In [7]:
raw_dataset[:5]

array([list(['hi', 'hallo']), list(['hi', 'gru gott']),
       list(['run', 'lauf']), list(['fire', 'feuer']),
       list(['help', 'hilfe'])], dtype=object)

In [8]:
len(raw_dataset)

169814

<h3>Test Train Split</h3>

In [9]:
n_sentences = 10000
dataset = raw_dataset[:n_sentences]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]

In [10]:
train[:5], len(train)

(array([list(['go to school', 'geh in die schule']),
        list(['that isnt tom', 'das ist nicht tom']),
        list(['tom saw you', 'tom sah dich']),
        list(['how late is it', 'wie spat ist es']),
        list(['tom is blind', 'tom ist blind'])], dtype=object), 9000)

In [11]:
test[:5], len(test)

(array([list(['wake tom up', 'wecke tom']),
        list(['dont worry', 'mach dir keine gedanken']),
        list(['tom is relaxed', 'tom ist entspannt']),
        list(['im not ok', 'ich bin nicht okay']),
        list(['he was alone', 'er war alleine'])], dtype=object), 1000)

<h3>Tokenize</h3>

Just get the english datset

In [12]:
english_lines = [x[0] for x in dataset]
english_lines[:4]

['go to school', 'that isnt tom', 'tom saw you', 'how late is it']

In [29]:
def max_length(lines):
    return max(len(line.split()) for line in lines)

In [31]:
eng_length = max_length(english_lines)
eng_length

5

In [13]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [14]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [15]:
eng_tokenizer = create_tokenizer(english_lines)

In [16]:
from pandas import DataFrame

In [20]:
type(eng_tokenizer.word_index)

dict

In [44]:
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_vocab_size

2315

In [24]:
eng_token_mapping = DataFrame.from_dict(eng_tokenizer.word_index, orient = 'index')

In [26]:
eng_token_mapping[:5]

Unnamed: 0,0
tom,1
i,2
it,3
is,4
you,5


This is how the keras preprocessing tokenizes the english text, now we do the same for german texts.

In [28]:
de_lines = [x[1] for x in dataset]
de_tokenizer = create_tokenizer(de_lines)
de_token_mapping = DataFrame.from_dict(de_tokenizer.word_index, orient = 'index')
de_token_mapping[:5]

Unnamed: 0,0
ich,1
tom,2
ist,3
sie,4
es,5


In [32]:
de_length = max_length(de_lines)
de_length

10

In [45]:
de_vocab_size = len(de_tokenizer.word_index) + 1
de_vocab_size

3686

We have tokenized the entire dataset, now we need to apply this to the training and testing dataset using the tokenizer created. We need to encode each sequence to max length to be used as a word embedding. Keras pad sequence helps to pad all sequences into similar length. 'post' padding option allows us to add 0's after each sequence to complete the max length.

In [33]:
from keras.preprocessing.sequence import pad_sequences 

In [40]:
def encode_seq(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [50]:
trainY = encode_seq(de_tokenizer, de_length, [x[1] for x in train])
trainX = encode_seq(eng_tokenizer, eng_length, [x[0] for x in train])

testY = encode_seq(de_tokenizer, de_length, [x[1] for x in test])
testX = encode_seq(eng_tokenizer, eng_length, [x[0] for x in test])

In [51]:
trainX[:5]

array([[ 11,  15, 278,   0,   0],
       [ 16,  65,   1,   0,   0],
       [  1,  70,   5,   0,   0],
       [ 51, 144,   4,   3,   0],
       [  1,   4, 363,   0,   0]], dtype=int32)

The final layer would be a probablity distribution of each word appearing in the sentence and hence has to be one hot encoded layer.

In [65]:
trainX = trainX[:1000]
testX = testX[:1000]

In [52]:
from keras.utils import to_categorical
#Converts a class vector (integers) to binary class matrix.

In [55]:
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [60]:
trainY = encode_output(trainY[:100], de_vocab_size)

In [62]:
trainY[:5].shape

(5, 10, 3686)

In [64]:
testY = encode_output(testY[:100], de_vocab_size)

<h3>Model</h3>

In [66]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

- Sequential model is a linear stack of layers
- 

In [None]:
def def_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model