In [1]:
!wget http://www.manythings.org/anki/deu-eng.zip

--2022-04-12 02:19:05--  http://www.manythings.org/anki/deu-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.92.44, 172.67.186.54, 2606:4700:3033::ac43:ba36, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.92.44|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9347491 (8.9M) [application/zip]
Saving to: ‘deu-eng.zip’


2022-04-12 02:19:05 (60.9 MB/s) - ‘deu-eng.zip’ saved [9347491/9347491]



In [16]:
import string
import re
from pickle import dump,load
from unicodedata import normalize
from numpy import array,argmax
from zipfile import ZipFile
from pickle import load
from pickle import dump
from numpy.random import shuffle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,RepeatVector,TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import corpus_bleu

In [17]:
file_name = "deu-eng.zip"
with ZipFile(file_name, 'r') as zip:
    print('Extracting all the files now...')
    zip.extractall()
    print('Done!')

Extracting all the files now...
Done!


In [18]:
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [19]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

In [20]:
def clean_pairs(lines):
    cleaned = list()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    for pair in lines:
        clean_pair = list()
        for line in pair:
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            line = line.split()
            line = [word.lower() for word in line]
            line = [re_punc.sub('', w) for w in line]
            line = [re_print.sub('', w) for w in line]
            line = [word for word in line if word.isalpha()]
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [21]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved:',filename)

filename = 'deu.txt'
doc = load_doc(filename)
pairs = to_pairs(doc)
clean_pairs = clean_pairs(pairs)
save_clean_data(clean_pairs, 'english-german.pkl')
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[duck] => [kopf runter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stay] => [bleib]
[stop] => [stopp]
[stop] => [anhalten]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[do it] => [mache es]
[do it] => [tue es]
[go on] => [mach weiter]
[hello] => [hallo]
[hello] => [sers]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i hid] => [ich versteckte mich]
[i hid] => [ich habe mich versteckt]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich versuche es]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[i won] => [ich habe gewonnen]
[oh no] => [oh nein]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[sorry] => [entschuldigung]
[ask me] => [frag mich]
[ask me] => [fragt mich]
[ask me] => [fragen sie mich]
[at

In [22]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

raw_dataset = load_clean_sentences('english-german.pkl')

n_sentences = 10000
dataset = raw_dataset[:n_sentences, :2]

shuffle(dataset)

train, test = dataset[:9000], dataset[9000:]

save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [23]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [24]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    return max(len(line.split()) for line in lines)


eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

English Vocabulary Size: 2171
English Max Length: 5
German Vocabulary Size: 3533
German Max Length: 9


In [25]:
train

array([['help yourself', 'bedien dich'],
       ['come see me', 'komm und besuch mich'],
       ['wish me luck', 'druckt mir die daumen'],
       ...,
       ['im so excited', 'ich bin ja so aufgeregt'],
       ['i ate too much', 'ich habe zu viel gegessen'],
       ['youve tried', 'du hast es versucht']], dtype='<U527')

In [26]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.summary()
    return model

In [27]:
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)

checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1,save_best_only=True, mode='min')

English Vocabulary Size: 2171
English Max Length: 5
German Vocabulary Size: 3533
German Max Length: 9
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9, 256)            904448    
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 5, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 5, 2171)          557947    
 ibuted)                                                         
                    

In [28]:
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY),callbacks=[checkpoint], verbose=2)

Epoch 1/30

Epoch 1: val_loss improved from inf to 3.36337, saving model to model.h5
141/141 - 15s - loss: 4.1122 - val_loss: 3.3634 - 15s/epoch - 109ms/step
Epoch 2/30

Epoch 2: val_loss improved from 3.36337 to 3.21857, saving model to model.h5
141/141 - 3s - loss: 3.1992 - val_loss: 3.2186 - 3s/epoch - 19ms/step
Epoch 3/30

Epoch 3: val_loss improved from 3.21857 to 3.09473, saving model to model.h5
141/141 - 3s - loss: 3.0484 - val_loss: 3.0947 - 3s/epoch - 18ms/step
Epoch 4/30

Epoch 4: val_loss improved from 3.09473 to 2.97421, saving model to model.h5
141/141 - 3s - loss: 2.8908 - val_loss: 2.9742 - 3s/epoch - 18ms/step
Epoch 5/30

Epoch 5: val_loss improved from 2.97421 to 2.87719, saving model to model.h5
141/141 - 3s - loss: 2.7561 - val_loss: 2.8772 - 3s/epoch - 18ms/step
Epoch 6/30

Epoch 6: val_loss improved from 2.87719 to 2.80320, saving model to model.h5
141/141 - 3s - loss: 2.6282 - val_loss: 2.8032 - 3s/epoch - 18ms/step
Epoch 7/30

Epoch 7: val_loss improved from 2.8

<keras.callbacks.History at 0x7f59ba762b10>

In [29]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    return max(len(line.split()) for line in lines)

def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

def evaluate_model(model, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

In [32]:
evaluate_model(model, trainX, train)

src=[bedien dich], target=[help yourself], predicted=[thanks yourself]
src=[komm und besuch mich], target=[come see me], predicted=[come see me]
src=[druckt mir die daumen], target=[wish me luck], predicted=[wish me luck]
src=[ich unterstutze ihn], target=[i support him], predicted=[i support him]
src=[ich bin pingelig], target=[im finicky], predicted=[im picky]
src=[das ist aus gold], target=[this is gold], predicted=[this is gold]
src=[gib ihn tom], target=[hand it to tom], predicted=[hand it to tom]
src=[es ist ihre pflicht], target=[its your duty], predicted=[its your duty]
src=[siehe unten], target=[see below], predicted=[see below]
src=[wir haben einen hund], target=[we have a dog], predicted=[we have a car]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.084573
BLEU-2: 0.275912
BLEU-3: 0.442778
BLEU-4: 0.498356


In [42]:
val = 5
source = trainX[val].reshape((1, trainX[val].shape[0]))
translation = predict_sequence(model, eng_tokenizer, source)
train[val][1],train[val][0],translation

('das ist aus gold', 'this is gold', 'this is gold')