In [18]:
# Mount Google Drive at /content/drive; every dataset, pickle and model path
# used further down lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
#Import libraries
!pip install nltk
!pip install Keras-Preprocessing
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from numpy import array
from numpy import argmax
from pickle import load,dump,load
from numpy.random import shuffle
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
import string
import re
from unicodedata import normalize


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
# load doc into memory
def load_doc(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    Args:
        filename: path of the text file to read.

    Returns:
        The complete text of the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()


In [21]:
# split a loaded document into sentences
def to_pairs(doc):
    """Turn tab-separated text into one list of fields per line.

    Leading and trailing whitespace is stripped from the document, the
    remainder is cut at newlines, and each line is cut at tab characters.

    Args:
        doc: the full text, one record per line.

    Returns:
        A list with one list of strings per line.
    """
    return [row.split('\t') for row in doc.strip().split('\n')]


In [22]:
# clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence in every group and return them as an array.

    Each sentence is NFD-decomposed and reduced to ASCII (characters that
    cannot be encoded are dropped), split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and filtered down
    to purely alphabetic tokens, which are re-joined with single spaces.

    Args:
        lines: iterable of sentence groups (lists of strings).

    Returns:
        A numpy array built from the list of cleaned groups.
    """
    # character classes that are deleted from each token
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    unprintable = re.compile('[^%s]' % re.escape(string.printable))

    def scrub(sentence):
        # decompose accents, then drop everything that is not plain ASCII
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = punctuation.sub('', token.lower())
            # remove non-printable chars from each token
            token = unprintable.sub('', token)
            # tokens containing digits (or left empty) are discarded
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[scrub(sentence) for sentence in pair] for pair in lines])

In [23]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: the object to serialise (here: an array of cleaned pairs).
        filename: destination path; an existing file is overwritten.

    Raises:
        OSError: if the destination cannot be opened for writing.
    """
    # close the handle deterministically so the pickle is fully flushed
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)
# load dataset
filename = '/content/drive/MyDrive/Machine_Translation/deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
# NOTE: the result gets its own name. Rebinding `clean_pairs` would replace the
# function with an array, and re-running this cell would then fail.
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, '/content/drive/MyDrive/Machine_Translation/english-german.pkl')
# spot check (at most the first 10 pairs, fewer if the dataset is shorter)
for i in range(min(10, len(cleaned_pairs))):
    print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

Saved: /content/drive/MyDrive/Machine_Translation/english-german.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[duck] => [kopf runter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stay] => [bleib]
[stop] => [stopp]
[stop] => [anhalten]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[do it] => [mache es]
[do it] => [tue es]
[go on] => [mach weiter]
[hello] => [hallo]
[hello] => [sers]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i hid] => [ich versteckte mich]
[i hid] => [ich habe mich versteckt]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich versuche es]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[i won] => [ich habe gewonnen]
[oh no] => [oh nein]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[sorry] => [entschuldigung]
[ask me] => [frag mich]
[ask me] => [fr

In [24]:
# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in `filename`.

    Args:
        filename: path of a pickle file (here: written by save_clean_data).

    Returns:
        The unpickled object.

    Raises:
        OSError: if the file cannot be opened.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files this
    # notebook wrote itself, never pickles from an untrusted source.
    with open(filename, 'rb') as file:
        return load(file)

In [25]:
# load dataset
raw_dataset = load_clean_sentences('/content/drive/MyDrive/Machine_Translation/english-german.pkl')
# reduce dataset size
n_sentences = 10000
# copy the slice: shuffling a view in place would also reorder raw_dataset
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle with a fixed seed, so re-running this cell reproduces the same
# train/test split and a previously saved model is never scored on rows it was
# trained on
from numpy.random import RandomState
RandomState(1).shuffle(dataset)
# split into train/test: 90% / 10% (9000 / 1000 when all 10000 rows are present)
n_train = len(dataset) * 9 // 10
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, '/content/drive/MyDrive/Machine_Translation/english-german-both.pkl')
save_clean_data(train, '/content/drive/MyDrive/Machine_Translation/english-german-train.pkl')
save_clean_data(test, '/content/drive/MyDrive/Machine_Translation/english-german-test.pkl')

Saved: /content/drive/MyDrive/Machine_Translation/english-german-both.pkl
Saved: /content/drive/MyDrive/Machine_Translation/english-german-train.pkl
Saved: /content/drive/MyDrive/Machine_Translation/english-german-test.pkl


In [26]:
# load datasets: the full (shuffled) set first, then its train and test parts
dataset, train, test = [
    load_clean_sentences(path)
    for path in (
        '/content/drive/MyDrive/Machine_Translation/english-german-both.pkl',
        '/content/drive/MyDrive/Machine_Translation/english-german-train.pkl',
        '/content/drive/MyDrive/Machine_Translation/english-german-test.pkl',
    )
]

In [27]:
# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras Tokenizer with default settings and fit it on `lines`.

    Args:
        lines: the texts to fit on.

    Returns:
        The fitted Tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


In [28]:
# max sentence length
def max_length(lines):
    """Return the largest whitespace-token count found among `lines`.

    Args:
        lines: a non-empty iterable of strings.

    Returns:
        The number of tokens in the longest sentence.

    Raises:
        ValueError: if `lines` is empty.
    """
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

In [29]:
# prepare english tokenizer (column 0 of the dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
# longest English sentence, in tokens
eng_length = max_length(dataset[:, 0])
# one more than the number of known words, leaving room for index 0 that the
# padding step uses
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)


English Vocabulary Size: 2176
English Max Length: 5


In [30]:
# prepare german tokenizer (column 1 of the dataset)
ger_tokenizer = create_tokenizer(dataset[:, 1])
# longest German sentence, in tokens
ger_length = max_length(dataset[:, 1])
# one more than the number of known words, leaving room for index 0 that the
# padding step uses
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % ger_length)

German Vocabulary Size: 3534
German Max Length: 9


In [31]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode `lines` and pad the result to a fixed length.

    Args:
        tokenizer: fitted tokenizer providing `texts_to_sequences`.
        length: value passed to `pad_sequences` as `maxlen`.
        lines: the texts to encode.

    Returns:
        The result of `pad_sequences`, with zeros appended after each
        sequence (padding='post').
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [32]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every integer of every sequence.

    Args:
        sequences: 2-D array of integer-encoded, padded sequences
            (its first two shape entries are read).
        vocab_size: number of classes for the one-hot vectors.

    Returns:
        Array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

In [33]:
# prepare training data: German (column 1) is the input, one-hot English
# (column 0) is the target
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)
# prepare validation data the same way
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

In [34]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build, compile, summarise and plot the encoder-decoder model.

    The stack is: Embedding (zero-masked) -> LSTM -> RepeatVector ->
    LSTM returning sequences -> per-timestep softmax Dense.

    Args:
        src_vocab: input dimension of the embedding.
        tar_vocab: width of the softmax output layer.
        src_timesteps: input sequence length given to the embedding.
        tar_timesteps: how many times the encoder output is repeated.
        n_units: embedding width and size of both LSTM layers.

    Returns:
        The compiled model. As a side effect the summary is printed and
        the architecture diagram is written to 'model.png'.
    """
    layers = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    model = Sequential(layers)
    # compile model
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    # summarize defined model
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model
# German is the source side and English the target side; 256 is n_units
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9, 256)            904704    
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 5, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 5, 2176)          559232    
 ibuted)                                                         
                                                                 
Total params: 2,514,560
Trainable params: 2,514,560
Non-

In [35]:
# fit model
from keras.callbacks import EarlyStopping
# write the model to Drive only when val_loss improves
checkpoint = ModelCheckpoint(
    '/content/drive/MyDrive/Machine_Translation/model.h5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
)
# give up after 8 epochs without a val_loss improvement
callback = EarlyStopping(monitor='val_loss', patience=8)
model.fit(
    trainX,
    trainY,
    epochs=60,
    batch_size=64,
    validation_data=(testX, testY),
    callbacks=[checkpoint, callback],
    verbose=2,
)


Epoch 1/60
141/141 - 44s - loss: 4.0883 - val_loss: 3.3400 - 44s/epoch - 313ms/step
Epoch 2/60
141/141 - 27s - loss: 3.1912 - val_loss: 3.1917 - 27s/epoch - 192ms/step
Epoch 3/60
141/141 - 26s - loss: 3.0365 - val_loss: 3.0859 - 26s/epoch - 181ms/step
Epoch 4/60
141/141 - 25s - loss: 2.8775 - val_loss: 2.9777 - 25s/epoch - 180ms/step
Epoch 5/60
141/141 - 30s - loss: 2.7372 - val_loss: 2.8749 - 30s/epoch - 211ms/step
Epoch 6/60
141/141 - 26s - loss: 2.5853 - val_loss: 2.7812 - 26s/epoch - 184ms/step
Epoch 7/60
141/141 - 26s - loss: 2.4277 - val_loss: 2.6481 - 26s/epoch - 186ms/step
Epoch 8/60
141/141 - 26s - loss: 2.2820 - val_loss: 2.5520 - 26s/epoch - 184ms/step
Epoch 9/60
141/141 - 26s - loss: 2.1301 - val_loss: 2.4606 - 26s/epoch - 185ms/step
Epoch 10/60
141/141 - 26s - loss: 1.9923 - val_loss: 2.3643 - 26s/epoch - 183ms/step
Epoch 11/60
141/141 - 29s - loss: 1.8670 - val_loss: 2.2949 - 29s/epoch - 206ms/step
Epoch 12/60
141/141 - 26s - loss: 1.7521 - val_loss: 2.2371 - 26s/epoch - 

<keras.callbacks.History at 0x7f8284011550>

In [36]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in `tokenizer.word_index` is `integer`.

    Args:
        integer: the index to look for.
        tokenizer: object exposing a `word_index` mapping of word -> index.

    Returns:
        The first matching word, or None when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

In [37]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sequence into a space-joined string.

    Args:
        model: object whose `predict(source, verbose=0)` returns a batch of
            per-timestep score vectors; only the first batch item is used.
        tokenizer: tokenizer used to map predicted indices back to words.
        source: the encoded input handed to `model.predict`.

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first index that has no word (e.g. the padding index).
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        word = word_for_id(argmax(scores), tokenizer)
        if word is None:
            break
        words.append(word)
    return ' '.join(words)

In [38]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every source sequence and print corpus BLEU-1..4 scores.

    Args:
        model: trained model passed on to predict_sequence.
        tokenizer: target-language tokenizer used to decode predictions.
        sources: array of encoded source sequences, one row per sentence.
        raw_dataset: rows of cleaned text aligned with `sources`; column 0
            is read as the reference target and column 1 as the source.

    Returns:
        None. The first 20 translations and the four BLEU scores are printed.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # add a leading batch dimension of 1 before predicting
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer that was passed in; the global
        # eng_tokenizer used here before silently ignored the argument
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i][0], raw_dataset[i][1]
        if i < 20:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects a list of references per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # uniform 1/3 weights: the previous (0.3, 0.3, 0.3, 0) summed to 0.9,
    # which inflates the reported score
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [40]:
# prepare english (column 0) and german (column 1) tokenizers
eng_tokenizer, ger_tokenizer = (create_tokenizer(dataset[:, column]) for column in (0, 1))

# vocabulary sizes: number of known words plus one
eng_vocab_size, ger_vocab_size = (
    len(tok.word_index) + 1 for tok in (eng_tokenizer, ger_tokenizer)
)

# longest sentence per language, in tokens
eng_length, ger_length = (max_length(dataset[:, column]) for column in (0, 1))


In [41]:
# prepare data: encode the German column of the train and test splits
trainX, testX = (
    encode_sequences(ger_tokenizer, ger_length, split[:, 1])
    for split in (train, test)
)

In [42]:
# load model
# this is the path the ModelCheckpoint callback wrote to during training
# (saved only when val_loss improved)
model = load_model('/content/drive/MyDrive/Machine_Translation/model.h5')

In [43]:
# test on some training sequences
# trainX holds the encoded German sources; train supplies the raw text that is
# printed and used as BLEU references
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)

train
src=[das gehort tom], target=[its toms], predicted=[thats toms]
src=[er hat mir den laufpass gegeben], target=[he dumped me], predicted=[he dumped me]
src=[tom wei es jetzt], target=[tom knows now], predicted=[tom knows it]
src=[ich rieche kaffee], target=[i smell coffee], predicted=[i smell coffee]
src=[du bist zuruck], target=[youre back], predicted=[youre back]
src=[uns ist hei], target=[were hot], predicted=[were hot]
src=[scham dich], target=[shame on you], predicted=[shame on you]
src=[es ist geschehen], target=[its happened], predicted=[its happened]
src=[ich habe eine arbeit], target=[ive got a job], predicted=[i have a job]
src=[frag einen lehrer], target=[ask a teacher], predicted=[ask a teacher]
src=[ich habe noch mehr], target=[ive got more], predicted=[i got more]
src=[tom ist fair], target=[tom is fair], predicted=[tom is fair]
src=[komm nicht zu spat], target=[dont be late], predicted=[dont be late]
src=[atme aus], target=[breathe out], predicted=[breathe]
src=[er 

In [44]:
# test on some test sequences
# testX holds the encoded German sources; test supplies the raw text that is
# printed and used as BLEU references
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

test
src=[ich habe ihnen geglaubt], target=[i believed you], predicted=[i believed you]
src=[es ist warm hier], target=[its warm here], predicted=[its here]
src=[nimm nur eine], target=[take only one], predicted=[take only one]
src=[die glocke lautete], target=[the bell rang], predicted=[youre cry]
src=[ich habe kein talent], target=[im untalented], predicted=[i ate no]
src=[sie sind bescheiden], target=[youre modest], predicted=[youre modest]
src=[ich bin nass], target=[im wet], predicted=[im am]
src=[macht ein nickerchen], target=[take a nap], predicted=[take a nap]
src=[ist ihnen nicht hei], target=[arent you hot], predicted=[arent you hot]
src=[uberrascht mich], target=[surprise me], predicted=[me me]
src=[das ist in ordnung], target=[this is ok], predicted=[thats is]
src=[tom ist auf], target=[toms up], predicted=[tom is tidy]
src=[ist das nicht cool], target=[isnt it cool], predicted=[is it easy]
src=[gib mir eine minute], target=[wait a minute], predicted=[give a a]
src=[aber si