In [2]:
import pandas as pd

In [3]:
ks = pd.read_csv('data/kidsspelling.csv')
ks = ks.drop(['Code', 'Semester', 'Unnamed: 4'], axis=1)
ks = ks.dropna()
ks["Target"] = ks.Target.apply(lambda x: x.strip())
ks[0:5]

Unnamed: 0,Target,Spelling
0,favorite,favtit
1,throw,thow
2,catch,cach
3,touchdown,tuchdone
4,dance,dans


In [11]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [9]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [10]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    print(len(X), X[0])
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [12]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

In [13]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [33]:
spelling_tokens = [[letter for letter in word] for word in ks['Spelling']]
target_tokens = [[letter for letter in word] for word in ks['Target']]

spelling_train = spelling_tokens[:900]
target_train = target_tokens[:900]

spelling_test = spelling_tokens[900:]
target_test = target_tokens[900:]
len(target_test)

458

In [26]:
spelling_tokenizer = create_tokenizer(spelling_tokens)
target_tokenizer = create_tokenizer(target_tokens)

In [27]:
spelling_size = len(spelling_tokenizer.word_index)+1
target_size = len(target_tokenizer.word_index)+1
target_size

32

In [31]:
spelling_max = max(len(token) for token in spelling_tokens)
target_max =  max(len(token) for token in target_tokens)
target_max

17

In [34]:
# prepare training data
trainX = encode_sequences(spelling_tokenizer, spelling_max, spelling_train)
trainY = encode_sequences(target_tokenizer, target_max, target_train)
trainY = encode_output(trainY, target_size)
# prepare validation data
testX = encode_sequences(spelling_tokenizer, spelling_max, spelling_test)
testY = encode_sequences(target_tokenizer, target_max, target_test)
testY = encode_output(testY, target_size)

900 [17, 4, 22, 3, 5, 3]
900 [19, 4, 22, 6, 7, 3, 2, 1]
458 [9, 4, 8, 12, 5, 12]
458 [9, 4, 8, 12, 1, 12]


In [35]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

In [36]:
# define model
model = define_model(spelling_size, target_size, spelling_max, target_max, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
#plot_model(model, to_file='model.png', show_shapes=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 256)           7680      
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 17, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 17, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 17, 32)            8224      
Total params: 1,066,528
Trainable params: 1,066,528
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Train on 900 samples, validate on 458 samples
Epoch 1/30
 - 19s - loss: 2.4612 - val_loss: 1.5337

Epoch 00001: val_loss improved from inf to 1.53368, saving model to model.h5
Epoch 2/30
 - 9s - loss: 1.3558 - val_loss: 1.2983

Epoch 00002: val_loss improved from 1.53368 to 1.29831, saving model to model.h5
Epoch 3/30
 - 8s - loss: 1.2445 - val_loss: 1.3173

Epoch 00003: val_loss did not improve from 1.29831
Epoch 4/30
 - 8s - loss: 1.2134 - val_loss: 1.2290

Epoch 00004: val_loss improved from 1.29831 to 1.22897, saving model to model.h5
Epoch 5/30
 - 8s - loss: 1.1869 - val_loss: 1.2239

Epoch 00005: val_loss improved from 1.22897 to 1.22388, saving model to model.h5
Epoch 6/30
 - 8s - loss: 1.1618 - val_loss: 1.1942

Epoch 00006: val_loss improved from 1.22388 to 1.19425, saving model to model.h5
Epoch 7/30
 - 8s - loss: 1.1300 - val_loss: 1.1683

Epoch 00007: val_loss improved from 1.19425 to 1.16831, saving model to model.h5
Epoch 8/30
 - 9s - loss: 1.1034 - val_loss: 1.1234

Epoc

<keras.callbacks.History at 0x1d0bf4a4400>

In [39]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

In [40]:
model = load_model('model.h5')

In [41]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

In [42]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [70]:
def evaluate_model(model, tokenizer, sources, spelling_train, target_train):
    correct = []
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, target_tokenizer, source).replace(" ", "")
        raw_target, raw_src = ''.join(target_train[i]), ''.join(spelling_train[i])
        print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        correct.append(raw_target == translation)
    print(str(sum(correct))+ " / " +str(len(correct)))
   
# for s, t in zip(spelling_tokens, target_tokens):
  #  translation = predict_sequence(model, eng_tokenizer, source)

In [71]:
evaluate_model(model, target_tokenizer, trainX, spelling_train, target_train)

src=[favtit], target=[favorite], predicted=[tttttt]
src=[thow], target=[throw], predicted=[thhh]
src=[cach], target=[catch], predicted=[sooo]
src=[tuchdone], target=[touchdown], predicted=[ccooohh]
src=[dans], target=[dance], predicted=[siies]
src=[villajg], target=[villager], predicted=[llllle]
src=[pots], target=[puts], predicted=[sosss]
src=[cadig], target=[cabbage], predicted=[saaee]
src=[carit], target=[carrot], predicted=[tttte]
src=[smels], target=[smells], predicted=[semmss]
src=[ston], target=[stone], predicted=[soit]
src=[soop], target=[soup], predicted=[soooe]
src=[magick], target=[magic], predicted=[saiii]
src=[Jonsun], target=[Johnson], predicted=[fiiiin]
src=[fol], target=[full], predicted=[wlll]
src=[korts], target=[court], predicted=[sotss]
src=[haf], target=[half], predicted=[whe]
src=[korts], target=[court], predicted=[sotss]
src=[Lackrs], target=[Lakers], predicted=[cllcees]
src=[scoot], target=[scooped], predicted=[soooo]
src=[gast], target=[gasped], predicted=[ssss

src=[blu], target=[blue], predicted=[woll]
src=[chokolet], target=[chocolate], predicted=[coooole]
src=[tast], target=[taste], predicted=[sttt]
src=[becase], target=[because], predicted=[saccse]
src=[ran ], target=[rain], predicted=[win]
src=[stad], target=[stayed], predicted=[saaed]
src=[insaen], target=[inside], predicted=[siiiin]
src=[reses], target=[recess], predicted=[sssss]
src=[stad], target=[stayed], predicted=[saaed]
src=[agen], target=[again], predicted=[winn]
src=[insid], target=[inside], predicted=[siiid]
src=[wen], target=[when], predicted=[wnn]
src=[raning], target=[raining], predicted=[iinnnn]
src=[wen], target=[went], predicted=[wnn]
src=[becas ], target=[because], predicted=[saccme]
src=[frind], target=[friend], predicted=[fiinn]
src=[hase], target=[house], predicted=[seee]
src=[gona], target=[going to (gonna)], predicted=[woon]
src=[winn], target=[when], predicted=[wnnnn]
src=[littol], target=[little], predicted=[coool]
src=[rid ], target=[ride], predicted=[seed]
src=

src=[fite], target=[fight], predicted=[titt]
src=[fiteing], target=[fighting], predicted=[tiiinn]
src=[casis], target=[cases], predicted=[ssssss]
src=[stared], target=[started], predicted=[sarree]
src=[fiered], target=[fired], predicted=[rrredd]
src=[fand], target=[found], predicted=[finn]
src=[fand], target=[found], predicted=[finn]
src=[fierst], target=[first], predicted=[siitt]
src=[fite], target=[fight], predicted=[titt]
src=[oners], target=[owners], predicted=[siiess]
src=[evething], target=[everything], predicted=[ttttiing]
src=[tietts], target=[tickets], predicted=[stttt]
src=[agenst], target=[against], predicted=[siiitt]
src=[beagen], target=[begin], predicted=[aaiinn]
src=[bot], target=[bought], predicted=[toot]
src=[seads], target=[seeds], predicted=[ssesse]
src=[grand], target=[ground], predicted=[riine]
src=[gluve], target=[glove], predicted=[lllle]
src=[rete], target=[ready], predicted=[ttee]
src=[cot], target=[caught], predicted=[toot]
src=[faerl], target=[foul], predicte

src=[hirow], target=[hero], predicted=[whhe]
src=[sow], target=[so], predicted=[soo]
src=[somwan], target=[someone], predicted=[soooin]
src=[nids], target=[needs], predicted=[siiss]
src=[cen], target=[can], predicted=[win]
src=[hir], target=[her], predicted=[thee]
src=[war], target=[world], predicted=[wee]
src=[cach], target=[catch], predicted=[sooo]
src=[en], target=[on], predicted=[win]
src=[fiyor], target=[fire], predicted=[waoee]
src=[favurite], target=[favorite], predicted=[ceccrree]
src=[fer], target=[fair], predicted=[trre]
src=[sow], target=[so], predicted=[soo]
src=[cen], target=[can], predicted=[win]
src=[gowen], target=[go on], predicted=[woonn]
src=[rids], target=[rides], predicted=[saeee]
src=[win], target=[when], predicted=[wnn]
src=[stof], target=[stuffed], predicted=[soott]
src=[anamols], target=[animals], predicted=[lllleee]
src=[dident], target=[didn't], predicted=[fiiiee]
src=[gow], target=[go], predicted=[woo]
src=[rad], target=[ride], predicted=[waed]
src=[tat], ta

src=[frend], target=[friend], predicted=[riine]
src=[Halleween], target=[halloween], predicted=[lllllee]
src=[coms], target=[comes], predicted=[ssssss]
src=[hous], target=[house], predicted=[soos]
src=[ded], target=[dead], predicted=[sadd]
src=[blod], target=[blood], predicted=[wolll]
src=[wen], target=[when], predicted=[wnn]
src=[axed], target=[asked], predicted=[saed]
src=[cood], target=[could], predicted=[wool]
src=[yoos], target=[use], predicted=[sooe]
src=[sed], target=[said], predicted=[saed]
src=[brock], target=[broke], predicted=[coook]
src=[here], target=[her], predicted=[tere]
src=[sed], target=[said], predicted=[saed]
src=[watever], target=[whatever], predicted=[rrrrree]
src=[riped], target=[ripped], predicted=[fiidd]
src=[hed], target=[head], predicted=[weed]
src=[wat], target=[what], predicted=[the]
src=[wood], target=[would], predicted=[wool]
src=[thot], target=[thought], predicted=[thtt]
src=[ges], target=[guess], predicted=[ssss]
src=[mised], target=[missed], predicted=

src=[funnyest], target=[funniest], predicted=[fiiiitt]
src=[joks], target=[jokes], predicted=[soous]
src=[alwase], target=[always], predicted=[alllee]
src=[vary], target=[very], predicted=[taeyy]
src=[takeing], target=[taking], predicted=[aiiinn]
src=[did't], target=[didn't], predicted=[fiiiee]
src=[woched], target=[watched], predicted=[coooee]
src=[moove], target=[movie], predicted=[woooe]
src=[scard], target=[scared], predicted=[saaaee]
src=[alwase], target=[always], predicted=[alllee]
src=[Texis], target=[Texas], predicted=[saiss]
src=[travelling], target=[traveling], predicted=[lllllinn]
src=[fownd], target=[found], predicted=[wonnn]
37 / 900
