In [145]:
'''
Tal Weiss Deep Spell
https://github.com/MajorTal/DeepSpell
'''

import pandas as pd
import numpy as np
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros

from keras.models import Sequential, load_model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout, recurrent
from keras.callbacks import Callback

Using TensorFlow backend.


In [233]:
# Load the kids' spelling data: drop the columns this notebook never uses,
# discard incomplete rows and trim stray whitespace around the target word.
ks = (
    pd.read_csv('data/kidsspelling.csv')
    .drop(['Code', 'Semester', 'Unnamed: 4'], axis=1)
    .dropna()
    .assign(Target=lambda frame: frame.Target.apply(lambda word: word.strip()))
)
ks.head(5)

Unnamed: 0,Target,Spelling
0,favorite,favtit
1,throw,thow
2,catch,cach
3,touchdown,tuchdone
4,dance,dans


In [206]:
def print_random_predictions(model, ctable, X_val, y_val, number_of_samples=10):
    """Print randomly chosen validation examples next to the model's guess.

    For each sample three lines are printed: ``Q`` (the model input, reversed
    back when ``CONFIG.inverted`` is set), ``A`` (the expected output) and the
    model's guess prefixed with a green check mark or a red cross.

    # Arguments
        model: object exposing ``predict(x, verbose=0)`` that returns per-step
            class scores for a batch.
        ctable: table exposing ``decode(X, calc_argmax=True)``.
        X_val: encoded validation inputs, indexable by slice on the first axis.
        y_val: encoded validation targets aligned with ``X_val``.
        number_of_samples: how many examples to draw (with replacement).
    """
    print()
    # random_randint(0, 0) raises ValueError, so draw nothing from an empty set.
    draws = number_of_samples if len(X_val) else 0
    for _ in range(draws):
        ind = random_randint(0, len(X_val))
        # Slicing keeps the leading batch axis of size one.
        rowX, rowy = X_val[ind:ind + 1], y_val[ind:ind + 1]
        # `predict_classes` is gone from current Keras releases; taking the
        # arg-max over the last axis of `predict` yields the same class ids.
        preds = model.predict(rowX, verbose=0).argmax(axis=-1)
        q = ctable.decode(rowX[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        if CONFIG.inverted:
            print('Q', q[::-1]) # inverted back!
        else:
            print('Q', q)
        print('A', correct)
        if correct == guess:
            verdict = Colors.green + '☑' + Colors.close
        else:
            verdict = Colors.red + '☒' + Colors.close
        print(verdict, guess)
        print('---')
    print()

In [207]:
class Colors:
    """ANSI escape sequences used to colour the prediction printouts."""
    # Wrap text as: <colour> + text + close.
    green = '\033[92m'
    red = '\033[91m'
    close = '\033[0m'  # resets the terminal colour

In [208]:
def iterate_training(model, X_train, y_train, X_val, y_val, ctable, model_path='my_model.h5'):
    """Train ``model`` in rounds, saving it and previewing predictions after each.

    Exactly ``CONFIG.number_of_iterations`` rounds are run (numbered from 1).
    Each round calls ``model.fit`` with ``CONFIG.batch_size`` and
    ``CONFIG.epochs``, validates on ``(X_val, y_val)``, saves the model to
    ``model_path`` and prints a random sample of validation predictions.

    # Arguments
        model: object exposing ``fit`` and ``save``.
        X_train, y_train: training inputs and targets.
        X_val, y_val: validation inputs and targets.
        ctable: character table handed on to ``print_random_predictions``.
        model_path: file the model is saved to after every round.
    """
    print("Starting training")
    # The upper bound is inclusive so the configured number of rounds really runs.
    for iteration in range(1, CONFIG.number_of_iterations + 1):
        print()
        print('-' * 50)
        print('Iteration', iteration)
        model.fit(X_train, y_train, batch_size=CONFIG.batch_size, epochs=CONFIG.epochs,
                  validation_data=(X_val, y_val))
        print("saving model")
        model.save(model_path)
        print_random_predictions(model, ctable, X_val, y_val)

In [220]:
def generate_model(output_len, chars=None):
    """Build and compile the encoder/decoder LSTM network.

    # Arguments
        output_len: number of time steps the decoder emits.
        chars: alphabet whose length fixes the one-hot width; falls back to
            the module-level ``CHARS`` when empty or ``None``.

    # Returns
        A compiled ``Sequential`` model (categorical cross-entropy, Adam,
        accuracy metric).
    """
    print('Build model...')
    if not chars:
        chars = CHARS
    alphabet_size = len(chars)
    network = Sequential()

    # Encoder: stacked LSTMs over a variable-length input (time axis is None).
    # Only the last encoder layer collapses the sequence to a single vector.
    final_encoder = CONFIG.input_layers - 1
    for depth in range(CONFIG.input_layers):
        print("added input")
        encoder = recurrent.LSTM(
            CONFIG.hidden_size,
            input_shape=(None, alphabet_size),
            kernel_initializer=CONFIG.initialization,
            return_sequences=depth < final_encoder)
        network.add(encoder)
        network.add(Dropout(CONFIG.amount_of_dropout))

    # Feed the encoded vector to the decoder once per output time step.
    network.add(RepeatVector(output_len))

    # Decoder: one or more LSTMs that keep the full sequence.
    for _ in range(CONFIG.output_layers):
        print("added output")
        decoder = recurrent.LSTM(
            CONFIG.hidden_size,
            return_sequences=True,
            kernel_initializer=CONFIG.initialization)
        network.add(decoder)
        network.add(Dropout(CONFIG.amount_of_dropout))

    # Per time step, score every character and normalise with softmax.
    per_step_scores = Dense(alphabet_size, kernel_initializer=CONFIG.initialization)
    network.add(TimeDistributed(per_step_scores))
    network.add(Activation('softmax'))

    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [210]:
def slice_X(X, start=None, stop=None):
    """Slice one array-like, or every array-like in a list, the same way.

    - ``X[start:stop]`` when ``X`` is a single array-like
    - ``[x[start:stop] for x in X]`` when ``X`` is a list

    ``start`` may also be a list/array of indices (``slice_X(x, indices)``),
    in which case the elements are picked by index and ``stop`` is ignored.

    # Arguments
        start: integer start index, or a list/array of indices
        stop: integer stop index; should be None when ``start`` is a
            list/array of indices.
    """
    pick_by_index = hasattr(start, '__len__')
    if pick_by_index and hasattr(start, 'shape'):
        # hdf5 datasets only support list objects as indices
        start = start.tolist()

    if pick_by_index:
        def take(array_like):
            return array_like[start]
    else:
        def take(array_like):
            return array_like[start:stop]

    if isinstance(X, list):
        return [take(x) for x in X]
    return take(X)

In [211]:
class CharacterTable(object):
    """
    Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output

    Characters are de-duplicated and sorted, so a character's index is its
    rank in that sorted order.
    """
    def __init__(self, chars):
        self.chars = sorted(set(chars))
        self.char_indices = {c: i for i, c in enumerate(self.chars)}
        self.indices_char = dict(enumerate(self.chars))

    @property
    def size(self):
        """The number of chars"""
        return len(self.chars)

    def encode(self, C, maxlen):
        """Encode the string ``C`` as a ``(maxlen, size)`` boolean one-hot array.

        Rows past ``len(C)`` stay all-zero. Raises ``KeyError`` for a
        character outside the table and ``IndexError`` when ``C`` is longer
        than ``maxlen``.
        """
        # The builtin `bool` replaces the `np.bool` alias, which NumPy removed.
        X = np_zeros((maxlen, len(self.chars)), dtype=bool)
        for i, c in enumerate(C):
            X[i, self.char_indices[c]] = 1
        return X

    def decode(self, X, calc_argmax=True):
        """Turn one-hot rows, probability rows or class indices into a string.

        With ``calc_argmax=True`` each row of ``X`` is reduced to its arg-max;
        rows that are entirely zero (unused positions left by ``encode``) are
        skipped. With ``calc_argmax=False`` ``X`` is a sequence of class
        indices and every index is decoded, including index 0.
        """
        X = np.asarray(X)
        if calc_argmax:
            # Filter on "row has content" rather than on the arg-max value, so
            # the character stored at index 0 is not lost.
            filled = X.any(axis=-1)
            X = X.argmax(axis=-1)[filled]
        return ''.join(self.indices_char[int(x)] for x in X)

In [212]:
def _one_hot_sentences(sentences, ctable):
    """One-hot encode ``sentences`` into a boolean array, last sentence first."""
    # The builtin `bool` replaces the `np.bool` alias, which NumPy removed.
    encoded = np_zeros((len(sentences), CONFIG.max_input_len, ctable.size), dtype=bool)
    # Walking the list backwards gives the same row order the previous
    # pop()-based loop produced, without emptying the caller's list.
    for i, sentence in enumerate(reversed(sentences)):
        for j, c in enumerate(sentence):
            index = ctable.char_indices.get(c)
            if index is not None:
                encoded[i, j, index] = 1
            # else: character unknown to the table (e.g. padding); row stays zero.
    return encoded


def _vectorize(questions, answers, ctable):
    """Vectorize the data as numpy arrays.

    # Arguments
        questions: list of model-input strings.
        answers: list of target strings, aligned with ``questions``.
        ctable: table exposing ``size`` and ``char_indices``.

    # Returns
        ``(X, y)``, two boolean arrays shaped
        ``(len(questions), CONFIG.max_input_len, ctable.size)``. Row ``i``
        holds the ``i``-th sentence counted from the END of the input list.
        Characters missing from ``ctable`` are left as all-zero rows.

    # Raises
        ValueError: if the two lists differ in length.
        IndexError: if a sentence has a known character at a position at or
            beyond ``CONFIG.max_input_len``.

    The input lists are left untouched.
    """
    if len(questions) != len(answers):
        raise ValueError(
            "questions and answers must have the same length, got {} and {}".format(
                len(questions), len(answers)))
    X = _one_hot_sentences(questions, ctable)
    y = _one_hot_sentences(answers, ctable)
    return X, y

In [213]:
def vectorize(questions, answers, chars=None):
    """Encode the questions and answers and split off a validation set.

    # Arguments
        questions: model-input strings.
        answers: target strings aligned with ``questions``.
        chars: alphabet for the character table; falls back to the
            module-level ``CHARS`` when empty or ``None``.

    # Returns
        ``(X_train, X_val, y_train, y_val, CONFIG.max_input_len, ctable)``.
    """
    print('Vectorization...')
    ctable = CharacterTable(chars if chars else CHARS)
    X, y = _vectorize(questions, answers, ctable)

    # Keep the trailing ~10% of rows aside as validation data that is never
    # trained on.
    row_count = len(X)
    split_at = int(row_count - row_count / 10)
    X_train, X_val = X[0:split_at], X[split_at:]
    y_train, y_val = y[:split_at], y[split_at:]

    for training_part in (X_train, y_train):
        print(training_part.shape)

    return X_train, X_val, y_train, y_val, CONFIG.max_input_len, ctable

In [222]:
class Configuration:
    """Plain attribute bag; every setting is attached to the instance below."""

CONFIG = Configuration()
#pylint:disable=attribute-defined-outside-init

# --- Network architecture ---
CONFIG.hidden_size = 50
CONFIG.input_layers = 2
CONFIG.output_layers = 2
CONFIG.amount_of_dropout = 0.2
CONFIG.initialization = "he_normal" # Gaussian initialization scaled by fan-in

# --- Data layout ---
CONFIG.max_input_len = 28 # every question/answer is padded to this width
CONFIG.inverted = True # questions are fed to the model reversed

# --- Training schedule ---
CONFIG.number_of_iterations = 5
CONFIG.epochs = 5 # due to mini-epochs.
CONFIG.batch_size = 10 # As the model changes in size, play with the batch size to best fit the process in memory
# NOTE(review): the two step counts below are not read by any cell visible in
# this notebook - presumably left over from a generator-based training loop.
CONFIG.steps_per_epoch = 50 # This is a mini-epoch. Using News 2013 an epoch would need to be ~60K.
CONFIG.validation_steps = 50

# --- Noise and alphabet ---
AMOUNT_OF_NOISE = 1 / CONFIG.max_input_len
PADDING = "☕"
CHARS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ '")

In [223]:
def add_noise_to_string(a_string, amount_of_noise):
    """Inject up to four kinds of random typo into ``a_string``.

    In order, each of the following is applied with probability
    ``amount_of_noise * len(current string)``: replace one character, delete
    one character, insert one character (only while the string is shorter
    than ``CONFIG.max_input_len``) and swap two neighbouring characters
    (only when at least two characters remain). Random characters are drawn
    from ``CHARS`` minus its last entry.
    """
    noisy = a_string

    # 1. Substitution
    if rand() < amount_of_noise * len(noisy):
        spot = random_randint(len(noisy))
        head, tail = noisy[:spot], noisy[spot + 1:]
        noisy = head + random_choice(CHARS[:-1]) + tail

    # 2. Deletion
    if rand() < amount_of_noise * len(noisy):
        spot = random_randint(len(noisy))
        noisy = noisy[:spot] + noisy[spot + 1:]

    # 3. Insertion - the length guard comes first so no random number is
    #    consumed when the string is already at full width.
    if len(noisy) < CONFIG.max_input_len:
        if rand() < amount_of_noise * len(noisy):
            spot = random_randint(len(noisy))
            head, tail = noisy[:spot], noisy[spot:]
            noisy = head + random_choice(CHARS[:-1]) + tail

    # 4. Transposition - the random draw happens before the length check.
    if rand() < amount_of_noise * len(noisy):
        if len(noisy) > 1:
            spot = random_randint(len(noisy) - 1)
            swapped = noisy[spot + 1] + noisy[spot]
            noisy = noisy[:spot] + swapped + noisy[spot + 2:]

    return noisy

In [224]:
def generate_question(answer):
    """Derive a noisy question from ``answer`` and pad both to full width.

    Returns ``(question, answer)``, each right-padded with ``PADDING`` up to
    ``CONFIG.max_input_len`` characters; strings already that long or longer
    are returned unpadded.
    """
    width = CONFIG.max_input_len
    noisy = add_noise_to_string(answer, AMOUNT_OF_NOISE)
    padded_question = noisy + PADDING * (width - len(noisy))
    padded_answer = answer + PADDING * (width - len(answer))
    return padded_question, padded_answer

In [225]:
def generate_news_data():
    """Build shuffled (question, answer) training pairs from the word list.

    Reads the ``word`` column of ``data/word_freq.csv``, repeats the list
    three times, shuffles it and turns every word into a padded answer plus
    a noisy padded question (reversed when ``CONFIG.inverted`` is set).

    # Returns
        ``(questions, answers)``, two lists of equal length.
    """
    print ("Generating Data")
    vocabulary = pd.read_csv('data/word_freq.csv')['word'].tolist()
    # Three copies of every word, so the same word can get different noise.
    answers = vocabulary * 3
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")

    total = len(answers)
    questions = []
    for position in range(total):
        noisy, padded = generate_question(answers[position])
        answers[position] = padded
        assert len(padded) == CONFIG.max_input_len
        if CONFIG.inverted:
            noisy = noisy[::-1]
        # Roughly one pair in 100000 is echoed as a progress indicator.
        if random_randint(100000) == 8:
            print (total)
            print ("answer:   '{}'".format(padded))
            print ("question: '{}'".format(noisy))
            print ()
        questions.append(noisy)

    return questions, answers

In [252]:
# Upper-case and right-pad every child spelling / target word to the model
# width. Questions are reversed here unconditionally (CONFIG.inverted is not
# consulted).
# NOTE(review): the next cell reassigns `questions` and `answers` from
# generate_news_data(), so these lists are not what gets vectorized there.
questions = [
    (word.upper() + PADDING * (CONFIG.max_input_len - len(word)))[::-1]
    for word in ks['Spelling']
]
answers = [
    word.upper() + PADDING * (CONFIG.max_input_len - len(word))
    for word in ks['Target']
]
# Longest entry on each side; anything above CONFIG.max_input_len would not fit.
print(max(map(len, questions)))
print(max(map(len, answers)))

28
28


In [253]:
# Build the synthetic corpus; this replaces any earlier `questions`/`answers`.
questions, answers = generate_news_data()
print(questions[:10])

# The alphabet is every character seen on either side.
chars_answer = set.union(*map(set, answers))
chars_question = set.union(*map(set, questions))
chars = list(set.union(chars_answer, chars_question))

(X_train, X_val, y_train, y_val, y_maxlen, ctable) = vectorize(questions, answers, chars)

# Evaluate the previously saved model. To train from scratch instead, swap the
# load_model line for the two commented lines below it.
#print ("y_maxlen, chars", y_maxlen, "".join(chars))
model = load_model('my_model.h5')
#model = generate_model(y_maxlen, chars)
#iterate_training(model, X_train, y_train, X_val, y_val, ctable)
print_random_predictions(model, ctable, X_val, y_val)

Generating Data
shuffle Done
128889
answer:   'MONOCULAR☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕'
question: '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕RALUCONM'

128889
answer:   'CAPITALISTS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕'
question: '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕STUILATIPAC'

['☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕SELDDUP', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕THGILYALD', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕SERIQER', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕SESNAPXE', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕SLWANC', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕ECNDEREFED', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕GNILIAH', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕RERPPSEN', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕DEIFIRP', '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕SREDNAM']
Vectorization...
(116000, 28, 29)
(116000, 28, 29)

Q JNDESCRIBABLE☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A INDESCRIBABLE☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m INDESTIIIABL☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q DISINTEEGATED☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A DISINTEGRATED☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m DISIINEENTEE☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q PUMPKN☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A PUMPKIN☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m PUMPING☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q PERZVERSE☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A PERVERSE☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m PEEVEEES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕

In [254]:
# Draw a fresh random sample of validation predictions from the same model.
print_random_predictions(model, ctable, X_val, y_val)


Q AJOYS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A JOYS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m AAOYS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q OAUTS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A OATS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m OOTTS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q GUANO☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A GUANO☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[92m☑[0m GUANO☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q VOILET☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A VIOLET☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m VOLLET☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q RECATL☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A RECTAL☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m RECATL☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q OBIGING☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A OBLIGING☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m BOIIING☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q DESHOTISM☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A DESPOTISM☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m DESHOOIS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q WATCHJFUL☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A WATCHFUL☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m WATTHULL☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q MORES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A MORES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[92m☑[0m MORES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q PARKD☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A PARKED☕☕☕☕☕☕

In [230]:
# NOTE(review): `train_speller_w_all_data` is not defined in any cell visible in
# this notebook, and this cell's execution count (230) is lower than the cells
# above it. Presumably the definition lived in a since-deleted cell; on a fresh
# kernel this call would raise NameError. TODO: restore or remove.
train_speller_w_all_data()

Generating Data
shuffle Done
128889
answer:   'CLUTTER☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕'
question: '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕RETTULC'

128889
answer:   'SENTINELS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕'
question: '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕SLENITEAS'

128889
answer:   'DYSPLASIA☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕'
question: '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕AISALPYD'

128889
answer:   'PERSONIFICATION☕☕☕☕☕☕☕☕☕☕☕☕☕'
question: '☕☕☕☕☕☕☕☕☕☕☕☕☕NOITCVIFINOSREP'

Vectorization...
(116000, 28, 29)
(116000, 28, 29)

Q RESURRECT☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A RESURRECT☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m RESURRET☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q THIRVES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A THRIVES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[92m☑[0m THRIVES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q PENWNET☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A PENNED☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m PENNEET☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q SILICONSE☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A SILICONES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m SIIIINEES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q ACQUAINDANHCES☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A ACQUAINTANCES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m ACCUANNNATEES☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q PZRECONDITIONING☕☕☕☕☕☕☕☕☕☕☕

In [227]:
# NOTE(review): `train_speller_w_all_data` is not defined in any cell visible in
# this notebook, and this cell's execution count (227) is lower than the cells
# above it. The captured run below ended in KeyboardInterrupt. TODO: restore the
# definition or remove this cell.
train_speller_w_all_data()

Generating Data
shuffle Done
128889
answer:   'FACULTY☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕'
question: '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕YTLUCAF'

128889
answer:   'GRUDGINGLY☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕'
question: '☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕YLGNIGDUURG'

Vectorization...
(116000, 28, 29)
(116000, 28, 29)
y_maxlen, chars 28 GZESYJMNVPXBARKU'FOCWLH☕T QID
Build model...
added input
added input
added output
added output
Starting training

--------------------------------------------------
Iteration 1
Train on 116000 samples, validate on 12889 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
saving model

Q CONSCIENTOUSLY☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A CONSCIENTIOUSLY☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m CONSIINIIIALY☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q PLATINUM☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A PLATINUM☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m PLATIONM☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q IDEALIRTS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A IDEALISTS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m IDEAAITS☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q FOIRUTEEN☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
A FOURTEEN☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
[91m☒[0m FORREENN☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕☕
---
Q ODO

KeyboardInterrupt: 