In [1]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

import nltk
from nltk.corpus import wordnet as wn
import inflect

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
import numpy as np
from collections import Counter, defaultdict

from gensim.utils import tokenize
from itertools import groupby

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import LSTM, RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.optimizers import RMSprop

In [2]:
# Shared inflect engine; its plural() method is used below to derive the
# plural form of every noun collected from WordNet.
p = inflect.engine()

In [3]:
# Fetch the WordNet corpus (a no-op when it is already up to date).
nltk.download('wordnet')

# Map every noun lemma -- the part of the synset name before the first '.' --
# to the plural form produced by inflect. Each lemma is pluralized only once,
# however many synsets share it.
pairs = {}
for noun_synset in wn.all_synsets('n'):
    lemma = noun_synset.name().partition('.')[0]
    if lemma in pairs:
        continue
    pairs[lemma] = p.plural(lemma)
len(pairs)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mitsuhisa.ohta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


67176

In [4]:
# Persist the pairs as "singular<TAB>plural" lines, ordered by singular form.
# Entries containing '_' or '-' and entries made up only of digits are left out.
with open('data/plurals.txt', 'w') as fout:
    for singular in sorted(pairs):
        has_separator = '_' in singular or '-' in singular
        if has_separator or singular.isdigit():
            continue
        fout.write('%s\t%s\n' % (singular, pairs[singular]))

In [5]:
# Spot-check how the inflect engine pluralizes a short word.
p.plural('no')

'noes'

In [6]:
class CharacterTable(object):
    """Two-way lookup between characters and integer indices.

    Provides:
    + Encoding of a string into a one-hot matrix
    + Decoding of a one-hot (or probability) matrix back into a string
    + Decoding of a sequence of integer indices back into a string
    """
    def __init__(self, chars):
        """Build the lookup tables.
        # Arguments
            chars: Iterable of characters that can appear in the data.
                Duplicates are dropped and the rest is sorted, which fixes
                the index assigned to each character.
        """
        self.chars = sorted(set(chars))
        self.char_indices = {ch: idx for idx, ch in enumerate(self.chars)}
        self.indices_char = dict(enumerate(self.chars))

    def encode(self, C, num_rows):
        """Return the one-hot encoding of string C.
        # Arguments
            C: String to encode; every character must be in the table.
            num_rows: Number of rows in the returned matrix, so that all
                encoded strings share one shape. Rows past len(C) stay
                all-zero.
        # Returns
            Array of shape (num_rows, len(self.chars)).
        """
        one_hot = np.zeros((num_rows, len(self.chars)))
        for row, ch in enumerate(C):
            one_hot[row, self.char_indices[ch]] = 1
        return one_hot

    def decode(self, x, calc_argmax=True):
        """Turn encoded rows back into a string.
        # Arguments
            x: With calc_argmax=True, an array that is reduced with argmax
                over its last axis; otherwise an iterable of integer indices.
            calc_argmax: Whether to apply argmax before the index lookup.
        # Returns
            The characters for each index, joined into one string.
        """
        indices = x.argmax(axis=-1) if calc_argmax else x
        return ''.join(self.indices_char[idx] for idx in indices)


In [7]:
class colors:
    """ANSI escape sequences for colouring terminal output.

    NOTE(review): not referenced by any of the visible cells.
    """
    ok = '\033[92m'  # bright green foreground
    fail = '\033[91m'  # bright red foreground
    close = '\033[0m'  # reset all attributes


In [8]:
# Parameters for the model and dataset.
# INVERT: when True, every padded question string is reversed before it is
# one-hot encoded, and reversed back when sample predictions are printed.
INVERT = True

In [9]:
# NOTE(review): `seen` is not used by any visible cell; kept in case a later
# cell relies on it.
seen = set()

# Each line of the file holds one "singular<TAB>plural" pair.
with open('data/plurals.txt') as fin:
    rows = [line.strip().split('\t') for line in fin]
questions = [singular for singular, plural in rows]
expected = [plural for singular, plural in rows]

# Bring every string to a common width: questions are padded with spaces on
# the left, expected answers on the right.
max_question_len = max(map(len, questions))
max_expected_len = max(map(len, expected))
questions = [q.rjust(max_question_len) for q in questions]
expected = [e.ljust(max_expected_len) for e in expected]
if INVERT:
    # Feed the question to the model back to front.
    questions = [q[::-1] for q in questions]

print('Total addition questions:', len(questions))

Total addition questions: 39929


In [10]:
# The alphabet is every character that occurs in any padded question or
# expected answer (the padding space included).
chars = {ch for text in questions + expected for ch in text}
ctable = CharacterTable(chars)
len(chars)

40

In [11]:
print('Vectorization...')
# One-hot tensors of shape (num_samples, sequence_length, alphabet_size).
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24; `bool` yields the same dtype
# on every NumPy version.
x = np.zeros((len(questions), max_question_len, len(chars)), dtype=bool)
y = np.zeros((len(questions), max_expected_len, len(chars)), dtype=bool)
# Row i of x is the encoded question, row i of y the matching expected answer.
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, max_question_len)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, max_expected_len)
print('done')

Vectorization...
done


In [12]:
# Shuffle x and y with one shared permutation. The pairs were read in the
# sorted order of the file, so without shuffling the validation slice taken
# from the end would just be the alphabetical tail.
indices = np.arange(len(y))
np.random.shuffle(indices)
x, y = x[indices], y[indices]

# Hold out the last tenth (integer division) as validation data that is
# never trained on.
num_val = len(x) // 10
split_at = len(x) - num_val
x_train, x_val = x[:split_at], x[split_at:]
y_train, y_val = y[:split_at], y[split_at:]

print('Training Data:', x_train.shape, y_train.shape, sep='\n')
print('Validation Data:', x_val.shape, y_val.shape, sep='\n')


Training Data:
(35937, 31, 40)
(35937, 32, 40)
Validation Data:
(3992, 31, 40)
(3992, 32, 40)


In [13]:
# Adapted from: https://github.com/keras-team/keras/blob/master/examples/addition_rnn.py
RNN = layers.LSTM
HIDDEN_SIZE = 128
LAYERS = 1

print('Build model...')
num_chars = len(chars)

# Encoder: reads the whole one-hot question and emits a single vector of
# HIDDEN_SIZE values. (For variable-length inputs one would use
# input_shape=(None, num_features) instead.)
encoder = RNN(HIDDEN_SIZE, input_shape=(max_question_len, num_chars))

# Bridge: repeat that vector once per output position, i.e. max_expected_len
# times, so the decoder receives it as input at every time step.
bridge = layers.RepeatVector(max_expected_len)

# Decoder: LAYERS stacked RNNs. return_sequences=True makes each one emit an
# output per time step, (num_samples, timesteps, output_dim), which is what
# TimeDistributed below expects.
decoder_stack = [RNN(HIDDEN_SIZE, return_sequences=True) for _ in range(LAYERS)]

# Per-time-step dense layer scoring every character, followed by a softmax
# that turns the scores into a distribution over the alphabet.
char_scores = layers.TimeDistributed(layers.Dense(num_chars))
to_probabilities = layers.Activation('softmax')

model = Sequential([encoder, bridge] + decoder_stack + [char_scores, to_probabilities])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Build model...


W0303 23:05:55.412947 139963923275904 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f4b0d41c4e0>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0303 23:05:57.093016 139963923275904 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f4b214df7f0>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
unified_lstm (UnifiedLSTM)   (None, 128)               86528     
_________________________________________________________________
repeat_vector (RepeatVector) (None, 32, 128)           0         
_________________________________________________________________
unified_lstm_1 (UnifiedLSTM) (None, 32, 128)           131584    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 40)            5160      
_________________________________________________________________
activation (Activation)      (None, 32, 40)            0         
Total params: 223,272
Trainable params: 223,272
Non-trainable params: 0
_________________________________________________________________


In [14]:
def create_seq2seq(num_nodes, num_layers):
    """Build and compile a stacked-LSTM character model.

    Reads the notebook-level `chars` and `max_question_len` to size the input.

    # Arguments
        num_nodes: Number of units in each LSTM layer.
        num_layers: Number of LSTM layers stacked on top of the input.
    # Returns
        A compiled Keras `Model` mapping a one-hot question of shape
        (max_question_len, len(chars)) to per-time-step softmax
        distributions over the characters.
    """
    num_chars = len(chars)
    question = Input(shape=(max_question_len, num_chars), name='question')
    # NOTE(review): every LSTM returns sequences, so the output has
    # max_question_len time steps, while the targets built earlier in the
    # notebook have max_expected_len -- confirm the shapes before fitting.
    prev = question
    # The layer index comes from the loop itself. Previously the name was
    # formatted from a stray notebook global `i`, which raised NameError on a
    # fresh kernel and gave every layer the same name when num_layers > 1.
    for layer_index in range(num_layers):
        prev = LSTM(num_nodes, return_sequences=True,
                    name='lstm_layer_%d' % (layer_index + 1))(prev)
    dense = TimeDistributed(Dense(num_chars, name='dense', activation='softmax'))(prev)
    model = Model(inputs=[question], outputs=[dense])
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by newer Keras releases.
    optimizer = RMSprop(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Instantiate the functional-API model with a single LSTM layer of 128 units.
# NOTE(review): the visible training loop fits `model`, not `seq2seq`.
seq2seq = create_seq2seq(128, 1)

W0303 23:05:57.256747 139963923275904 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f4b0ccccb38>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


In [15]:

BATCH_SIZE = 2048

# Train in rounds of 10 epochs (199 rounds in total) and, after every round,
# print predictions for a few validation samples to eyeball progress.
for iteration in range(1, 200):
    model.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=10,
              validation_data=(x_val, y_val))
    print()
    print('-' * 50)
    print('Iteration', iteration)
    # Draw 10 validation samples at random (with replacement) so errors can
    # be inspected by eye.
    for _ in range(10):
        ind = np.random.randint(0, len(x_val))
        # Index with a one-element array to keep the leading batch axis.
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        # `Sequential.predict_classes` was removed in TensorFlow 2.6. Take
        # the per-time-step probabilities instead and let decode() apply the
        # argmax over the character axis, which yields the same indices.
        preds = model.predict(rowx, verbose=0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0])
        # Questions were reversed before encoding when INVERT is set, so flip
        # them back for display.
        print(q[::-1] if INVERT else q, '(%s)' % correct, '-', guess)


Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 1
                          dphil (dphils                          ) - aaaess                          
                          pedal (pedals                          ) - aaaess                          
                        zoopsia (zoopsias                        ) - aaaaeess                        
                      acquittal (acquittals                      ) - aaaaiiiess                      
                       diffuser (diffusers                       ) - aaaaieess                       
                   breechloader (breechloaders                   ) - aaaaiiiiiess                    
                       barbados (barbado                         ) - aaaaieess                       
                    heliosphere (heliospheres                 

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 4
                         giotto (giottoes                        ) - aaaiies                         
                        chunnel (chunnels                        ) - aaaiiees                        
                        olearia (olearias                        ) - aaaiiees                        
                         ipecac (ipecacs                         ) - aaaiies                         
                        enamine (enamines                        ) - aaaiiess                        
                         peeing (peeings                         ) - aaaiess                         
                       conodont (conodonts                       ) - aarriiies                       
                            wac (wacs                            ) - sass                            
                      sinusitis

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 7
                          smear (smears                          ) - saaies                          
                         pokomo (pokomoes                        ) - sariies                         
                        prowler (prowlers                        ) - sariiies                        
                chamaeleontidae (chamaeleontidaes                ) - cerooooiiiiiiies                
                        murdoch (murdoches                       ) - sarriiees                       
                    breechcloth (breechcloths                    ) - ceroooiiiiess                   
                      scrambler (scramblers                      ) - sarriiiies                      
                     cantaloupe (cantaloupes                     ) - carroiiiies                     
                          boxer (boxers                         

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 10
                        hadrian (hadrians                        ) - sariiies                        
                         spread (spreads                         ) - saaiies                         
                      cranberry (cranberries                     ) - carooiiiies                     
                         maniac (maniacs                         ) - saaiies                         
                         ningal (ningals                         ) - saaiies                         
                         levi's (levis'                          ) - saaiies                         
                         litmus (litmuses                        ) - sariiees                        
                     trichroism (trichroisms                     ) - carooiiiies                     
                       jeroboam (jeroboams                       ) - sareiiies       

Epoch 10/10

--------------------------------------------------
Iteration 13
                      commissar (commissars                      ) - careeiiies                      
                      faceplate (faceplates                      ) - careiiiees                      
                      factorial (factorials                      ) - careeiiies                      
                           boar (boars                           ) - saaes                           
                         loofah (loofahs                         ) - sariies                         
                        rubicon (rubicons                        ) - saraiies                        
                     promptness (promptnesses                    ) - cereoiiiiees                    
                            gar (gars                            ) - sass                            
                     engagement (engagements                     ) - careeaiiies                     
     

                            eft (efts                            ) - sass                            
                         xylene (xylenes                         ) - saraess                         
                       wayfarer (wayfarers                       ) - cariiaess                       
                       allspice (allspices                       ) - cariiaess                       
                     desmanthus (desmanthuses                    ) - cartiiiiess                     
                         likuta (likutas                         ) - saraees                         
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 17
                            say (says                            ) - saes                            
                           onyx (onyxes                      

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 20
                     euphrosyne (euphrosynes                     ) - careaaiins                      
                           kali (kalis                           ) - saies                           
                       ischemia (ischemias                       ) - saraaiees                       
                       centesis (centeses                        ) - saraaiins                       
                  polyarteritis (polyarteritises                 ) - ceroooiiiiiss                   
                        country (countries                       ) - saraaiees                       
                 phenylbutazone (phenylbutazones                 ) - cereooiiiiises                  
                   osteomalacia (osteomalacias                   ) - cerooaaiiiss                    
                   

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 23
                         veneer (veneers                         ) - sariees                         
                         jitter (jitters                         ) - sariees                         
                     cimicifuga (cimicifugas                     ) - cartiiiiees                     
                         gravel (gravels                         ) - sariees                         
                       limonene (limonenes                       ) - sariiiess                       
                        mencken (menckens                        ) - sariiees                        
                           mend (mends                           ) - sares                           
                       mccauley (mccauleys                       ) - cariiiiess                      
                       caranday (caranday

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 26
                    beguilement (beguilements                    ) - cerooaiiiies                    
                       ethylene (ethylenes                       ) - sareiiees                       
                      nummulite (nummulites                      ) - careeiiies                      
                       demeanor (demeanors                       ) - sareeiies                       
                      animatism (animatisms                      ) - careeiiies                      
                       congreve (congreves                       ) - sareiiies                       
                      carnelian (carnelians                      ) - careeiiies                      
                     cimicifuga (cimicifugas                     ) - caroeaiiies                     
                       humanity (humanities                    

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 29
                         malabo (malaboes                        ) - caaaoaes                        
                  struthiomimus (struthiomimuses                 ) - cereoooiiiisses                 
                    crinkleroot (crinkleroots                    ) - ceroooaaions                    
                         brunet (brunets                         ) - sareers                         
                   electrolytic (electrolytics                   ) - cereoooatiines                  
                         rappel (rappels                         ) - sareirs                         
                        cydonia (cydonias                        ) - coloooas                        
                    maniraptora (maniraptoras                    ) - cereooaaions                    
                    paresthesia (paresthesias                    ) - sereeettiins    

Epoch 10/10

--------------------------------------------------
Iteration 32
                          matai (matais                          ) - sarers                          
                   illogicality (illogicalities                  ) - colooooolliies                  
                            gar (gars                            ) - aaas                            
                     sou'wester (sou'westers                     ) - sereeeteers                     
                         folium (foliums                         ) - bollums                         
                         center (centers                         ) - seseers                         
                      midazolam (midazolams                      ) - colooliirs                      
                        cautery (cauteries                       ) - sareteies                       
                     mandragora (mandragoras                     ) - carooaaiias                     
     

                   opisthocomus (opisthocomuses                  ) - poooooooiiuses                  
                        account (accounts                        ) - soroioes                        
                  sulfacetamide (sulfacetamides                  ) - cereeeaaaaiaes                  
                       defender (defenders                       ) - aeeeeeers                       
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 36
                        dresser (dressers                        ) - sesetsts                        
                         sender (senders                         ) - teneers                         
                       pieridae (pieridaes                       ) - aareaiaes                       
                            yes (yeses                       

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 39
                       beetroot (beetroots                       ) - boroooers                       
                       dicranum (dicranums                       ) - aanaiiums                       
                      gallamine (gallamines                      ) - aaliaiines                      
                        songhai (songhais                        ) - sarelels                        
                      emmenthal (emmenthals                      ) - sereesteas                      
                    coextension (coextensions                    ) - seseesstions                    
                        widower (widowers                        ) - borggers                        
                      verdigris (verdigri                        ) - biiiiiuis                       
                   

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 42
                  ethchlorvynol (ethchlorvynols                  ) - phpphooooodeas                  
                   keynesianism (keynesianisms                   ) - eeeeeeeenisms                   
                        crosier (crosiers                        ) - cooshers                        
                     abstractor (abstractors                     ) - conttttters                     
                        saratov (saratovs                        ) - parttias                        
                    stickleback (sticklebacks                    ) - phpoolliiaes                    
                        masters (master                          ) - messers                         
                       fontanne (fontannes                       ) - banttines                       
                      backwoods (backwood

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 45
                      hilliness (hillinesses                     ) - siliinesses                     
                        jujutsu (jujutsus                        ) - fuussems                        
                         dating (datings                         ) - daaings                         
                         humate (humates                         ) - pontees                         
                            eft (efts                            ) - gets                            
                       dairying (dairyings                       ) - aaraaings                       
                          vigna (vignas                          ) - vinias                          
                  laboriousness (laboriousnesses                 ) - boloooossnesses                 
                        laundry (laundries                     

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 48
                 libertarianism (libertarianisms                 ) - bineeteeeeiisms                 
                          hacek (haceks                          ) - hhaces                          
                  stegocephalia (stegocephalias                  ) - stetttooooiias                  
                        poverty (poverties                       ) - poneities                       
                         lydian (lydians                         ) - ciliins                         
                     perfective (perfectives                     ) - seeeeetnces                     
                      agamemnon (agamemnons                      ) - aevemmmons                      
                          tenon (tenons                          ) - temons                          
                         ripple (ripples                         ) - rrpples         

Epoch 10/10

--------------------------------------------------
Iteration 51
                     ascaphidae (ascaphidaes                     ) - apchhcidaes                     
                     gorgonacea (gorgonaceas                     ) - guiieaaneas                     
                          hello (helloes                         ) - halgoes                         
                        enamine (enamines                        ) - eeemenes                        
                           kali (kalis                           ) - kalis                           
                     phalaropus (phalaropuses                    ) - pharoooiuses                    
                         virgil (virgils                         ) - diiiils                         
                  reinforcement (reinforcements                  ) - eenenenneeents                  
                     prosperity (prosperities                    ) - prrssstities                    
     

                   autoantibody (autoantibodies                  ) - gntioooollgies                  
                      anguillan (anguillans                      ) - anuullians                      
                          sport (sports                          ) - spopts                          
                    tentaculata (tentaculatas                    ) - tenacccaceas                    
                      anguillan (anguillans                      ) - anuullians                      
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 55
                        whooper (whoopers                        ) - whowwers                        
                        waxycap (waxycaps                        ) - tachoaps                        
                  perfectionist (perfectionists              

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 58
                          cramp (cramps                          ) - cramps                          
                          sport (sports                          ) - sports                          
                     perversion (perversions                     ) - pergersions                     
                        ardisia (ardisias                        ) - ariisias                        
                      honorific (honorifics                      ) - hottnfiids                      
                     gasherbrum (gasherbrums                     ) - grrrrrliums                     
                      misoneism (misoneisms                      ) - minonnisms                      
                          kiddy (kiddies                         ) - kiddies                         
                   

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 61
                     submission (submissions                     ) - subgissions                     
                competitiveness (competitivenesses               ) - corpertiiienesses               
                       napoleon (napoleons                       ) - naoollons                       
                       shepherd (shepherds                       ) - shepphrds                       
                      hexestrol (hexestrols                      ) - hettossols                      
                   cryptanalyst (cryptanalysts                   ) - crrthhoolists                   
                      lotusland (lotuslands                      ) - lostoiinds                      
                    dardanelles (dardanelle                      ) - daneeeeale                      
                      pholidota (pholidot

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 64
                     polychaeta (polychaetas                     ) - poccccaceas                     
                      retardant (retardants                      ) - reteedints                      
                          creak (creaks                          ) - crarks                          
                          porto (portoes                         ) - portoes                         
                        gherkin (gherkins                        ) - grakkins                        
                   thoroughfare (thoroughfares                   ) - thoneeoolales                   
                        kolkata (kolkatas                        ) - konkeeas                        
                        nostril (nostrils                        ) - nostiils                        
                        iberian (iberians                      

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 67
                 counterexample (counterexamples                 ) - counteeecchores                 
                     anemonella (anemonellas                     ) - anermllalas                     
                    lobachevsky (lobachevskies                   ) - londleeeeries                   
                     anastatica (anastaticas                     ) - anisttatcas                     
                         henson (hensons                         ) - hensons                         
                            ted (teds                            ) - teds                            
                    mostaccioli (mostacciolis                    ) - mottoooooris                    
                          tapir (tapirs                          ) - tapirs                          
                         gavial (gavials                         ) - gaviils         

Epoch 10/10

--------------------------------------------------
Iteration 70
                    balaeniceps (balaeniceps                     ) - ballleeters                     
                          nanak (nanaks                          ) - nannks                          
                scrophulariales (scrophulariale                  ) - schoooooolllaes                 
                   lansoprazole (lansoprazoles                   ) - larropppolles                   
                    strangeness (strangenesses                   ) - strardenesses                   
                        tapioca (tapiocas                        ) - taptiias                        
                         ovibos (ovibo                           ) - ovboos                          
                         busboy (busboys                         ) - burboies                        
                      barcelona (barcelonas                      ) - barkllords                      
     

                      briefness (briefnesses                     ) - briednesses                     
                        rubicon (rubicons                        ) - robicons                        
                          doris (dori                            ) - dori                            
                      eisegesis (eisegeses                       ) - eiseruses                       
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 74
                         huston (hustons                         ) - hussons                         
                           dean (deans                           ) - deans                           
                      dalliance (dalliances                      ) - daliaances                      
                   officeholder (officeholders               

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 77
                        decagon (decagons                        ) - decadons                        
                        oddness (oddnesses                       ) - oddnnsse                        
                    inscription (inscriptions                    ) - intortations                    
                  candlesnuffer (candlesnuffers                  ) - canniniifffer                   
                         bedlam (bedlams                         ) - bellams                         
                        buchner (buchners                        ) - buccners                        
                        bitumen (bitumens                        ) - bishmens                        
                       beetroot (beetroots                       ) - bertrooes                       
                   

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 93
                     derivation (derivations                     ) - dervvitions                     
                       valletta (vallettas                       ) - valleters                       
                      telepathy (telepathies                     ) - telapatcits                     
                   hydrobatidae (hydrobatidaes                   ) - hydbagoaiders                   
                       pectoral (pectorals                       ) - pecisials                       
                       volution (volutions                       ) - folitions                       
                       ardennes (ardenne                         ) - ardenes                         
                         tenant (tenants                         ) - tennnes                         
                       injector (injectors                       ) - indec

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 96
                            ebb (ebbs                            ) - ebbs                            
                          vulva (vulvas                          ) - vulfas                          
                          grove (groves                          ) - groves                          
                     anastigmat (anastigmats                     ) - anastnroets                     
                       paycheck (paychecks                       ) - pacchecks                       
                        modicon (modicons                        ) - modicons                        
                            wig (wigs                            ) - wigs                            
                           chop (chops                           ) - chops                           
                     conspectus (conspectuses                    ) - corscettuses               


--------------------------------------------------
Iteration 99
                      sinhalese (sinhalese                       ) - sinhalase                       
                        paradox (paradoxes                       ) - paradoxes                       
                     overburden (overburdens                     ) - overbugdens                     
                          argil (argils                          ) - argils                          
                      beanfeast (beanfeasts                      ) - bennfeests                      
                          noyes (noye                            ) - noy                             
                    hydrocarbon (hydrocarbons                    ) - hydrichloons                    
                     gorgonacea (gorgonaceas                     ) - gorgonaceas                     
                  ginglymostoma (ginglymostomas                  ) - gindoooossomas                  
                 

                     inhibition (inhibitions                     ) - inhobitions                     
                    hemigrammus (hemigrammuses                   ) - heprgrammuses                   
                      zinnemann (zinnemanns                      ) - hinnemeans                      
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 103
                     stationery (stationeries                    ) - stattiueries                    
                     abnegation (abnegations                     ) - abgegations                     
                  hepaticopsida (hepaticopsidas                  ) - heperoccttiras                  
                      mentioner (mentioners                      ) - mentonners                      
                          title (titles                     

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 106
                          swath (swaths                          ) - swaths                          
                      mentioner (mentioners                      ) - mentonners                      
                       rotifera (rotiferas                       ) - rotiferas                       
                       tomahawk (tomahawks                       ) - toaahoaks                       
                     housecraft (housecraft                      ) - houseceufes                     
                   legitimation (legitimations                   ) - legistiations                   
                       geastrum (geastrums                       ) - gerstrums                       
                      conepatus (conepatuses                     ) - conetttuses                     
                  

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 109
                       audition (auditions                       ) - auditions                       
                          haunt (haunts                          ) - haunds                          
                         health (healths                         ) - heaaths                         
                       stockton (stocktons                       ) - stocktons                       
                         office (offices                         ) - offices                         
                 responsibility (responsibilities                ) - ressosuibilities                
                  pterodactylus (pterodactyluses                 ) - pterodachyluses                 
                         ripple (ripples                         ) - ripples                         
                       liberian (liberia

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 112
                        psophia (psophias                        ) - psophias                        
                           move (moves                           ) - moves                           
                      pessimism (pessimisms                      ) - pessimisms                      
                          ypres (ypre                            ) - ypr                             
                           mary (maries                          ) - maries                          
                         linden (lindens                         ) - lindens                         
                misconstruction (misconstructions                ) - miscontustations                
                            bud (buds                            ) - buds                            
                     surrealism (surrealisms                  

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 115
                     supplejack (supplejacks                     ) - supplllacks                     
                       stallion (stallions                       ) - stallions                       
                      synizesis (synizeses                       ) - syikieses                       
                      contagion (contagions                      ) - contagions                      
                      nonworker (nonworkers                      ) - nonwwrkers                      
                 stenopterygius (stenopterygiuses                ) - stenottaroituses                
                     anthropoid (anthropoids                     ) - anthropoids                     
                          jonah (jonahs                          ) - jonahs                          
                 erythropoietin (erythropoietins                 ) - eryhhroooritins

Epoch 10/10

--------------------------------------------------
Iteration 118
                        antique (antiques                        ) - antiques                        
                           ouse (ouses                           ) - ouses                           
                   hyperacidity (hyperacidities                  ) - hyperacilities                  
                        maracay (maracays                        ) - maracays                        
                 inexplicitness (inexplicitnesses                ) - inempicittnesses                
                     mustelidae (mustelidaes                     ) - mustelidaes                     
                      truckling (trucklings                      ) - trucklings                      
                         winter (winters                         ) - winters                         
                    chronograph (chronographs                    ) - chronographe                    
    

                     cimicifuga (cimicifugas                     ) - cimitifuras                     
                          prawn (prawns                          ) - prawns                          
                        mailbag (mailbags                        ) - mailbans                        
                         feijoa (feijoas                         ) - ferjocs                         
                           kuvi (kuvis                           ) - kovis                           
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 122
                  proconsulship (proconsulships                  ) - phoccuuurapips                  
                      soliloquy (soliloquies                     ) - sollloouies                     
                     nicaraguan (nicaraguans                

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 125
                           mend (mends                           ) - mends                           
                         hominy (hominies                        ) - hominies                        
                      labetalol (labetalols                      ) - labetalils                      
                          arcus (arcuses                         ) - arcuses                         
                      marketing (marketings                      ) - marketings                      
                        cleaner (cleaners                        ) - cleaners                        
                        terrier (terriers                        ) - terriers                        
                       coreidae (coreidaes                       ) - coreidaes                       
                  

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 128
                         putoff (putoffs                         ) - putoffs                         
                         shield (shields                         ) - shields                         
                         kitten (kittens                         ) - kittens                         
                      hydrazine (hydrazines                      ) - hydrazines                      
                     smattering (smatterings                     ) - shatterings                     
                      chamomile (chamomiles                      ) - chamoriles                      
                     profligacy (profligacies                    ) - profllgacies                    
                           bosc (boscs                           ) - boscs                           
                         frying (fryings

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 131
                       basophil (basophils                       ) - basophils                       
                         finger (fingers                         ) - fingers                         
                       dampener (dampeners                       ) - dampeners                       
                         invite (invites                         ) - invites                         
                 hystricomorpha (hystricomorphas                 ) - hystttooolaphas                 
                     hypocapnia (hypocapnias                     ) - hypocattias                     
                    subdominant (subdominants                    ) - sublominants                    
                         bigram (bigrams                         ) - bigrams                         
                     wellington (wellingtons                  

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 134
                       otorrhea (otorrheas                       ) - otorpaeas                       
                      dryadella (dryadellas                      ) - dryadellas                      
                         louver (louvers                         ) - louvers                         
                    blessedness (blessednesses                   ) - blessednesses                   
                        windaus (windauses                       ) - windauses                       
                          faint (faints                          ) - faints                          
                        moocher (moochers                        ) - moochers                        
                    paperhanger (paperhangers                    ) - papertangers                    
                          alnus (alnuses                         ) - alnuses        

Epoch 10/10

--------------------------------------------------
Iteration 137
                 plumbaginaceae (plumbaginaceaes                 ) - pllubagniaccaes                 
                    manumission (manumissions                    ) - manumissions                    
                      genitalia (genitalias                      ) - genitalias                      
                    boatbuilder (boatbuilders                    ) - byttiuoleers                    
                     retraction (retractions                     ) - retractions                     
                        linseed (linseeds                        ) - linseeds                        
                        bedouin (bedouins                        ) - beduuins                        
                       nephthys (nephthy                         ) - nephthys                        
                      zephaniah (zephaniahs                      ) - zephandiis                      
    

                     mustelidae (mustelidaes                     ) - mustelidaes                     
                          galen (galens                          ) - galens                          
                  accompaniment (accompaniments                  ) - accompeinments                  
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 141
                        dresser (dressers                        ) - dressers                        
                treponemataceae (treponemataceaes                ) - tterolanetaceaes                
                       dihybrid (dihybrids                       ) - dihmbrids                       
                         enamel (enamels                         ) - enemels                         
                      tarantism (tarantisms                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 144
                 combustibility (combustibilities                ) - combustibilities                
                      postulant (postulants                      ) - postulants                      
                         sangay (sangays                         ) - sangays                         
                         soweto (sowetoes                        ) - sowetoe                         
                       footwork (footworks                       ) - footworks                       
                           dace (daces                           ) - daces                           
                    latrodectus (latrodectuses                   ) - lattodeccuses                   
                         linden (lindens                         ) - lindens                         
                  

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 147
                       blackout (blackouts                       ) - blackuuts                       
                         ononis (ononi                           ) - onooi                           
                          grief (griefs                          ) - griefs                          
                         simeon (simeons                         ) - simeons                         
                        seating (seatings                        ) - seatings                        
                        product (products                        ) - produkts                        
                        thinker (thinkers                        ) - thinkers                        
                     louisville (louisvilles                     ) - loossbilles                     
                     gastrocybe (gastroc

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 150
                         khanty (khanties                        ) - khanties                        
                       polymath (polymaths                       ) - polymaths                       
                procrastination (procrastinations                ) - prochastinations                
                     fiberscope (fiberscopes                     ) - fiberscopes                     
                         anthem (anthems                         ) - anthems                         
                        walleye (walleyes                        ) - walleyes                        
                   lepidocybium (lepidocybiums                   ) - lepidocybiums                   
                    sericulture (sericultures                    ) - sericultures                    
                       dealfish (dealfish                     

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 153
                        collect (collects                        ) - collects                        
                   rejuvenation (rejuvenations                   ) - rejuvenations                   
                         cereus (cereuses                        ) - cereuses                        
                         brunei (bruneis                         ) - bruneis                         
                         threat (threats                         ) - threats                         
                      adulthood (adulthoods                      ) - adllthoods                      
                    teredinidae (teredinidaes                    ) - teredinidaes                    
                      parentage (parentages                      ) - parentages                      
                    legislature (legislatures                    ) - legisaatures   

Epoch 10/10

--------------------------------------------------
Iteration 156
                      deformity (deformities                     ) - deformities                     
                        frigate (frigates                        ) - frigates                        
                         baking (bakings                         ) - bakings                         
                       nicandra (nicandras                       ) - nicandras                       
                           dope (dopes                           ) - dopes                           
                angiotelectasia (angiotelectasias                ) - angisceleripiias                
                       akinesis (akineses                        ) - aiineses                        
                         fender (fenders                         ) - fenders                         
                        forging (forgings                        ) - forgings                        
    

                        testing (testings                        ) - testings                        
                  investigation (investigations                  ) - investigations                  
                           slot (slots                           ) - slots                           
                     copernicus (copernicuses                    ) - copernncuses                    
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 160
                         regard (regards                         ) - regards                         
                  pterodactylus (pterodactyluses                 ) - pterodactyluses                 
                        periwig (periwigs                        ) - perroigs                        
                      vetchling (vetchlings                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 163
                         lancer (lancers                         ) - lancers                         
                    reappraisal (reappraisals                    ) - reappaansals                    
                   chrestomathy (chrestomathies                  ) - chrestomahhies                  
                           lear (lears                           ) - lears                           
                      germanism (germanisms                      ) - germanisms                      
                         yukawa (yukawas                         ) - yukawas                         
                          hooks (hook                            ) - hook                            
                        sequoia (sequoias                        ) - sequoias                        
                  

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 166
                         cleats (cleat                           ) - cleat                           
                        burying (buryings                        ) - buryings                        
                      ailanthus (ailanthuses                     ) - aylanthuses                     
                         putoff (putoffs                         ) - putoffs                         
                  counterperson (counterpeople                   ) - counterpersons                  
                          knoll (knolls                          ) - knolls                          
                       amaretto (amarettoes                      ) - amarettoss                      
                   rejuvenation (rejuvenations                   ) - rejuvenations                   
                    penetration (penetra

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 169
                      obviation (obviations                      ) - obviations                      
                      tineoidea (tineoideas                      ) - tinerideas                      
                      pennoncel (pennoncels                      ) - pennoncels                      
                        omentum (omentums                        ) - omentums                        
                        laxness (laxnesses                       ) - laynesses                       
                  prematureness (prematurenesses                 ) - prematurenesses                 
                 echinococcosis (echinococcoses                  ) - echinococcoses                  
                     absolutist (absolutists                     ) - absolusists                     
                   microtaggant (microtaggants                

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 172
                      parcheesi (parcheesis                      ) - parchessis                      
                          midst (midsts                          ) - midsts                          
                         orphan (orphans                         ) - orphans                         
                    trichoceros (trichocero                      ) - trichocero                      
                         satrap (satraps                         ) - satraps                         
                       outskirt (outskirts                       ) - oustkirts                       
                         voider (voiders                         ) - voiders                         
                balaenopteridae (balaenopteridaes                ) - balaedopteridaes                
                          boxer (boxers                          ) - boxers         

Epoch 10/10

--------------------------------------------------
Iteration 175
                        caretta (carettas                        ) - carettas                        
                        gruyere (gruyeres                        ) - grureres                        
                           pump (pumps                           ) - pumps                           
                  leptomeninges (leptomeninge                    ) - leptomeniaae                    
                        skinful (skinfuls                        ) - skinfuls                        
                    phonophobia (phonophobias                    ) - phonophobias                    
                      garnishee (garnishees                      ) - garnishees                      
                    appointment (appointments                    ) - applintments                    
                         hominy (hominies                        ) - hominies                        
    

                        saltpan (saltpans                        ) - salppans                        
                      athetosis (athetoses                       ) - athetoses                       
                      forficula (forficulas                      ) - forficulas                      
                  undergraduate (undergraduates                  ) - undergangretes                  
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 179
                  expressionist (expressionists                  ) - expeessionists                  
                           arse (arses                           ) - arses                           
                   plausibility (plausibilities                  ) - plausibilities                  
                   helianthemum (helianthemums              

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 182
                          sprag (sprags                          ) - sprags                          
                           dose (doses                           ) - doses                           
                    whoremaster (whoremasters                    ) - wroaemasters                    
                         maundy (maundies                        ) - maundies                        
                        celioma (celiomas                        ) - celiomas                        
                      dizziness (dizzinesses                     ) - dizzinesses                     
                        outcrop (outcrops                        ) - outcrops                        
                        frigate (frigates                        ) - frigates                        
                  

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 185
                      dalliance (dalliances                      ) - dalliances                      
                         aeneid (aeneids                         ) - aeeeids                         
                   protohistory (protohistories                  ) - protoristories                  
                       footbath (footbaths                       ) - footbaths                       
                      microglia (microglias                      ) - microglias                      
                          irish (irishes                         ) - irishes                         
                     compromise (compromises                     ) - comprommses                     
                         tinker (tinkers                         ) - tinkers                         
                           volt (volts  

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 188
                         hubbub (hubbubs                         ) - hubbubs                         
                         sauria (saurias                         ) - saurias                         
                         jitter (jitters                         ) - jitters                         
                          emery (emeries                         ) - emeries                         
                         isogon (isogons                         ) - isogons                         
                     pogostemon (pogostemons                     ) - pogostemons                     
                       moneybag (moneybags                       ) - moneybags                       
                         brunet (brunets                         ) - brunets                         
                       hepatoma (hepatomas                    

Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 191
                prechlorination (prechlorinations                ) - prechmorniations                
                 overindulgence (overindulgences                 ) - overdidulrences                 
                      pentoxide (pentoxides                      ) - pentoxides                      
                         parvis (parvi                           ) - parvi                           
                       leonberg (leonbergs                       ) - leonbergs                       
                  bumptiousness (bumptiousnesses                 ) - bumptiousnesses                 
                       override (overrides                       ) - overrides                       
                  disinvestment (disinvestments                  ) - disinvestments                  
                       estoppel (estoppels                       ) - estoppels      

Epoch 10/10

--------------------------------------------------
Iteration 194
                      adulthood (adulthoods                      ) - adulthoods                      
                    verticality (verticalities                   ) - verticalities                   
                    tentaculata (tentaculatas                    ) - tentaculatas                    
                     christella (christellas                     ) - christellas                     
                      arabesque (arabesques                      ) - arabeqques                      
                     cannonball (cannonballs                     ) - cannnnballs                     
                          assam (assams                          ) - assams                          
                          chios (chio                            ) - chio                            
                      peacetime (peacetimes                      ) - peacetimes                      
    

                          butut (bututs                          ) - bututs                          
                 chrysosplenium (chrysospleniums                 ) - chrysospaediums                 
                    synthesizer (synthesizers                    ) - synthesizers                    
                  assassination (assassinations                  ) - aspassinations                  
                    multiplexer (multiplexers                    ) - multipleeers                    
                      pentoxide (pentoxides                      ) - pentoxides                      
Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 198
                         herpes (herpes                          ) - herpes                          
                       bluefish (bluefish                   

In [16]:
# Download Project Gutenberg etext #100 and strip the Gutenberg boilerplate
# that surrounds the actual text.
shakespeare = strip_headers(load_etext(100))

# Turn every lower-cased word into a tuple of its characters. Tuples are
# hashable, so the tokens can be counted directly below.
tokens = list(map(tuple, tokenize(shakespeare, to_lower=True)))

# Frequency of each distinct character-tuple token.
token_counts = Counter(tokens)

In [17]:
# Character bigrams: within each token, pair every character with the one that
# follows it, and tag the pair with the token's position in `tokens`.
# NOTE: this rebinds `pairs`, which the pluralisation cells near the top of the
# notebook used for a singular -> plural dict.
pairs = [
    (current_char, next_char, token_id)
    for token_id, token in enumerate(tokens)
    for current_char, next_char in zip(token, token[1:])
]

In [18]:
# Spot-check one record; each entry is (character, following character, token index).
pairs[10]

('o', 'r', 2)

In [19]:
# Print the docstring of gensim's tokenize to check its lowercasing keywords
# (`to_lower` is the one passed when building `tokens`).
help(tokenize)

Help on function tokenize in module gensim.utils:

tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors='strict', to_lower=False, lower=False)
    Iteratively yield tokens as unicode strings, removing accent marks
    and optionally lowercasing the unidoce string by assigning True
    to one of the parameters, lowercase, to_lower, or lower.
    
    Input text may be either unicode or utf8-encoded byte string.
    
    The tokens on output are maximal contiguous sequences of alphabetic
    characters (no digits!).
    
    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']



In [20]:
# Spot-check one token: each is a tuple of a lower-cased word's characters.
tokens[5]

('s', 'h', 'a', 'k', 'e', 's', 'p', 'e', 'a', 'r', 'e')