In [121]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

import nltk
from nltk.corpus import wordnet as wn
import inflect

from keras.models import Sequential
from keras import layers
import numpy as np
from collections import Counter, defaultdict

from gensim.utils import tokenize
from itertools import groupby

from keras.models import Input, Model
from keras.layers import Dense, Dropout
from keras.layers import LSTM, RepeatVector
from keras.layers.wrappers import TimeDistributed

In [2]:
p = inflect.engine()

In [3]:
pairs = {}
for synset in wn.all_synsets('n'):
    word = synset.name().split('.', 1)[0]
    if not word in pairs:
        pairs[word] = p.plural(word)
len(pairs)

67176

In [4]:
with open('data/plurals.txt', 'w') as fout:
    for k in sorted(pairs):
        if '_' in k or '-' in k:
            continue
        if k.isdigit():
            continue
        fout.write('%s\t%s\n' % (k, pairs[k]))

In [5]:
p.plural('no')

'noes'

In [6]:
class CharacterTable(object):
    """Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        """Initialize character table.
        # Arguments
            chars: Characters that can appear in the input.
        """
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

    def encode(self, C, num_rows):
        """One hot encode given string C.
        # Arguments
            num_rows: Number of rows in the returned one hot encoding. This is
                used to keep the # of rows for each data the same.
        """
        x = np.zeros((num_rows, len(self.chars)))
        for i, c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x

    def decode(self, x, calc_argmax=True):
        if calc_argmax:
            x = x.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in x)


In [7]:
class colors:
    ok = '\033[92m'
    fail = '\033[91m'
    close = '\033[0m'


In [8]:
# Parameters for the model and dataset.
INVERT = True

In [108]:
questions = []
expected = []
seen = set()
#with open('data/en_de.txt') as fin:
with open('data/plurals.txt') as fin:
    for line in fin:
        en, de = line.strip().split('\t')
        questions.append(en)
        expected.append(de)

max_question_len = max(len(q) for q in questions)
max_expected_len = max(len(e) for e in expected)
questions = [' ' * (max_question_len - len(q)) + q for q in questions]
expected = [e + ' ' * (max_expected_len - len(e)) for e in expected]
if INVERT:
    questions = [q[::-1] for q in questions]

print('Total addition questions:', len(questions))

Total addition questions: 39929


In [109]:
chars = set(ch for k, v in zip(questions, expected) for ch in k + v)
ctable = CharacterTable(chars)
len(chars)

40

In [110]:
print('Vectorization...')
x = np.zeros((len(questions), max_question_len, len(chars)), dtype=np.bool)
y = np.zeros((len(questions), max_expected_len, len(chars)), dtype=np.bool)
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, max_question_len)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, max_expected_len)
print('done')

Vectorization...
done


In [111]:
# Shuffle (x, y) in unison as the later parts of x will almost all be larger
# digits.
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]

# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) // 10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]

print('Training Data:')
print(x_train.shape)
print(y_train.shape)

print('Validation Data:')
print(x_val.shape)
print(y_val.shape)


Training Data:
(35937, 31, 40)
(35937, 32, 40)
Validation Data:
(3992, 31, 40)
(3992, 32, 40)


In [114]:
# Try replacing GRU, or SimpleRNN.
RNN = layers.LSTM
HIDDEN_SIZE = 128
LAYERS = 1

print('Build model...')
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE.
# Note: In a situation where your input sequences have a variable length,
# use input_shape=(None, num_feature).
model.add(RNN(HIDDEN_SIZE, input_shape=(max_question_len, len(chars))))
# As the decoder RNN's input, repeatedly provide with the last hidden state of
# RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum
# length of output, e.g., when DIGITS=3, max output is 999+999=1998.
#model.add(layers.Dropout(DROP_OUT))
model.add(layers.RepeatVector(max_expected_len))
# The decoder RNN could be multiple layers stacked or a single layer.
for _ in range(LAYERS):
    # By setting return_sequences to True, return not only the last output but
    # all the outputs so far in the form of (num_samples, timesteps,
    # output_dim). This is necessary as TimeDistributed in the below expects
    # the first dimension to be the timesteps.
    model.add(RNN(HIDDEN_SIZE, return_sequences=True))
#    model.add(layers.Dropout(DROP_OUT))

# Apply a dense layer to the every temporal slice of an input. For each of step
# of the output sequence, decide which character should be chosen.
model.add(layers.TimeDistributed(layers.Dense(len(chars))))
model.add(layers.Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_31 (LSTM)               (None, 128)               86528     
_________________________________________________________________
repeat_vector_15 (RepeatVect (None, 32, 128)           0         
_________________________________________________________________
lstm_32 (LSTM)               (None, 32, 128)           131584    
_________________________________________________________________
time_distributed_14 (TimeDis (None, 32, 40)            5160      
_________________________________________________________________
activation_14 (Activation)   (None, 32, 40)            0         
Total params: 223,272
Trainable params: 223,272
Non-trainable params: 0
_________________________________________________________________


In [122]:
def create_seq2seq(num_nodes, num_layers):
    question = Input(shape=(max_question_len, len(chars)), name='question')
    repeat = RepeatVector(max_expected_len)(question)
    prev = question
    for _ in range(num_layers):
        lstm = LSTM(num_nodes, return_sequences=True, name='lstm_layer_%d' % (i + 1))(prev)
        prev = lstm
    dense = TimeDistributed(Dense(num_chars, name='dense', activation='softmax'))(prev)
    model = Model(inputs=[input], outputs=[dense])
    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

seq2seq = create_seq2seq(128, 1)

ValueError: Input 0 is incompatible with layer repeat_vector_16: expected ndim=2, found ndim=3

In [115]:

BATCH_SIZE = 2048

# Train the model each generation and show predictions against the validation
# dataset.
for iteration in range(1, 200):
    model.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=10,
              validation_data=(x_val, y_val))
    print()
    print('-' * 50)
    print('Iteration', iteration)
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = model.predict_classes(rowx, verbose=0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print(q[::-1] if INVERT else q, '(%s)' % correct, '-', guess)


Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 1
                     barracouta (barracoutas                     ) - aaaaaiieess                     
                        geebung (geebungs                        ) - aaaaeess                        
                        shylock (shylocks                        ) - aaaaeess                        
                     forefather (forefathers                     ) - aaaaiiieess                     
                      pantheism (pantheisms                      ) - aaaaiiees                       
                       mantidae (mantidaes                       ) - aaaaeeess                       
                        gingiva (gingivas                        ) - aaaaeess                        
                     confidence (confidences                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 4
                       brummell (brummells                       ) - aaaaiiees                       
                       addendum (addendums                       ) - aaaaiiees                       
                     rakishness (rakishnesses                    ) - aaoooiiiies                     
                phyllocladaceae (phyllocladaceaes                ) - ceeooooiiiiiies                 
                        faddist (faddists                        ) - aaaiiees                        
                       penlight (penlights                       ) - aaaaiiies                       
                         calico (calicoes                        ) - aaaiies                         
                     craniology (craniologies                    ) - caoooiiiiees                    
                     salivation (salivations                     ) - aaoooiiiees                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 8
                       exchange (exchanges                       ) - saraiiees                       
                      areflexia (areflexias                      ) - carraiiies                      
                          morus (moruses                         ) - saaiess                         
                  lymphadenitis (lymphadenitises                 ) - ceerrooiiiiiees                 
                      intrusion (intrusions                      ) - carraiiies                      
                           worm (worms                           ) - saaes                           
                      diathesis (diatheses                       ) - carraiiies                      
                      thripidae (thripidaes                      ) - carraiiies                      
                    

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 11
                      soundness (soundnesses                     ) - carroiiiees                     
                         ordeal (ordeals                         ) - sariies                         
                       drepanis (drepani                         ) - saraiiies                       
                        sledder (sledders                        ) - sariiees                        
                  extravasation (extravasations                  ) - cerroooiiiiees                  
                           jong (jongs                           ) - saees                           
                           lamb (lambs                           ) - saaes                           
                         charon (charons                         ) - sariies                         
                      rosinweed (rosinweeds                      ) - sarriiiees                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 15
                       cakewalk (cakewalks                       ) - coooooons                       
                       eviction (evictions                       ) - caaoiions                       
                     bookdealer (bookdealers                     ) - carooaiiies                     
                         siskin (siskins                         ) - sariies                         
                        channel (channels                        ) - saraiies                        
                     eriobotrya (eriobotryas                     ) - caroooiiins                     
                       eunectes (eunecte                         ) - sereeeees                       
                          perth (perths                          ) - sereees                         
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 18
                        yobibit (yobibits                        ) - saraiias                        
                           size (sizes                           ) - saaes                           
                      shipowner (shipowners                      ) - sareeaiirs                      
                    counterfeit (counterfeits                    ) - cereooaaiirs                    
                       mujtihad (mujtihads                       ) - sareaiias                       
                 septuagenarian (septuagenarians                 ) - cereooooaaiions                 
                      indriidae (indriidaes                      ) - caraaiines                      
                           haik (haiks                           ) - soaks                           
                       wrangler (wranglers                       ) - sareeiers                  

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 22
                    eisenstaedt (eisenstaedts                    ) - cereooaatirs                    
                       drafting (draftings                       ) - sartiings                       
                    technocracy (technocracies                   ) - cereoooatiies                   
                 megasporophyll (megasporophylls                 ) - cereoooooaaiits                 
                        megaton (megatons                        ) - saraions                        
                         cynara (cynaras                         ) - saraeas                         
                 rhodymeniaceae (rhodymeniaceaes                 ) - cereooooaaiiaes                 
                      archangel (archangels                      ) - sareaaiias                      
                  a

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 25
                  gasterophilus (gasterophiluses                 ) - certooooinisses                 
                        monitor (monitors                        ) - sareeets                        
                            elf (elves                           ) - sots                            
                          farce (farces                          ) - saaies                          
                     television (televisions                     ) - sereeations                     
                      sinusitis (sinusitises                     ) - carieniuses                     
                         lutist (lutists                         ) - caaiu s                         
                     biometrics (biometric                       ) - caroooiiu                       
                      corncrake (corncrakes                      ) - careeaiies                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 29
               photoelectricity (photoelectricities              ) - cereeoooooiiiities              
                       arkansan (arkansans                       ) - careeiens                       
                       stardust (stardusts                       ) - coroiis                         
                       crockett (crocketts                       ) - sareetets                       
                      therapsid (therapsids                      ) - careeaieds                      
                          anger (angers                          ) - saaers                          
                         heming (hemings                         ) - saiings                         
                       appendix (appendixes                      ) - careeaties                      
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 32
                        process (processes                       ) - sannesses                       
                        dasyure (dasyures                        ) - sanaates                        
                         alisma (alismas                         ) - sartoas                         
                          brace (braces                          ) - salles                          
                   cladoniaceae (cladoniaceaes                   ) - cerooooainaes                   
                      shareware (sharewares                      ) - sareeaotes                      
                      decameter (decameters                      ) - soreetters                      
                     soundtrack (soundtracks                     ) - carooeaaiks                     
                      indinavir (indinavirs                      ) - soreeaiets                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 36
                         tabora (taboras                         ) - sartras                         
                       balancer (balancers                       ) - sareeiers                       
                     fiberboard (fiberboards                     ) - careeeiinds                     
                        levator (levators                        ) - sareiors                        
                   solanopteris (solanopteri                     ) - ceroooooaii                     
                   hypocreaceae (hypocreaceaes                   ) - cerooooainaes                   
                     catostomid (catostomids                     ) - sareeeiieds                     
                       arranger (arrangers                       ) - sareeiers                       
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 39
                     incipiency (incipiencies                    ) - caliiiiilies                    
                           thea (theas                           ) - saras                           
                        mukalla (mukallas                        ) - caraaias                        
                       keepsake (keepsakes                       ) - seeeeeces                       
                 dolichocephaly (dolichocephalies                ) - cereoooooiaieles                
                          stoat (stoats                          ) - salats                          
                   mylodontidae (mylodontidaes                   ) - palooooiidaes                   
                         stylet (stylets                         ) - sallets                         
                       abortion (abortions                       ) - calations                  

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 43
                     vertebrate (vertebrates                     ) - sereeeentes                     
                     competence (competences                     ) - seeeeeeeces                     
                      kanamycin (kanamycins                      ) - caliaaians                      
                  gasteromycete (gasteromycetes                  ) - ceteeooeeentes                  
                        frustum (frustums                        ) - senniums                        
                         morgen (morgens                         ) - sermens                         
                           muir (muirs                           ) - ciars                           
                            qin (qins                            ) - lins                            
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 46
                     pedionomus (pedionomuses                    ) - sereeeeeuses                    
                      racetrack (racetracks                      ) - cereeences                      
                       speculum (speculums                       ) - saliiiums                       
                    nephelinite (nephelinites                    ) - ciliiiaaites                    
                        custody (custodies                       ) - shssceles                       
                         bayard (bayards                         ) - barinds                         
                          barye (baryes                          ) - taries                          
                          bimbo (bimbos                          ) - aarbles                         
                          xyris (xyri                            ) - calus                      

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 50
                        rounder (rounders                        ) - denngers                        
                        burning (burnings                        ) - dandings                        
                      neophobia (neophobias                      ) - mooooooias                      
                       firedamp (firedamps                       ) - aenertaps                       
                 autosuggestion (autosuggestions                 ) - deeeeeerrations                 
                      normandie (normandies                      ) - dnnnnnnces                      
                         clique (cliques                         ) - lulques                         
                          rally (rallies                         ) - callles                         
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 53
                    psychedelia (psychedelias                    ) - shphhaaanias                    
                          actin (actins                          ) - caions                          
                          synod (synods                          ) - soords                          
                         caelum (caelums                         ) - pariums                         
                          krebs (kreb                            ) - mera                            
                          jihad (jihads                          ) - maaids                          
                       chestnut (chestnuts                       ) - shseeeuts                       
                     rosmarinus (rosmarinuses                    ) - saraiiaruses                    
                           hope (hopes                           ) - popes                      

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 57
                     monomaniac (monomaniacs                     ) - mnnooonnias                     
                      tailstock (tailstocks                      ) - aosssiccks                      
                         gazebo (gazeboes                        ) - gagbals                         
                       parrotia (parrotias                       ) - paraatias                       
                      orleanism (orleanisms                      ) - aanaanisms                      
                      meningism (meningisms                      ) - menaanisms                      
                   strophanthus (strophanthuses                  ) - chrrooheehuses                  
                        waggery (waggeries                       ) - banderoes                       
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 60
                         cortes (corte                           ) - corte                           
                     chromosome (chromosomes                     ) - choooohores                     
                 auriculariales (auriculariale                   ) - ausrraaliiale                   
                         bihari (biharis                         ) - baitrls                         
                          glebe (glebes                          ) - gebles                          
                           napu (napus                           ) - darus                           
                        cassock (cassocks                        ) - cossocks                        
                         packer (packers                         ) - paakers                         
                         galena (galenas                         ) - garanas                    

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 64
                       midplane (midplanes                       ) - melilines                       
                       josephus (josephuses                      ) - gosshhuses                      
                     okeechobee (okeechobees                     ) - lleessholes                     
                         helmet (helmets                         ) - hemiets                         
                       sundries (sundry                          ) - sunvine                         
                        massine (massines                        ) - messines                        
                            dub (dubs                            ) - dubs                            
                          cocos (coco                            ) - coco                            
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 67
                       starship (starships                       ) - starphops                       
                     descriptor (descriptors                     ) - desterctors                     
                     languisher (languishers                     ) - lindeesters                     
                      pygmalion (pygmalions                      ) - pallalions                      
                  ratiocination (ratiocinations                  ) - repioiiiations                  
                         isohel (isohels                         ) - ocssils                         
                       mantidae (mantidaes                       ) - mariidaes                       
                     providence (providences                     ) - phininnnces                     
               hydrocharitaceae (hydrocharitaceaes               ) - hyloooooeeaaceaes          

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 71
                       addendum (addendums                       ) - anddndums                       
                         poodle (poodles                         ) - pooneas                         
                        dictate (dictates                        ) - distetrs                        
                       ultimacy (ultimacies                      ) - ertitnices                      
                          deity (deities                         ) - deitiks                         
                      kanamycin (kanamycins                      ) - rinooolins                      
                   clannishness (clannishnesses                  ) - clinesssnesses                  
                           grad (grads                           ) - grane                           
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 74
                           eden (edens                           ) - edens                           
                        realist (realists                        ) - rellists                        
                      saxitoxin (saxitoxins                      ) - saiiotiins                      
                          sepal (sepals                          ) - sepils                          
                      sheepskin (sheepskins                      ) - sheesshins                      
                      balalaika (balalaikas                      ) - balllaacas                      
                    leatherleaf (leatherleaves                   ) - lacterlasiies                   
                    dialeurodes (dialeurode                      ) - dirlrleare                      
                        teacher (teachers                        ) - terchers                   

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 78
                    gravedigger (gravediggers                    ) - gredeedegers                    
                         gurkha (gurkhas                         ) - gukhhas                         
                    coffeeberry (coffeeberries                   ) - confeeeerries                   
                        percher (perchers                        ) - perchers                        
                      indiction (indictions                      ) - indictions                      
                   chaetognatha (chaetognathas                   ) - sharoonaattas                   
                    biliousness (biliousnesses                   ) - biulsssnesses                   
                    carotenemia (carotenemias                    ) - cartoeeerias                    
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 81
                        cyperus (cyperuses                       ) - cyprruses                       
                          ursus (ursuses                         ) - uuuuses                         
                            raw (raws                            ) - raws                            
                    sarcoidosis (sarcoidoses                     ) - sartooooses                     
                       maldives (maldive                         ) - malvive                         
                          bongo (bongos                          ) - bongos                          
                         antlia (antlias                         ) - antiias                         
                      homeopath (homeopaths                      ) - hommmppths                      
                      siltstone (siltstones                      ) - sinsstones                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 85
                          gripe (gripes                          ) - gripes                          
                    deuteronomy (deuteronomies                   ) - deutooonomies                   
                         parser (parsers                         ) - parsers                         
                       escargot (escargots                       ) - estorrots                       
                        torreya (torreyas                        ) - torraeas                        
                        kunzite (kunzites                        ) - tunnites                        
                       roomette (roomettes                       ) - rommettes                       
                          fraud (frauds                          ) - frards                          
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 88
                        logania (loganias                        ) - loganias                        
                         waffle (waffles                         ) - waffles                         
                        weigela (weigelas                        ) - weneelas                        
                       bramidae (bramidaes                       ) - bramidaes                       
                  actinidiaceae (actinidiaceaes                  ) - astinanaaceaes                  
                         shebat (shebats                         ) - shebats                         
                     conscience (consciences                     ) - consscences                     
                         ribald (ribalds                         ) - ribblds                         
                     statehouse (statehouses                     ) - stattoouses                

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 92
             disenfranchisement (disenfranchisements             ) - diseeneneeeessments             
                     activeness (activenesses                    ) - actevenesses                    
                       overtone (overtones                       ) - overtones                       
                         theist (theists                         ) - theists                         
                          depth (depths                          ) - dephhs                          
                      verdigris (verdigri                        ) - verrnidi                        
                       hologram (holograms                       ) - hollgrams                       
                        baldwin (baldwins                        ) - balddens                        
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 95
                          polyp (polyps                          ) - polyps                          
                    aglaomorpha (aglaomorphas                    ) - agaooorpphas                    
                          rotor (rotors                          ) - rotors                          
                      klopstock (klopstocks                      ) - closstocks                      
                       jasminum (jasminums                       ) - jasiniums                       
                     paralogism (paralogisms                     ) - parooogisms                     
                         knower (knowers                         ) - wwwwers                         
                         godwit (godwits                         ) - godwits                         
                  plectomycetes (plectomycete                    ) - plechoaatete               

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 99
                     culdoscopy (culdoscopies                    ) - culloscomies                    
                      islamabad (islamabads                      ) - ishababads                      
                          voile (voiles                          ) - vovles                          
                         tiptoe (tiptoes                         ) - titties                         
                            pup (pups                            ) - pups                            
                softheartedness (softheartednesses               ) - sonthettteenesses               
                      chocolate (chocolates                      ) - chocllates                      
                         elmont (elmonts                         ) - elionts                         
                   

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 102
                        playoff (playoffs                        ) - plalomfs                        
                     comeliness (comelinesses                    ) - comllinesses                    
                         nudism (nudisms                         ) - nudisms                         
                    pumpkinseed (pumpkinseeds                    ) - pupsessseeds                    
                        dignity (dignities                       ) - digiities                       
                        eelworm (eelworms                        ) - elwworts                        
                    snakeblenny (snakeblennies                   ) - slaseldeneies                   
                       addition (additions                       ) - additions                       
                 demonetization (demonetizations                 ) - denomntizations           

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 106
                    malebranche (malebranches                    ) - mareerargers                    
                      arianrhod (arianrhods                      ) - arrnnhhads                      
                    protohippus (protohippuses                   ) - protoopppuses                   
                          amman (ammen                           ) - ammen                           
                     rhinoceros (rhinoceroses                    ) - phidocoro                       
                         hoenir (hoenirs                         ) - honnirs                         
                        farrell (farrells                        ) - farrells                        
                       chlorite (chlorites                       ) - chlorites                       
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 109
                        proverb (proverbs                        ) - proverbs                        
                     slanginess (slanginesses                    ) - slanginesses                    
                       zaragoza (zaragozas                       ) - zaradazas                       
                       picariae (picariaes                       ) - picarays                        
                        tarweed (tarweeds                        ) - tareeeds                        
                        ghrelin (ghrelins                        ) - greelins                        
                      politburo (politburoes                     ) - polocuoroes                     
                         buckle (buckles                         ) - buckles                         
                balanoposthitis (balanoposthitises               ) - balrocoochhttises         

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 113
                        fitment (fitments                        ) - fiteents                        
                   parametritis (parametritises                  ) - parrrrteitises                  
                   hypocreaceae (hypocreaceaes                   ) - hypootraceaes                   
                        wingman (wingmen                         ) - wingmen                         
                          mania (manias                          ) - manias                          
                          curio (curios                          ) - curios                          
                       nematode (nematodes                       ) - nemahades                       
                           kidd (kidds                           ) - kidds                           
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 116
                    allopurinol (allopurinols                    ) - allocoronals                    
                          decay (decays                          ) - decays                          
                          anjou (anjous                          ) - anjous                          
                     glochidium (glochidiums                     ) - glochidiums                     
                    biocatalyst (biocatalysts                    ) - biactoalasts                    
                     eriocaulon (eriocaulons                     ) - eronculuons                     
                     eriocaulon (eriocaulons                     ) - eronculuons                     
                    matriculate (matriculates                    ) - matriculates                    
                       bolivian (bolivians                       ) - bolivians                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 120
                      poisoning (poisonings                      ) - poosonings                      
                         gander (ganders                         ) - ganders                         
                         sterna (sternas                         ) - sternas                         
                          idiom (idioms                          ) - idioms                          
                   lepidocybium (lepidocybiums                   ) - lepinocyliums                   
                            jig (jigs                            ) - jigs                            
                         little (littles                         ) - littles                         
                      sociopath (sociopaths                      ) - socoopaths                      
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 123
                       overbite (overbites                       ) - overbites                       
                    vladivostok (vladivostoks                    ) - fallddostils                    
                       sourball (sourballs                       ) - sourbllls                       
                      bandwagon (bandwagons                      ) - bangaagons                      
                        reading (readings                        ) - readings                        
                       shooting (shootings                       ) - shootings                       
                           sial (sials                           ) - sials                           
                        norfolk (norfolks                        ) - norfolks                        
                         ambush (ambushes                        ) - ambushes                  

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 127
                            spy (spies                           ) - spis                            
                    melancholia (melancholias                    ) - melancholias                    
                      mentzelia (mentzelias                      ) - menthelias                      
                         little (littles                         ) - littles                         
                           jung (jungs                           ) - jungs                           
                         guimpe (guimpes                         ) - gummpes                         
                         pallas (palla                           ) - palla                           
                     stuffiness (stuffinesses                    ) - stuffinesses                    
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 130
                         tiptoe (tiptoes                         ) - tiptoes                         
                        hinault (hinaults                        ) - hinallts                        
                           whiz (whizzes                         ) - whizzs                          
                       headshot (headshots                       ) - headshots                       
                       contrail (contrails                       ) - contrrils                       
                      profusion (profusions                      ) - profusions                      
                        waffler (wafflers                        ) - wafflers                        
                       lordship (lordships                       ) - lordships                       
                     fibrositis (fibrositises                    ) - fimrrsitis                

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 134
                         buzzer (buzzers                         ) - buzzers                         
                       titterer (titterers                       ) - titterers                       
                 rhodymeniaceae (rhodymeniaceaes                 ) - rhomamnnnaceaes                 
                         bihari (biharis                         ) - biharis                         
                          brier (briers                          ) - briers                          
                       drollery (drolleries                      ) - drllleries                      
                  morchellaceae (morchellaceaes                  ) - morchellaccaes                  
                        gatling (gatlings                        ) - gatlings                        
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 137
                          booby (boobies                         ) - boobies                         
                      indemnity (indemnities                     ) - indemnities                     
                      caproidae (caproidaes                      ) - caproidaes                      
                   epicureanism (epicureanisms                   ) - epictrranisms                   
                       giantess (giantesses                      ) - giaatesses                      
                    afroasiatic (afroasiatics                    ) - abriisaatics                    
                       spinacia (spinacias                       ) - stinacias                       
                           stay (stays                           ) - stays                           
                       thirteen (thirteens                       ) - thirteens                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 141
                    mesalliance (mesalliances                    ) - mesallaances                    
                         rotgut (rotguts                         ) - rotguts                         
                       proturan (proturans                       ) - proturans                       
                        mordant (mordants                        ) - mordants                        
                       pheasant (pheasants                       ) - pheasants                       
                    leatherette (leatherettes                    ) - leatheerttes                    
                   stupefaction (stupefactions                   ) - sputefuctions                   
                        sellers (seller                          ) - seller                          
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 144
                           luna (lunas                           ) - lunas                           
                       pacifist (pacifists                       ) - pacifists                       
                       dispatch (dispatches                      ) - dispatthes                      
                        shoebox (shoeboxes                       ) - shoeboxes                       
                   somatotropin (somatotropins                   ) - somttittopins                   
                         urease (ureases                         ) - ureaces                         
                    guadalajara (guadalajaras                    ) - geaalalaguas                    
                        alchemy (alchemies                       ) - alchemies                       
                  penicillamine (penicillamines                  ) - penicillamines            

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 148
                           mann (manns                           ) - manns                           
                  laboriousness (laboriousnesses                 ) - labrroousnesses                 
                     pastorship (pastorships                     ) - pastorships                     
                      bartender (bartenders                      ) - bartenders                      
                   commencement (commencements                   ) - commescements                   
                    transferrin (transferrins                    ) - transferrons                    
                        leather (leathers                        ) - leathers                        
                       ergotism (ergotisms                       ) - ergotisms                       
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 151
                      vaginitis (vaginitises                     ) - vaginitises                     
                        pretzel (pretzels                        ) - preteels                        
                    titillation (titillations                    ) - titillations                    
                        amputee (amputees                        ) - amputees                        
                  tractarianism (tractarianisms                  ) - tractaraanisms                  
                         rennet (rennets                         ) - rennets                         
                      surcharge (surcharges                      ) - surcharges                      
                        grogram (grograms                        ) - grograms                        
                         renter (renters                         ) - renters                   

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 155
                steatornithidae (steatornithidaes                ) - steatharottidaes                
                       vineyard (vineyards                       ) - vineyards                       
                          blank (blanks                          ) - blanks                          
                    dialeurodes (dialeurode                      ) - dillarrode                      
                pseudomonadales (pseudomonadale                  ) - pseudomonnaale                  
                        painter (painters                        ) - painters                        
                   pyxidanthera (pyxidantheras                   ) - pyxaninteeras                   
                     sensualist (sensualists                     ) - sensualists                     
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 158
                      tanacetum (tanacetums                      ) - tanaceeems                      
                          ricin (ricins                          ) - ricins                          
                   gastrulation (gastrulations                   ) - gastralations                   
                        caitiff (caitiffs                        ) - caitiffs                        
                        capitol (capitols                        ) - capitols                        
                           ruse (ruses                           ) - ruses                           
                      ligustrum (ligustrums                      ) - ligustrums                      
                      cataphyll (cataphylls                      ) - cataphylls                      
                    mauritanian (mauritanians                    ) - mauritinians              

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 162
                    parishioner (parishioners                    ) - parasiooners                    
                 rhodymeniaceae (rhodymeniaceaes                 ) - rhodymmenaceaes                 
                    teredinidae (teredinidaes                    ) - tereninidaes                    
                       chartist (chartists                       ) - chartists                       
                     readership (readerships                     ) - readerships                     
                            keg (kegs                            ) - kegs                            
                appropriateness (appropriatenesses               ) - apprrrpaitenesses               
                    overanxiety (overanxieties                   ) - overeiidities                   
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 165
                       lucidity (lucidities                      ) - lucidities                      
                         harlem (harlems                         ) - harlems                         
                       vittaria (vittarias                       ) - vittarias                       
                      bitternut (bitternuts                      ) - bitternuts                      
                 sporotrichosis (sporotrichoses                  ) - sporothochoses                  
                            rat (rats                            ) - rats                            
                         pliers (pliers                          ) - plier                           
                   catharanthus (catharanthuses                  ) - cathaaaathuses                  
                       keystone (keystones                       ) - keystones                 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 169
                  myxosporidian (myxosporidians                  ) - myoosporidians                  
                  ratiocination (ratiocinations                  ) - ratiiiiiations                  
                           sloe (sloes                           ) - sloes                           
                          money (monies                          ) - moneys                          
                     propositus (proposituses                    ) - proposituses                    
                       lothario (lotharios                       ) - lothariis                       
                         george (georges                         ) - georges                         
                     interlayer (interlayers                     ) - interllaers                     
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 172
                 intentionality (intentionalities                ) - intentinnalities                
                    handbreadth (handbreadths                    ) - handbrranttzs                   
                         permic (permics                         ) - permics                         
                          kasai (kasais                          ) - kasais                          
                      zebrawood (zebrawoods                      ) - zebrawoods                      
                     gingersnap (gingersnaps                     ) - gingrrssars                     
                           coho (cohoes                          ) - cohoes                          
                       proturan (proturans                       ) - proturans                       
                     rosmarinus (rosmarinuses                    ) - rosbarinuses              

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 176
                    sottishness (sottishnesses                   ) - sottishnesses                   
                       ridicule (ridicules                       ) - ridicules                       
                       ceremony (ceremonies                      ) - ceremonies                      
                        halesia (halesias                        ) - halesias                        
                       josephus (josephuses                      ) - josephuses                      
                     banishment (banishments                     ) - banishments                     
                     worthiness (worthinesses                    ) - worthinesses                    
                      tidewater (tidewaters                      ) - tidewaters                      
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 179
                         myriad (myriads                         ) - myriads                         
                         swivet (swivets                         ) - swivets                         
                        crochet (crochets                        ) - crochets                        
                           plow (plows                           ) - plows                           
                  swordsmanship (swordsmanships                  ) - swordsmanships                  
                      cambridge (cambridges                      ) - cambringes                      
                 clitoridectomy (clitoridectomies                ) - seicoddeettories                
                      glycerite (glycerites                      ) - glycerites                      
              chemiluminescence (chemiluminescences              ) - cheaiuimminstenaes        

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 183
                    ginkgopsida (ginkgopsidas                    ) - ginsmossinss                    
                       dairyman (dairymen                        ) - daizymens                       
                       letterer (letterers                       ) - letterrrs                       
                           elul (eluls                           ) - eluls                           
                         popgun (popguns                         ) - popunns                         
                       ergotism (ergotisms                       ) - eriotiits                       
                          carya (caryas                          ) - caryys                          
                       austrian (austrians                       ) - austrinns                       
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 186
                       standing (standings                       ) - standings                       
                        zamboni (zambonis                        ) - zambonis                        
                      caledonia (caledonias                      ) - caledonias                      
                          polyp (polyps                          ) - polyps                          
                     solicitude (solicitudes                     ) - solicitudes                     
                         coleus (coleuses                        ) - coleuses                        
                         arauca (araucas                         ) - araucas                         
                       graffito (graffiti                        ) - graffitos                       
                        sarazen (sarazens                        ) - sarazens                  

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 190
                          anvil (anvils                          ) - anvils                          
                      limicolae (limicolaes                      ) - limicolaes                      
                     leopardess (leopardesses                    ) - leopardesses                    
                         bodoni (bodonis                         ) - bodonis                         
                         puddle (puddles                         ) - puddles                         
                          mosul (mosuls                          ) - mosuls                          
                       sealskin (sealskins                       ) - sealskins                       
                          capek (capeks                          ) - capeks                          
                  

Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 193
                         cahoot (cahoots                         ) - cahoots                         
                   bilingualism (bilingualisms                   ) - bilingualists                   
                        aspirin (aspirins                        ) - aspirins                        
                           race (races                           ) - races                           
                     scoreboard (scoreboards                     ) - scoreboards                     
                        pompano (pompanoes                       ) - pompanoe                        
                          pepin (pepins                          ) - pepins                          
                    unsoundness (unsoundnesses                   ) - unspondnesses                   
                    misspelling (misspellings                    ) - misselllings              

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 197
                          valmy (valmies                         ) - valmies                         
                      spritsail (spritsails                      ) - spristails                      
                       entirety (entireties                      ) - entireties                      
                     movability (movabilities                    ) - movabilities                    
                       calisaya (calisayas                       ) - calisaaas                       
                        chancre (chancres                        ) - chancees                        
                          ladle (ladles                          ) - ladles                          
                     notoryctus (notoryctuses                    ) - notoryctuses                    
                  

In [86]:
shakespeare = strip_headers(load_etext(100))
tokens = [tuple(word) for word in tokenize(plays, to_lower=True)]
token_counts = Counter(tokens)

In [90]:
pairs = [(token[i], token[i + 1], token_id) for token_id, token in enumerate(tokens) for i in range(len(token) - 1)]

In [91]:
pairs[10]

('n', 'd', 3)

In [82]:
help(tokenize)

Help on function tokenize in module gensim.utils:

tokenize(text, lowercase=False, deacc=False, errors='strict', to_lower=False, lower=False)
    Iteratively yield tokens as unicode strings, removing accent marks
    and optionally lowercasing the unidoce string by assigning True
    to one of the parameters, lowercase, to_lower, or lower.
    
    Input text may be either unicode or utf8-encoded byte string.
    
    The tokens on output are maximal contiguous sequences of alphabetic
    characters (no digits!).
    
    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']



In [87]:
tokens[5]

('b', 'y')