In [1]:
import os
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Dense, Embedding, Input, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, SGD

Using TensorFlow backend.


In [2]:
# --- Text / vocabulary limits ---
MAX_VOCAB_SIZE = 3000        # passed to the Tokenizer as num_words; also caps embedding rows
MAX_SEQUENCE_LENGTH = 100    # upper bound on the padded sequence length
EMBEDDING_DIM = 100          # width of each word vector (the GloVe file loaded below is the 100d one)

# --- Model size ---
LATENT_DIM = 25              # LSTM units, and the size of the initial h / c state inputs

# --- Training schedule ---
BATCH_SIZE = 128
EPOCHS = 2000
VALIDATION_SPLIT = 0.2       # fraction handed to model.fit(validation_split=...)

Data Loading

In [29]:
# Build two parallel lists from the poem file, one entry per non-blank line:
#   input_texts[i]  = '<sos> ' + line   (what the model is fed)
#   target_texts[i] = line + ' <eos>'   (what the model should predict)
input_texts = []
target_texts = []

# A context manager closes the file handle deterministically; the original
# bare open() left it open until garbage collection.
with open('frost.txt') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # Skip blank lines.
            continue
        input_line = '<sos> ' + line
        target_line = line + ' <eos>'

        input_texts.append(input_line)
        target_texts.append(target_line)

# The tokenizer is fitted on both lists so that <sos> and <eos> both get an index.
all_lines = input_texts + target_texts

print(all_lines[0])

<sos> Two roads diverged in a yellow wood,


In [30]:
# Sanity check: the first target line is the raw text followed by the ' <eos>' marker.
print(target_texts[0])

Two roads diverged in a yellow wood, <eos>


Convert the sentences (strings) into sequences of integer word indices

In [31]:
# Word-level tokenizer. The filter string contains neither '<' nor '>', so the
# <sos>/<eos> markers are kept as whole tokens.
tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
)
tokenizer.fit_on_texts(all_lines)

# Encode inputs and targets with the same fitted vocabulary.
input_sequences, target_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (input_texts, target_texts)
)

print(input_sequences[0])
print(target_sequences[0])


[1, 99, 582, 583, 10, 7, 584, 585]
[99, 582, 583, 10, 7, 584, 585, 2]


In [32]:
# Length (in tokens) of the longest encoded input line, measured before padding.
max_sequence_length_from_data = max(map(len, input_sequences))
print('Max sequence length:', max_sequence_length_from_data)

Max sequence length: 13


In [33]:
# Word -> integer index mapping produced by fitting the tokenizer on all_lines.
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))


Found 2275 unique tokens.


In [34]:
# Reverse lookup (integer index -> word), used to turn sampled indices back into text.
idx2word = {index: token for token, index in word2idx.items()}

In [38]:
# Spot-check the reverse mapping for index 2.
idx2word[2]

'<eos>'

Pad the sequences so that we get an N x T matrix (N lines, T = padded sequence length)


In [41]:
# Pad to the longest line seen in the data, but never beyond the configured cap.
max_sequence_length = min(max_sequence_length_from_data, MAX_SEQUENCE_LENGTH)

# Pad inputs and targets identically, with zeros appended after the tokens
# ('post'), so both share the same sequence length.
input_sequences, target_sequences = (
    pad_sequences(seqs, maxlen=max_sequence_length, padding='post')
    for seqs in (input_sequences, target_sequences)
)

print(input_sequences[0])
print(target_sequences[0])

[  1  99 582 583  10   7 584 585   0   0   0   0   0]
[ 99 582 583  10   7 584 585   2   0   0   0   0   0]


load in pre-trained word vectors

In [48]:
print('Loading word vectors...')
# The GloVe file location can be overridden with the GLOVE_PATH environment
# variable, so the notebook is not tied to one machine. The default is the
# original hard-coded path, so existing setups behave exactly as before.
glove_path = os.environ.get('GLOVE_PATH', r'D:\udemy\glove.6B.100d.txt')

# word -> float32 vector. Each line of the file is parsed as
# "<word> <v1> <v2> ...", split on whitespace.
word2vec = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [49]:
# Inspect one loaded vector as a sanity check.
word2vec['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

prepare embedding matrix

In [50]:
print('Filling pre-trained embeddings...')
# One row per word index. The "+ 1" leaves room for row 0, which no word in
# word2idx is assigned here; the padded sequences use 0 as the fill value.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i >= MAX_VOCAB_SIZE:
        # Index is outside the vocabulary cap; it has no row in the matrix.
        continue
    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        # Words without a pre-trained vector keep their all-zero row.
        continue
    embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


One-hot encode the targets (sparse categorical cross-entropy cannot be used here)

In [51]:
# Dense one-hot targets with shape (num samples, time steps, vocabulary size).
one_hot_targets = np.zeros((len(input_sequences), max_sequence_length, num_words))

# Set a 1 at [sample, step, word] for every non-padding target token.
# Positions whose target index is 0 stay all-zero.
target_array = np.asarray(target_sequences)
sample_idx, step_idx = np.nonzero(target_array > 0)
one_hot_targets[sample_idx, step_idx, target_array[sample_idx, step_idx]] = 1

load pre-trained word embeddings into an Embedding layer

In [53]:
# Embedding layer initialised from the pre-trained matrix. No `trainable`
# argument is passed, so the library default applies.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
)

Model Building

In [56]:
print('Building model...')

# Training graph: a whole padded sequence goes in, and the model emits a
# softmax over the vocabulary at every time step. The LSTM's initial hidden
# and cell states are explicit inputs so that the sampling model, which
# reuses `lstm`, `dense`, `initial_h` and `initial_c`, can feed states back in.
input_ = Input(shape=(max_sequence_length,))
initial_h = Input(shape=(LATENT_DIM,))
initial_c = Input(shape=(LATENT_DIM,))
x = embedding_layer(input_)
lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
x, _, _ = lstm(x, initial_state=[initial_h, initial_c])  # final states are not needed for training
dense = Dense(num_words, activation='softmax')
output = dense(x)
model = Model([input_, initial_h, initial_c], output)

# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()

Building model...
Instructions for updating:
Colocations handled automatically by placer.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 13)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 13, 100)      227600      input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 25)           0                                            
__

In [57]:
# Targets are one-hot encoded, hence the (non-sparse) categorical cross-entropy loss.
model.compile(
    optimizer=Adam(lr=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Training model

In [58]:
print('Training model...')
# All-zero array, one row per sample, fed as both the initial hidden state
# and the initial cell state of the LSTM.
z = np.zeros((len(input_sequences), LATENT_DIM))
r = model.fit(
    x=[input_sequences, z, z],
    y=one_hot_targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

Training model...
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1148 samples, validate on 288 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 

Sampling Model

In [59]:
# Single-step model used for generation. It calls the same `embedding_layer`,
# `lstm` and `dense` objects as the training model, but takes one token per
# call and also returns the LSTM states so they can be fed into the next call.
input2 = Input(shape=(1,))
x = embedding_layer(input2)
x, h, c = lstm(x, initial_state=[initial_h, initial_c])
output2 = dense(x)
sampling_model = Model(
    inputs=[input2, initial_h, initial_c],
    outputs=[output2, h, c],
)

In [60]:
def created_line():
    """Sample one line of text from ``sampling_model``, one token at a time.

    Generation starts from the ``<sos>`` token with all-zero LSTM states.
    At each step the predicted distribution has index 0 zeroed out and is
    renormalised, so index 0 is never drawn. The next token is then drawn
    at random from that distribution and fed back in together with the
    returned states. Sampling stops when ``<eos>`` is drawn or after
    ``max_sequence_length`` steps.

    Relies on the notebook-level names ``word2idx``, ``idx2word``,
    ``sampling_model``, ``LATENT_DIM`` and ``max_sequence_length``.

    Returns:
        str: the sampled words joined by single spaces (``<eos>`` excluded).
    """
    token_in = np.array([[word2idx['<sos>']]])
    state_h = np.zeros((1, LATENT_DIM))
    state_c = np.zeros((1, LATENT_DIM))
    eos_idx = word2idx['<eos>']
    words = []

    for _step in range(max_sequence_length):
        step_out, state_h, state_c = sampling_model.predict([token_in, state_h, state_c])
        probs = step_out[0, 0]

        # Diagnostic: report when index 0 is the single most likely prediction.
        if np.argmax(probs) == 0:
            print("wtf")

        # Rule out index 0, then renormalise so the vector is a valid distribution.
        probs[0] = 0
        probs /= probs.sum()

        next_idx = np.random.choice(len(probs), p=probs)
        if next_idx == eos_idx:
            break

        words.append(idx2word.get(next_idx, '<WTF %s>' % next_idx))
        # Feed the sampled token back in as the next input.
        token_in[0, 0] = next_idx

    return ' '.join(words)

Generation of four-line poems

In [67]:
# Sample and print four lines; each created_line() call starts again from <sos>.
for _ in range(4):
    print(created_line())

in summer when i passed the place
mother yes we could too son tell the truth for once
the hillside on the day the sun lets go
two miles it was


In [72]:
# Sample and print four lines; each created_line() call starts again from <sos>.
for _ in range(4):
    print(created_line())

why north her books i wander out of beaten ways
upon a crooked know it would have ever cared make a post
what do you think you're like to hear
is back into the cellar in spring


In [82]:
# Sample and print four lines; each created_line() call starts again from <sos>.
for _ in range(4):
    print(created_line())

to stand together on the crater's verge
i don't remember why i ever
'under the shelter of the family tree '
and see if you wish to and


In [83]:
# Sample and print four lines; each created_line() call starts again from <sos>.
for _ in range(4):
    print(created_line())

the cellar windows were banked up with sawdust
her early leaf's a flower
and build the family it would do it ended beside the track
i'll find that fountain if it takes there some magic of the sun


In [90]:
# Sample and print four lines; each created_line() call starts again from <sos>.
for _ in range(4):
    print(created_line())

a likeness to surprise the thrilly tourist
brown makes there wet snow berries
i was half boring through half climbing through
then we know they made him nail the knob
