In [17]:
from nltk.corpus import brown
from __future__ import print_function
from nltk.tokenize import word_tokenize
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random
import sys
import os
import h5py

directory ='/home/sinah/'
#list_topic=['news','editorial','reviews','religion','hobbies','lore','government','learned','fiction','mystery','science_fiction','adventure','romance','humor']
list_topic=['editorial']
def create_dic():
    vocab={}
    for topics in list_topic:
        for item in brown.words(categories=topics):
            try:
                vocab[str(item)] = vocab[str(item)] +1
            except:
                 vocab[str(item)] = 1
    key_list= sorted(vocab, key=vocab.get,reverse=True)
    print('Number of words in text..', len(key_list))
    vocab_object=open(directory + 'vocab.txt','w')
    for things  in key_list:
        vocab_object.write(things.lower())
        vocab_object.write('\n')
    vocab_object.close()
    print('Dictionary created for all the words in one category of brown corpus and stored in vocab.txt..')

#Function that we will use later on for the sampling

def sample(a, temperature=1.0):
    # helper function to sample an index from a probability array
    a = np.log(a) / temperature
    a = np.exp(a) / np.sum(np.exp(a))
    return np.argmax(np.random.multinomial(1, a, 1))

#Create a dictionary and store it
create_dic()

# Create the words to index and vice versa indixes
vocabsize=9400
vector_embedding = 128
maxlen = 4
fixlen=3
print('Reading dictionary to create word to index and index to word indices...')
words=[]
text_open=open(directory + 'vocab.txt','r')
for item in text_open:
    words.append(item.strip('\n'))
word_indices = dict((w, i) for i, w in enumerate(words))
indices_word = dict((i, w) for i, w in enumerate(words))

# Create the sentences from the corpus

sentences = []
end_word=[]
for topics in list_topic:
    for sentence_tmp in brown.sents(categories = topics):
        if len(sentence_tmp) > maxlen:
            for counter in range(0,len(sentence_tmp)-fixlen):
                sentences.append(sentence_tmp[counter:counter+fixlen])
                end_word.append(sentence_tmp[counter+fixlen])
                
# Model
print('Building model...')
model = Sequential()
model.add(LSTM(input_dim=vocabsize+1, output_dim=vector_embedding, input_length=fixlen, return_sequences=True))
model.add(Activation('sigmoid'))
model.add(Dropout(0.2))
model.add(LSTM(input_dim=vector_embedding, output_dim=vector_embedding, input_length=fixlen, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocabsize+1))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# Vectorize
print('Vectorization...')
X = np.zeros((len(sentences), fixlen, vocabsize+1), dtype=np.bool)
y = np.zeros((len(sentences), vocabsize+1), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, wor in enumerate(sentence):
        try:
            X[i, t, word_indices[wor.lower()]] = 1
            y[i, word_indices[end_word[i].lower()]] = 1
        except:# for words those are outside the vocabsize
            X[i, t, vocabsize] = 1
            y[i, vocabsize] = 1
            
# Train and save         

model.fit(X, y, batch_size=128, nb_epoch=60)
score, acc = model.evaluate(X, y,batch_size=128,show_accuracy=True)
print('Test score:', score)
print('Test accuracy:', acc)
model.save_weights(directory + 'model_weight.hdf5',overwrite=True)

# Prediction
seed_vector = np.zeros((1, fixlen, vocabsize+1))
seed_sentence= "you cannot step"
for t, word in enumerate(sentence):
    seed_vector[0, t, word_indices[word]] = 1.
predicted_probabilties = model.predict(seed_vector, verbose=0)[0] # 0 because of only one sample
output_index = sample(predicted_probabilties)
output_word = indices_word[output_index]
print('After 20 iteration, seed_sentence is -', seed_sentence, '- followed by predicted word -', output_word)
                       

Number of words in text.. 9890
Dictionary created for all the words in one category of brown corpus and stored in vocab.txt..
Reading dictionary to create word to index and index to word indices...
Building model...
Vectorization...
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
Test score: 6.83658333634
Test accuracy: 0.0121916481513