In [84]:
#import libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM, Activation
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping

import numpy as np
import random
import os

In [86]:
def preprocessing(directory='data/input.txt', min_word_frequency=3):
    """Load the corpus, split it into words and build the vocabulary.

    The text is lower-cased and every newline is padded with spaces so that
    '\n' survives the split on ' ' as a token of its own.

    Args:
        directory: Path of the plain-text corpus file to read.
        min_word_frequency: Words occurring fewer times than this are
            excluded from the vocabulary and reported in ``ignored_words``.

    Returns:
        A 5-tuple ``(word_indices, indices_word, text_in_words,
        ignored_words, MIN_WORD_FREQUENCY)`` where
        ``word_indices`` maps word -> integer index (alphabetical order),
        ``indices_word`` is the inverse mapping,
        ``text_in_words`` is the full token list (ignored words included),
        ``ignored_words`` is the set of words below the frequency threshold,
        ``MIN_WORD_FREQUENCY`` echoes the threshold that was applied.
    """
    # Context manager guarantees the file handle is closed after reading.
    with open(directory) as corpus_file:
        file = corpus_file.read().lower().replace('\n', ' \n ')
    print('Corpus length in characters : {}'.format(len(file)))

    # Keep every non-blank token plus the explicit newline token.
    text_in_words = [w for w in file.split(' ') if w.strip() != '' or w == '\n']
    print('Corpus length in words : {}'.format(len(text_in_words)))

    # hyperparameter
    MIN_WORD_FREQUENCY = min_word_frequency

    # Calculate word frequencies
    word_freq = {}
    for w in text_in_words:
        word_freq[w] = word_freq.get(w, 0) + 1

    ignored_words = {
        word for word, count in word_freq.items() if count < MIN_WORD_FREQUENCY
    }

    words = set(text_in_words)
    print("Unique words before ignoring : {}".format(len(words)))
    print("Ignoring words with frequency < : {}".format(MIN_WORD_FREQUENCY))
    # Sorting makes the word <-> index assignment deterministic across runs.
    words = sorted(words - ignored_words)
    print("Unique words after ignoring : {}".format(len(words)))

    word_indices = {word: i for i, word in enumerate(words)}
    indices_word = {i: word for i, word in enumerate(words)}

    return word_indices, indices_word, text_in_words, ignored_words, MIN_WORD_FREQUENCY

In [81]:
#Creating and Filtering the Sequence
def filter_and_sequence(text_in_words, ignored_words, sequence_len=8, step=1):
    """Slice the token stream into fixed-length inputs and next-word targets.

    A window of ``sequence_len + 1`` consecutive tokens is taken every
    ``step`` tokens. The first ``sequence_len`` tokens form the input
    sequence and the last token is the word to predict. Windows containing
    any ignored word (in the input or the target) are dropped.

    Args:
        text_in_words: List of tokens making up the corpus.
        ignored_words: Iterable of words that must not appear in any kept
            window.
        sequence_len: Number of input words per sequence (hyperparameter).
        step: Stride between the starts of consecutive windows.

    Returns:
        ``(sentences, next_words, SEQUENCE_LEN)``: the kept input sequences,
        their aligned target words, and the sequence length that was used.

    Raises:
        ValueError: If ``sequence_len`` or ``step`` is smaller than 1.
    """
    if sequence_len < 1 or step < 1:
        raise ValueError('sequence_len and step must both be >= 1')

    # hyperparameters
    SEQUENCE_LEN = sequence_len
    STEP = step

    # Materialise once so membership checks are O(1) for any iterable input.
    ignored_words = frozenset(ignored_words)

    sentences = []
    next_words = []
    ignored = 0
    for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
        window = text_in_words[i:i + SEQUENCE_LEN + 1]
        if ignored_words.isdisjoint(window):
            sentences.append(window[:SEQUENCE_LEN])
            next_words.append(window[SEQUENCE_LEN])
        else:
            ignored = ignored + 1

    print('Ignored Sentences : {}'.format(ignored))
    print('Remaining Sequences : {}'.format(len(sentences)))

    return sentences, next_words, SEQUENCE_LEN

In [68]:
# Training and Test Set shuffling
def shuffle_and_split(sentences, next_words):
    """Shuffle sequence/target pairs together and split them 90/10.

    The inputs are left untouched: shuffled copies are built instead of
    overwriting the caller's lists in place. Empty input is handled and
    yields four empty lists.

    Args:
        sentences: List of input word sequences.
        next_words: List of target words, aligned index-by-index with
            ``sentences``.

    Returns:
        ``(sentences, next_words, sentences_test, next_words_test)`` where the
        first two hold the first 90% of the shuffled pairs (training set) and
        the last two hold the remaining 10% (test set).
    """
    # Shuffle as pairs so each sequence stays aligned with its target word.
    combined = list(zip(sentences, next_words))
    random.shuffle(combined)

    shuffled_sentences = [sentence for sentence, _ in combined]
    shuffled_next_words = [next_word for _, next_word in combined]

    # train test split ratio is 90-10
    split_at = int(len(combined) * 0.9)

    return (
        shuffled_sentences[:split_at],
        shuffled_next_words[:split_at],
        shuffled_sentences[split_at:],
        shuffled_next_words[split_at:],
    )

In [69]:
#Build the model
def get_model(dropout=0.2, sequence_len=None, vocab_size=None):
    """Build the bidirectional-LSTM next-word prediction model.

    Architecture: Bidirectional(LSTM(128)) -> Dropout -> Dense(vocab_size)
    -> softmax. The Dense layer projects the recurrent features onto the
    vocabulary so the softmax output has one probability per word.

    Args:
        dropout: Dropout rate applied to the recurrent layer's output.
        sequence_len: Number of words per input sequence. Defaults to the
            module-level ``SEQUENCE_LEN``.
        vocab_size: Number of words in the vocabulary (width of the one-hot
            input and of the output). Defaults to ``len(word_indices)`` from
            module scope.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # Fall back to the notebook globals so existing `get_model()` calls work.
    if sequence_len is None:
        sequence_len = SEQUENCE_LEN
    if vocab_size is None:
        vocab_size = len(word_indices)

    model = Sequential()
    model.add(Bidirectional(LSTM(128), input_shape=(sequence_len, vocab_size)))
    model.add(Dropout(dropout))
    # Without this projection the softmax would be taken over the 256
    # BiLSTM features and could not be compared with one-hot word targets.
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    return model

In [79]:
# Data Generator
def data_generator(sentence_list, next_word_list, batch_size,
                   word_to_index=None, sequence_len=None):
    """Yield one-hot encoded ``(x, y)`` batches forever.

    Samples are taken in order and wrap around to the start of the data once
    the end is reached; the position carries over from one batch to the next.

    Args:
        sentence_list: List of word sequences (each a list of words).
        next_word_list: Target word for each sequence, aligned with
            ``sentence_list``.
        batch_size: Number of samples per yielded batch.
        word_to_index: Mapping word -> vocabulary index. Defaults to the
            module-level ``word_indices``.
        sequence_len: Number of words per sequence. Defaults to the length of
            the first sequence in ``sentence_list``.

    Yields:
        ``(x, y)`` boolean arrays of shape
        ``(batch_size, sequence_len, vocab_size)`` and
        ``(batch_size, vocab_size)``.

    Raises:
        ValueError: If ``sentence_list`` is empty (raised on first iteration).
    """
    if word_to_index is None:
        word_to_index = word_indices
    num_samples = len(sentence_list)
    if num_samples == 0:
        raise ValueError('sentence_list must contain at least one sequence')
    if sequence_len is None:
        sequence_len = len(sentence_list[0])
    vocab_size = len(word_to_index)

    index = 0
    while True:
        # Builtin `bool`: the `np.bool` alias was removed in NumPy 1.24.
        x = np.zeros((batch_size, sequence_len, vocab_size), dtype=bool)
        y = np.zeros((batch_size, vocab_size), dtype=bool)
        for i in range(batch_size):
            position = index % num_samples
            for t, w in enumerate(sentence_list[position]):
                x[i, t, word_to_index[w]] = 1
            y[i, word_to_index[next_word_list[position]]] = 1
            index = index + 1
        # Must be inside the loop, otherwise no batch is ever produced.
        yield x, y

In [None]:
def on_epoch_end(epoch=None, logs=None, num_words=20):
    """Print a short sample of generated text; intended as a Keras callback.

    ``LambdaCallback`` invokes ``on_epoch_end`` with ``(epoch, logs)``, so
    both are accepted (and optional, so a bare ``on_epoch_end()`` call still
    works). A random seed sequence is drawn from the held-out sequences
    (falling back to the training ones) and extended greedily, one most
    probable word at a time.

    Relies on the module-level names ``model``, ``sentences``,
    ``sentences_test``, ``word_indices`` and ``indices_word``.

    Args:
        epoch: Zero-based epoch index supplied by Keras; only printed.
        logs: Metrics dict supplied by Keras; unused.
        num_words: Number of words to generate after the seed.
    """
    pool = sentences_test if len(sentences_test) > 0 else sentences
    if len(pool) == 0:
        return  # nothing to seed the generation with

    seed = list(random.choice(pool))
    vocab_size = len(word_indices)

    print()
    print('----- Generating text after epoch: {}'.format(epoch))
    print('----- Seed: "{}"'.format(' '.join(seed)))

    window = seed
    generated = []
    for _ in range(num_words):
        # One-hot encode the current window exactly like the training data.
        x = np.zeros((1, len(window), vocab_size), dtype=bool)
        for t, w in enumerate(window):
            x[0, t, word_indices[w]] = 1
        probabilities = model.predict(x, verbose=0)[0]
        next_word = indices_word[int(np.argmax(probabilities))]
        generated.append(next_word)
        # Slide the window forward by one word.
        window = window[1:] + [next_word]

    print(' '.join(generated))

In [87]:
if __name__=="__main__":

    # exist_ok avoids the separate isdir check (and its race).
    os.makedirs('./checkpoints/', exist_ok=True)

    word_indices, indices_word, text_in_words, ignored_words, MIN_WORD_FREQUENCY = preprocessing()
    sentences, next_words, SEQUENCE_LEN = filter_and_sequence(text_in_words, ignored_words)
    sentences, next_words, sentences_test, next_words_test = shuffle_and_split(sentences, next_words)

    # Vocabulary in index order. preprocessing() does not return it, but the
    # code below (and helpers reading module globals) needs `words`.
    words = sorted(word_indices, key=word_indices.get)

    BATCH_SIZE = 1100 # find ideal value
    model = get_model()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # The %-formatting fills the static parts now; the named {...} fields are
    # left for ModelCheckpoint, which calls filepath.format(epoch=..., **logs).
    # Bare "{}" fields would make that call fail, and the metric names must
    # match the logged keys ('accuracy' / 'val_accuracy').
    file_path = "./checkpoints/LSTM-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-"\
                "loss{loss:.4f}-acc{accuracy:.4f}-val_loss{val_loss:.4f}-"\
                "val_acc{val_accuracy:.4f}.h5" %\
                (len(words), SEQUENCE_LEN, MIN_WORD_FREQUENCY)

    # metrics=['accuracy'] is logged as 'val_accuracy', not 'val_acc'.
    checkpoint = ModelCheckpoint(file_path, monitor='val_accuracy', save_best_only=True)
    print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
    callbacks_list = [checkpoint, print_callback, early_stopping]

    # Model.fit accepts Python generators directly; fit_generator is
    # deprecated in TF 2.x and absent from newer Keras releases.
    model.fit(data_generator(sentences, next_words, BATCH_SIZE),
              steps_per_epoch=int(len(sentences)/BATCH_SIZE+1),
              epochs=100,
              callbacks=callbacks_list,
              validation_data=data_generator(sentences_test, next_words_test, BATCH_SIZE),
              validation_steps=int(len(sentences_test)/BATCH_SIZE+1)
             )
    

Corpus length in characters : 1195391
Corpus length in words : 242650
Unique words before ignoring : 23642
Ignoring words with frequency < : 3
Unique words after ignoring : 6642
Ignored Sentences : 130548
Remaining Sequences : 112094


NameError: name 'on_epoch_end' is not defined