In [5]:
#import libraries
import tensorflow as tf
from tensorflow.python.keras.models import Sequential,Model
from tensorflow.python.keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM, Activation
from tensorflow.python.keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping

import numpy as np
import random
import os

tf.test.is_gpu_available()
#from tensorflow.python.client import device_lib
#device_lib.list_local_devices()

True

In [2]:
def preprocessing(directory='data/input.txt', min_word_frequency=6):
    """Read the corpus, split it into word tokens and build the vocabulary.

    The text is lower-cased and every newline is padded with spaces so that
    '\n' survives the split on ' ' as a token of its own.

    Args:
        directory: Path of the text corpus to read.
        min_word_frequency: Words occurring fewer times than this are left
            out of the vocabulary and reported in ``ignored_words``.

    Returns:
        A tuple ``(word_indices, indices_word, text_in_words, ignored_words,
        min_word_frequency)`` where ``word_indices`` maps each kept word to
        its index in the sorted vocabulary, ``indices_word`` is the inverse
        mapping, ``text_in_words`` is the full token list (ignored words
        included) and ``ignored_words`` is the set of rare words.
    """
    # The context manager closes the handle even if reading fails.
    with open(directory) as corpus_file:
        text = corpus_file.read().lower().replace('\n', ' \n ')
    print('Corpus length in characters : {}'.format(len(text)))

    # Drop empty / whitespace-only fragments but keep the newline token.
    text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
    print('Corpus length in words : {}'.format(len(text_in_words)))

    # Calculate word frequencies
    word_freq = {}
    for w in text_in_words:
        word_freq[w] = word_freq.get(w, 0) + 1

    ignored_words = {w for w, count in word_freq.items() if count < min_word_frequency}

    print("Unique words before ignoring : {}".format(len(word_freq)))
    print("Ignoring words with frequency < : {}".format(min_word_frequency))
    # Sorting gives every run the same word -> index assignment.
    words = sorted(set(word_freq) - ignored_words)
    print("Unique words after ignoring : {}".format(len(words)))

    word_indices = {word: i for i, word in enumerate(words)}
    indices_word = {i: word for i, word in enumerate(words)}

    return word_indices, indices_word, text_in_words, ignored_words, min_word_frequency

In [3]:
#Creating and Filtering the Sequence
def filter_and_sequence(text_in_words, ignored_words, sequence_len=10, step=1):
    """Cut the token stream into fixed-length training windows.

    A window of ``sequence_len`` input words plus the word that follows it is
    kept only when none of those ``sequence_len + 1`` words is ignored.

    Args:
        text_in_words: The corpus as a list of word tokens.
        ignored_words: Iterable of words that disqualify a window.
        sequence_len: Number of input words per sequence.
        step: Stride between the starts of consecutive windows.

    Returns:
        A tuple ``(sentences, next_words, sequence_len)``; ``sentences[i]`` is
        a list of ``sequence_len`` words and ``next_words[i]`` the word that
        follows it in the text.
    """
    # Build the lookup set once instead of a fresh set for every window.
    ignored_lookup = set(ignored_words)

    sentences = []
    next_words = []
    ignored = 0
    for start in range(0, len(text_in_words) - sequence_len, step):
        # The range bound guarantees this slice holds sequence_len + 1 words.
        window = text_in_words[start:start + sequence_len + 1]
        if ignored_lookup.isdisjoint(window):
            sentences.append(window[:sequence_len])
            next_words.append(window[sequence_len])
        else:
            ignored += 1

    print('Ignored Sentences : {}'.format(ignored))
    print('Remaining Sequences : {}'.format(len(sentences)))

    return sentences, next_words, sequence_len

In [4]:
# Training and Test Set shuffling
def shuffle_and_split(sentences, next_words):
    """Shuffle (sentence, next word) pairs together and split them 90/10.

    The input lists are left untouched; four new lists are returned.

    Args:
        sentences: List of input word sequences.
        next_words: List of target words, aligned with ``sentences``.

    Returns:
        A tuple ``(sentences_train, next_words_train, sentences_test,
        next_words_test)``. The first 90% of the shuffled pairs (rounded
        down) form the training part, the remainder the test part.
    """
    # Pair the items up so one shuffle keeps inputs and targets aligned.
    combined = list(zip(sentences, next_words))
    random.shuffle(combined)

    #train test split ratio is 90-10
    split_at = int(len(combined) * 0.9)
    train_pairs = combined[:split_at]
    test_pairs = combined[split_at:]

    # Comprehensions (rather than zip(*pairs)) also work for an empty split.
    sentences_train = [sentence for sentence, _ in train_pairs]
    next_words_train = [target for _, target in train_pairs]
    sentences_test = [sentence for sentence, _ in test_pairs]
    next_words_test = [target for _, target in test_pairs]

    return sentences_train, next_words_train, sentences_test, next_words_test

In [5]:
#Build the model
def get_model(dropout=0.2):
    """Build the bidirectional-LSTM next-word classifier.

    Reads the module-level ``SEQUENCE_LEN`` and ``words`` to size the input
    and output layers.

    Args:
        dropout: Dropout rate applied to the recurrent layer's output.

    Returns:
        An uncompiled ``Sequential`` model taking one-hot input of shape
        ``(SEQUENCE_LEN, len(words))`` and producing a softmax over
        ``len(words)`` classes.
    """
    vocab_size = len(words)

    model = Sequential()
    model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, vocab_size)))
    model.add(Dropout(dropout))
    # Project the recurrent features onto the vocabulary; without this layer
    # the softmax would span the LSTM units and could not match the
    # (batch, len(words)) one-hot targets.
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    return model

In [6]:
# Data Generator
def data_generator(sentence_list, next_word_list, batch_size):
    """Endlessly yield one-hot encoded ``(x, y)`` batches.

    Reads the module-level ``SEQUENCE_LEN``, ``words`` and ``word_indices``.
    Samples are taken in order and wrap around to the start of
    ``sentence_list`` once the end is reached.

    Args:
        sentence_list: List of word sequences used as model input.
        next_word_list: Target word for each entry of ``sentence_list``.
        batch_size: Number of samples per yielded batch.

    Yields:
        ``(x, y)`` boolean arrays of shape
        ``(batch_size, SEQUENCE_LEN, len(words))`` and
        ``(batch_size, len(words))``.

    Raises:
        ValueError: If ``sentence_list`` is empty.
    """
    n_samples = len(sentence_list)
    if n_samples == 0:
        raise ValueError('sentence_list must not be empty')

    vocab_size = len(words)
    index = 0
    while True:
        # Builtin bool replaces the np.bool alias removed from recent NumPy.
        x = np.zeros((batch_size, SEQUENCE_LEN, vocab_size), dtype=bool)
        y = np.zeros((batch_size, vocab_size), dtype=bool)
        for i in range(batch_size):
            position = index % n_samples
            for t, w in enumerate(sentence_list[position]):
                x[i, t, word_indices[w]] = 1
            y[i, word_indices[next_word_list[position]]] = 1
            index = index + 1
        # Must sit inside the loop: one batch is handed out per iteration.
        yield x, y

In [7]:
def on_epoch_end(epoch, logs=None):
    """Write sample generated text to ``examples_file`` after an epoch.

    Intended as the ``on_epoch_end`` hook of a Keras ``LambdaCallback``, which
    invokes it with the epoch index and the logs dictionary.

    Reads the module-level ``examples_file``, ``sentences``,
    ``sentences_test``, ``SEQUENCE_LEN``, ``words``, ``word_indices``,
    ``indices_word``, ``model`` and ``sample``.

    Args:
        epoch: Index of the epoch that just finished (written to the header).
        logs: Metrics passed by Keras; not used here.
    """
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    # (concatenate the two lists once instead of once per use)
    all_sentences = sentences + sentences_test
    seed_index = np.random.randint(len(all_sentences))
    seed = all_sentences[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        # Work on a copy so every diversity starts from the same seed.
        sentence = list(seed)
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for _ in range(50):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, SEQUENCE_LEN, len(words)))
            for t, word in enumerate(sentence):
                x_pred[0, t, word_indices[word]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            # NOTE(review): `sample` is not defined in the visible cells;
            # it is called as sample(preds, diversity) and must return a
            # key of indices_word - confirm it is defined before training.
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            # Slide the window: drop the oldest word, append the new one.
            sentence = sentence[1:] + [next_word]

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()

In [8]:
def set_gpu(device_id="6"):
    """Point CUDA at a single GPU through environment variables.

    Sets ``CUDA_DEVICE_ORDER`` to ``PCI_BUS_ID`` and ``CUDA_VISIBLE_DEVICES``
    to ``device_id``.

    Args:
        device_id: Value for ``CUDA_VISIBLE_DEVICES``; converted with
            ``str()`` so an integer index is accepted as well.
    """
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)

In [9]:
if __name__=="__main__":
    # Training entry point: prepare the data, build the model and fit it
    # with checkpointing, per-epoch text samples and early stopping.

    # exist_ok makes the separate isdir() check unnecessary.
    os.makedirs('./checkpoints/', exist_ok=True)

    # NOTE(review): is_gpu_available() runs before the CUDA_* variables are
    # set, so the GPU selection may come too late to take effect - confirm.
    if tf.test.is_gpu_available():
        set_gpu()

    word_indices, indices_word, text_in_words, ignored_words, MIN_WORD_FREQUENCY = preprocessing()
    sentences, next_words, SEQUENCE_LEN= filter_and_sequence(text_in_words, ignored_words)
    sentences, next_words, sentences_test, next_words_test = shuffle_and_split(sentences, next_words)

    # The one-hot width must match word_indices, i.e. the vocabulary without
    # the ignored words; set(text_in_words) would also count the rare words
    # that never receive an index.
    words=sorted(word_indices)

    # hyperparameter
    BATCH_SIZE=1100
    model = get_model()
    # 'acc' is used as the metric name so the logged keys are 'acc'/'val_acc',
    # matching the checkpoint template and the monitored quantity below.
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])

    # %-formatting fills the corpus statistics now; the named {...} fields
    # are left for ModelCheckpoint, which formats the path with the epoch
    # number and the logged metrics.
    file_path="./checkpoints/LSTM-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-"\
              "loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}"%\
                (len(words), SEQUENCE_LEN, MIN_WORD_FREQUENCY)

    checkpoint = ModelCheckpoint(file_path,monitor='val_acc',save_best_only=True)
    print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
    early_stopping=EarlyStopping(monitor='val_acc',patience=5)
    callbacks_list=[checkpoint,print_callback,early_stopping]

    # on_epoch_end writes its samples to the module-level `examples_file`;
    # keep it open for the whole training run and close it afterwards.
    with open('./checkpoints/examples.txt','w') as examples_file:
        model.fit_generator(data_generator(sentences,next_words,BATCH_SIZE),
                            steps_per_epoch=int(len(sentences)/BATCH_SIZE+1),
                            epochs=100,
                            callbacks=callbacks_list,
                            validation_data=data_generator(sentences_test,next_words_test,BATCH_SIZE),
                            validation_steps=int(len(sentences_test)/BATCH_SIZE+1)
                           )
    

Corpus length in characters : 1195391
Corpus length in words : 242650
Unique words before ignoring : 23642
Ignoring words with frequency < : 6
Unique words after ignoring : 3347
Ignored Sentences : 192984
Remaining Sequences : 49656
Epoch 1/100


W1029 14:18:05.722356 140015003514624 __init__.py:321] Limited tf.compat.v2.summary API due to missing TensorBoard installation.


KeyboardInterrupt: 

In [None]:
import tensorflow as tf
# A notebook cell only displays its last expression, so print both values
# to make the TensorFlow version visible alongside the GPU check.
print(tf.__version__)
print(tf.test.is_gpu_available())