In [2]:
# importing dependencies
import sys

import nltk
import numpy
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense ,Dropout, LSTM
from keras.models import Sequential
from keras.models import load_model
from keras.utils import np_utils
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# download the NLTK stopword corpus
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Using TensorFlow backend.


In [3]:
#load data
#loading data and opening our input data in the form of txt file
# The with-block closes the file handle as soon as the text has been read.
# NOTE(review): assumes frankenstein.txt is UTF-8 encoded — confirm.
with open("frankenstein.txt", encoding="utf-8") as source_file:
    file = source_file.read()

In [4]:
#tokenization
#standardization
def tokenize_words(input):
    """Lowercase the text, split it into word tokens and drop English stopwords.

    Args:
        input: the raw text as a single string.

    Returns:
        The remaining tokens concatenated into one string, with no separator
        between consecutive tokens.
    """
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Build the stopword set once: evaluating stopwords.words('english') inside
    # the filter rebuilt the list for every token and searched it linearly.
    english_stopwords = set(stopwords.words('english'))
    filtered = (token for token in tokens if token not in english_stopwords)
    # NOTE(review): tokens are joined without a separator, so word boundaries
    # are not present in the returned string — confirm this is intended.
    return "".join(filtered)
    
processed_inputs = tokenize_words(file)
#chars to numbers
# every distinct character, taken in sorted order, is assigned its position
# in that order as its integer id
chars = sorted(set(processed_inputs))
char_to_num = {character: number for number, character in enumerate(chars)}

In [5]:
# sanity check: report the length of the processed text and the number of
# distinct characters found in it
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:",vocab_len)

Total number of characters: 220857
Total vocab: 42


In [6]:
# seq length
# length of each input window, plus the two lists that will collect the
# encoded inputs and their targets
seq_length = 100
x_data, y_data = [], []

In [7]:
#loop through the sequence
# Start from empty lists so that re-running this cell does not append a second
# copy of every pattern to x_data / y_data.
x_data = []
y_data = []
for i in range(input_len - seq_length):
    # seq_length consecutive characters form the input ...
    in_seq = processed_inputs[i:i + seq_length]
    # ... and the character that follows them is the target
    out_seq = processed_inputs[i + seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns = len(x_data)
print ("Total Patterns:",n_patterns)

Total Patterns: 220757


In [8]:
#convert input sequence to np array and so on
# reshape to (n_patterns, seq_length, 1) and divide the integer ids by the
# vocabulary size
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [9]:
# one-hot coding
# Fix the width at vocab_len instead of letting it be inferred from the largest
# label present, so the target width always matches the character vocabulary
# even if the highest-numbered character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [11]:
# creating the model
# Three stacked LSTM layers, each followed by dropout, ending in a softmax
# layer with one unit per column of the one-hot targets.
model = Sequential([
    # input shape is (timesteps, features), read from X
    LSTM(100, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(100, return_sequences=True),
    Dropout(0.2),
    # last recurrent layer: return_sequences is left at its default
    LSTM(50),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [13]:
# compile the model
# categorical cross-entropy matches the one-hot targets and softmax output
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [14]:
# saving weights
# checkpoint callback: monitors the training loss and saves to filepath only
# when the loss improves on the best value so far
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [15]:
# fit model and let it train
# this is the slow step: 4 epochs in batches of 100, with the checkpoint
# callback invoked during training
model.fit(X, y, epochs=4, batch_size=100, callbacks=desired_callbacks)

W0528 10:03:33.824648 140578850780992 deprecation.py:323] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/4

Epoch 00001: loss improved from inf to 2.92828, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.92828 to 2.90511, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.90511 to 2.86915, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.86915 to 2.83582, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7fda80934cc0>

In [21]:
# recompile model with the saved weights
# load the checkpoint written during training, then compile with the same
# loss and optimizer that were used for training
filename = 'model_weights_saved.hdf5'
model = load_model(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [22]:
#output of model back into characters
# inverse lookup: integer id -> character, using the same ordering as chars
num_to_char = dict(enumerate(chars))

In [25]:
# random seed to help generate
# randint's upper bound is exclusive, so passing len(x_data) lets every
# pattern, including the last one, be selected
start = numpy.random.randint(0, len(x_data))
# take a copy: later in-place changes to pattern must not alter the
# pattern stored in x_data
pattern = list(x_data[start])
print("Random Seed: ")
print("\"",' '.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" n u r s e t r u e s e l d o m c a m e s e e a l t h o u g h a r d e n t l y d e s i r e d r e l i e v e s u f f e r i n g s e v e r y h u m a n c r e a t u r e w i s h p r e s e n t a g o n i e s m i "


In [26]:
# generate the text
for i in range(1000):
    # shape the current window as (1, timesteps, 1) and divide by the
    # vocabulary size
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # greedy choice: position of the largest predicted value
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # slide the window by building a new list (drop the oldest id, add the
    # predicted one) rather than appending to the existing list in place
    pattern = pattern[1:] + [index]

ngeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee

In [None]:
# disappointing output: the model was trained for too few epochs