In [1]:
#importing dependencies 
import numpy 
import sys 
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential 
from keras.layers import Dense , Dropout , LSTM
from keras.utils import np_utils 
from keras.callbacks import ModelCheckpoint 



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# load data 
# loading data and opening our input data in form f a txt file 
# Project Gutenberg is where the data can be found
file = open("Frankenstein_2.txt").read()

In [3]:
#tokenization
#standardization 
#what is tokenization ? Tokenization is the process of breaking a stream of text up into word phrases symbols or other
#meaningful elements
def tokenize_words(input):
    #lowercase everything to standardise it 
    input = input.lower()
    #instantiating the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    #tokenizing the text into tokens 
    tokens = tokenizer.tokenize(input)
    #filturing the stopwords using lambda
    filtered = filter(lambda token : token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

#preprocess the input data make tokens 
processed_inputs = tokenize_words(file)

In [4]:
# to chars to numbers
# convert character in our input to numbers 
# we'll sort the list of all characters that appear in out i/p text and then use he enumerate fc 
# to get numbers that represent the characters 
# we'll then create a dictionary that stores the keys and values, or the characters and the numbers that represent them

chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c,i) for i , c in enumerate(chars))

In [5]:
# check if words to chars or chars to nums has worked ?
# just so we can get an idea of whether our process of converting to characters has worked 
# we print the length of our variables 

input_len = len(processed_inputs)
vocab_len = len(chars)
print("Total Number of characters:", input_len)
print("Total Vocab:", vocab_len)


Total Number of characters: 269995
Total Vocab: 43


In [6]:
# seg length 
# we're defining how long we want an individual sequence here
# an individual sequence is a complete mapping of input characters as integers

seq_length = 100
x_data = []
y_data = []


In [8]:
# loops through sequence 
# here we're going through the entire list of i/p and converting the chars to numbers with a for looop 
# this will create a bunch of sequence starts with the next character is the i/p data 
# beginning  with the first character 

for i in range(0, input_len - seq_length, 1):
    # define i/p and o/p sequences 
    # i/p is the current character plus the desired sequence length 
    in_seq = processed_inputs[i:i + seq_length]
    # out sequence is the initial character plus total sequence length 
    out_seq = processed_inputs[i + seq_length]
    # converting the list of characters to integer based on previous values and appending the values to our lists 
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])
    
# check to see how many total input sequences we have 
n_patterns = len(x_data)
print("Total Patterns :" , n_patterns)
    

Total Patterns : 269895


In [9]:
#convert input sequence to np array and network can use 
X = numpy.reshape(x_data , (n_patterns  ,seq_length ,1))
X = X/float(vocab_len)

In [10]:
# one - hot encoding our label data 
y = np_utils.to_categorical(y_data)

In [11]:
# creating the model 
# creating a sequential model 
# dropout is used to prevent overfititng 

model = Sequential()
model.add(LSTM(256, input_shape = (X.shape[1] , X.shape[2]),return_sequences = True))

model.add(Dropout(0.2))
model.add(LSTM(256 , return_sequences = True))
model.add(Dropout(0.2))

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))


In [12]:
#compile the model 
model.compile(loss='categorical_crossentropy',optimizer='adam')


In [13]:
#saving weights
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath , monitor ='loss' , verbose =1 ,save_best_only= True,mode='min')
desired_callbacks = [checkpoint]

In [14]:
#fit the model and let it train 
model.fit(X,y,epochs=4, batch_size=256, callbacks = desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.90671, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.90671 to 2.64079, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.64079 to 2.50679, saving model to model_weights_saved.hdf5
Epoch 4/4

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
# recompile model with the same weights 
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy' , optimizer ='adam')

In [16]:
# output of the mdoel back into characters
num_to_char = dict((i,e) for i , e in enumerate(chars))


In [17]:
# random seed to help ganerate
start = numpy.random.randint(0,len(x_data)-1)
pattern = x_data[start]
print("Random Seed :")
print("\"",''.join([num_to_char[value] for value in pattern]),"\"")

Random Seed :
" r chance led place concealment dared blast presence might unfailing aim put end existence monstrous  "


In [18]:
# generate the text 
for i in range(1000):
    x = numpy.reshape(pattern,(1,len(pattern),1))
    x = x/float(vocab_len)
    prediction = model.predict(x,verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare sear