In [1]:
#importing dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#load data
#read the whole input text file into memory as a single string
#Project Gutenberg is where the data can be found
#a context manager is used so the file handle is closed as soon as the text has been read
with open(r"C:\Users\aisha\Downloads\gutenberg_metadata.txt", encoding="utf8") as source_file:
    file = source_file.read()

In [3]:
#tokenization
#standardization
def tokenize_words(input):
    """Standardize raw text and strip English stopwords.

    The text is lowercased, split into runs of word characters (regex
    ``\\w+``), and every token found in NLTK's English stopword list is
    dropped.

    Args:
        input: the raw text as a single string.

    Returns:
        The surviving tokens joined by single spaces, as one string.
    """
    #split the lowercased text into word-character tokens
    words = RegexpTokenizer(r'\w+').tokenize(input.lower())
    #a set gives constant-time membership checks while filtering
    english_stopwords = set(stopwords.words('english'))
    kept = filter(lambda word: word not in english_stopwords, words)
    return " ".join(kept)

#preprocess the input data: lowercase it, tokenize it and drop the stopwords
#the result is one long space-separated string, not a list of tokens
processed_inputs = tokenize_words(file)

In [4]:
#chars to numbers
#the network works on integers, so every distinct character of the processed text gets a number
#chars is the sorted list of unique characters; a character's position in that list is its number
#char_to_num is the lookup table from character to that number

chars = sorted(set(processed_inputs))
char_to_num = {character: number for number, character in enumerate(chars)}


In [5]:
#sanity check that the character-to-number conversion has worked
#input_len is the number of characters in the processed text
#vocab_len is the number of distinct characters (the size of the vocabulary)
input_len = len(processed_inputs)
vocab_len = len(chars)
print("Total number of characters:", input_len)
print("Total number of vocab:" , vocab_len)


Total number of characters: 1233402
Total number of vocab: 122


In [6]:
#seq length
#seq_length is how many characters (as integers) make up one input sequence
seq_length = 100
#x_data will hold the input sequences, y_data the character that follows each of them
x_data, y_data = [], []

In [7]:
#build the training patterns
#slide a window of seq_length characters over the text, moving one character at a time:
#each window (as integers) is one input pattern and the character right after it is its target

#convert the text to numbers once, instead of looking every character up again
#for each of the overlapping windows it belongs to
encoded = [char_to_num[char] for char in processed_inputs]

#the lists are rebuilt from scratch here so that re-running this cell
#does not append a second copy of every pattern
n_windows = max(input_len - seq_length, 0)
#i/p is the window of seq_length characters starting at position i (each one its own list)
x_data = [encoded[i:i + seq_length] for i in range(n_windows)]
#o/p is the character that immediately follows that window
y_data = encoded[seq_length:seq_length + n_windows]

#check to see how many i/p sequences we have
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 1233302


In [8]:
#convert the input sequences to a numpy array of shape (n_patterns, seq_length, 1)
#and divide the character numbers by the vocabulary size to scale them down
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)


In [9]:
#one-hot encoding of the target characters
#num_classes is pinned to the vocabulary size: without it the width is inferred from the
#largest target value, which is only correct if the highest-numbered character occurs as a target
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [10]:
#creating the model
#a sequential stack of three LSTM layers, each followed by dropout to prevent overfitting
#the first two LSTMs return the full sequence so the next LSTM has a sequence to read
#the final dense softmax layer has one output per column of the one-hot targets
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [11]:
#compile the model
#categorical cross-entropy matches the one-hot targets and the softmax output layer
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [12]:
#saving weights
#the checkpoint watches the training loss and writes the model to filepath
#only when the loss is lower than the best seen so far
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
desired_callbacks = [checkpoint]

In [13]:
#fit the model and let it train
#one pass over the data in batches of 256; the checkpoint callback saves the model
#whenever the training loss improves
model.fit(X, y, epochs=1, batch_size=256, callbacks=desired_callbacks)

Epoch 1: loss improved from inf to 2.24822, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x1e99e6fb8b0>

In [16]:
#reload the saved weights and recompile the model
#filename is the same file the checkpoint callback writes to during training
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
#same loss and optimizer as the original compile step
model.compile(loss='categorical_crossentropy', optimizer ='adam')

In [17]:
#turn the output of the model back into characters
#num_to_char is the reverse lookup: position in the sorted character list -> character
num_to_char = dict(enumerate(chars))

In [21]:
#pick a random training pattern to seed the text generation
#randint excludes its upper bound, so len(x_data) lets every pattern, including the last, be chosen
start = numpy.random.randint(0, len(x_data))
#work on a copy so that later changes to the pattern never alter the stored training sequence
pattern = list(x_data[start])
print("Random Seed:")
print("\"",''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" lantyne http www gutenberg org ebooks 21711 times peril tale india g henty http www gutenberg org eb "


In [22]:
#generate the text
#repeatedly predict the next character from the current window, print it,
#then slide the window forward by one character
for _ in range(1000):
    #shape the window as a single sample and scale it the same way as the training data
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x/float(vocab_len)
    prediction = model.predict(x, verbose=0)
    #take the most probable character; a plain int keeps the pattern a list of python ints
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    #drop the oldest character and add the new one, building a new list
    #so the sequence the seed was taken from is never modified in place
    pattern = pattern[1:] + [index]

ooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenberg org ebooks 1111  sore sare sare hart http www gutenber