In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time
import keras
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Importing a txt file
from __future__ import unicode_literals
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# guarantees the file handle is closed even if reading or decoding fails.
with open('./The_Foundation_Series.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [67]:
# Limit the corpus to its first 50,000 characters; this is the text used for training.
text = text[0:50000]
# Length of the (possibly shortened) corpus, displayed as the cell output.
n_text = len(text)
n_text

50000

In [68]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = list(set(text))
chars.sort()
# Vocabulary size, displayed as the cell output.
n_chars = len(chars)
n_chars

77

In [69]:
# Lookup table: character -> integer id (its position in the sorted vocabulary).
map_char_int = {c: i for i, c in enumerate(chars)}

In [70]:
# Length (in characters) of each input window fed to the network.
seq_size = 100

# Encode the whole corpus once. Each window below is then a plain slice of this
# list, instead of re-looking-up every character in map_char_int for each of
# the ~seq_size windows it belongs to.
encoded_text = [map_char_int[char] for char in text]

# input_seq[k] holds the ids of characters k .. k+seq_size-1;
# target_char[k] holds the id of the character that follows that window.
input_seq = []
target_char = []
for i in range(n_text - seq_size):
    # Slicing yields a fresh list per window, so windows never share storage.
    input_seq.append(encoded_text[i:i + seq_size])
    target_char.append(encoded_text[i + seq_size])
n_patterns = len(input_seq)
print("Total Patterns: ", n_patterns)

Total Patterns:  49900


In [71]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint

In [72]:
# reshape X to be [samples, time steps, features]
# (one feature per time step: the integer id of the character)
X = np.reshape(input_seq, (n_patterns, seq_size, 1))
# normalize: scale the integer ids into the range [0, 1)
X = X / float(n_chars)


# one hot encode the output variable
# Row k of the identity matrix is the one-hot vector for vocabulary id k, so
# indexing np.eye(n_chars) with the target ids gives a (n_patterns, n_chars)
# float matrix. The width is fixed to the full vocabulary: column j always means
# chars[j], even if some character never appears as a target. (Fitting
# LabelEncoder/OneHotEncoder on the targets would instead renumber the classes
# from whichever ids happen to occur, which can silently misalign the network's
# outputs with the id -> character table used when generating text.)
values = np.array(target_char, dtype=int)
onehot_encoded = np.eye(n_chars)[values]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [73]:
# Re-display the vocabulary size for reference before the model is defined.
n_chars

77

In [79]:
# Network: a single 128-unit LSTM over the input window, dropout of 0.2, then a
# softmax layer with one output per one-hot target column.
# (An Embedding front-end was tried and is left disabled:)
#model.add(Embedding(n_chars,256, input_length = len(input_seq[0])))
window_shape = (X.shape[1], X.shape[2])
n_outputs = onehot_encoded.shape[1]
model = Sequential([
    LSTM(128, input_shape=window_shape),
    Dropout(0.2),
    Dense(n_outputs, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [80]:
# Print Keras' layer-by-layer summary of the model (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_13 (LSTM)               (None, 128)               66560     
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 77)                9933      
Total params: 76,493
Trainable params: 76,493
Non-trainable params: 0
_________________________________________________________________


In [81]:
# Checkpoint file name template; Keras fills in the epoch number and the loss.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# Monitor the training loss and write a checkpoint only when it reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [82]:
# input_seq2 = []
# list1 = []
# for i in input_seq:
#     for j in i:
#         list1.append(j)
#         input_seq2.append(list1)
#         list1 = []

In [83]:
# Train for 20 epochs in mini-batches of 128, checkpointing through callbacks_list.
model.fit(
    X,
    onehot_encoded,
    batch_size=128,
    epochs=20,
    callbacks=callbacks_list,
)

Epoch 1/20

Epoch 00001: loss improved from inf to 3.21496, saving model to weights-improvement-01-3.2150.hdf5
Epoch 2/20

Epoch 00002: loss improved from 3.21496 to 3.08863, saving model to weights-improvement-02-3.0886.hdf5
Epoch 3/20

Epoch 00003: loss improved from 3.08863 to 2.96247, saving model to weights-improvement-03-2.9625.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.96247 to 2.91793, saving model to weights-improvement-04-2.9179.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.91793 to 2.89009, saving model to weights-improvement-05-2.8901.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.89009 to 2.87261, saving model to weights-improvement-06-2.8726.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.87261 to 2.85189, saving model to weights-improvement-07-2.8519.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.85189 to 2.83323, saving model to weights-improvement-08-2.8332.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.83323 to 2.81044, saving model to weig

<keras.callbacks.History at 0x1b74f7ba90>

In [84]:
 filename = "weights-improvement-20-2.5819.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [85]:
# Inverse lookup table: integer id -> character (position in the sorted vocabulary).
int_to_char = dict(enumerate(chars))

In [88]:
# Pick a random seed window. randint's upper bound is exclusive, so passing
# len(input_seq) makes every window (including the last one) eligible and also
# works when there is only a single window.
start = np.random.randint(0, len(input_seq))
# Work on a copy: the loop appends to the window, and appending to the original
# list would permanently lengthen input_seq[start] and break any later rebuild
# of the training array from input_seq.
pattern = list(input_seq[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(10000):
    # Shape the current window like a single training sample and scale it the
    # same way the training inputs were scaled.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_chars)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = int(np.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window forward by one character: add the prediction, drop the oldest id.
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
" e series, 
where the series, to qualify, had to consist of at least three connected novels. It was t "
oe tore to te tee sore to the soet  



   The 
oonee toe tooe th the tore  ao  and to toe to toe te the sere  




   The woie toe tore to the  ao ao toe toee   


   The woie tee  aod to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe to th the sooe  




   The woee the  aor to the toe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to toe toe  ao the tooe  ao  and to 

KeyboardInterrupt: 