In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time
import keras
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Load the training corpus from a plain-text file into one unicode string.
# The __future__ import has no effect on Python 3; it stays the first statement
# of the cell, where such imports are required to appear.
from __future__ import unicode_literals
# Read raw bytes and decode explicitly as UTF-8 so newlines are kept exactly
# as stored; the `with` block closes the file handle as soon as the read is done.
with open('./The_Foundation_Series.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [3]:
# Train on only the leading 50,000 characters of the corpus,
# then display how many characters were actually kept.
text = text[0:50000]
n_text = len(text)
n_text

50000

In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order,
# followed by the vocabulary size (displayed as the cell output).
chars = list(set(text))
chars.sort()
n_chars = len(chars)
n_chars

77

In [5]:
# Lookup table from each character to its position in the sorted vocabulary
map_char_int = {char: position for position, char in enumerate(chars)}

In [7]:
# Slice the corpus into overlapping training examples: each input is a window
# of seq_length character ids, and its target is the id of the character that
# immediately follows that window.
seq_length = 100
# Encode the corpus once up front instead of re-mapping every character for
# each of the windows that contain it.
encoded_text = [map_char_int[char] for char in text]
dataX = []
dataY = []
for i in range(n_text - seq_length):
    # Slicing yields a fresh list per window, so the entries never share storage.
    dataX.append(encoded_text[i:i + seq_length])
    dataY.append(encoded_text[i + seq_length])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  49900


In [8]:
# Inputs: arrange the integer windows as [samples, time steps, features] and
# scale every character id by the vocabulary size.
X = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_chars)

# Targets: one-hot encode the next-character ids in two stages.
data = dataY
values = np.array(data)
# Stage 1 - LabelEncoder renumbers the distinct target ids as consecutive
# integers; the result is shaped as a single column for the next stage.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values).reshape(-1, 1)
# Stage 2 - OneHotEncoder expands that column into a dense indicator matrix
# with one column per distinct target.
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [10]:
# Character-level network: a 256-unit LSTM reads one window shaped
# (time steps, features), dropout with rate 0.2 is applied to its output, and
# a softmax layer produces one score per one-hot target column.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(onehot_encoded.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [11]:
# Write a weights file each time the training loss reaches a new minimum;
# the epoch number and loss value are embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [12]:
# Train for 20 epochs in mini-batches of 128 windows; the checkpoint callback
# runs during training.
model.fit(
    X,
    onehot_encoded,
    batch_size=128,
    epochs=20,
    callbacks=callbacks_list,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/20
  896/49900 [..............................] - ETA: 6:05 - loss: 4.1594

KeyboardInterrupt: 

In [40]:
 filename = "weights-improvement-06-2.8109.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [50]:
# Reverse lookup: vocabulary position -> character
int_to_char = dict(enumerate(chars))

In [54]:
# Generate text greedily: seed with a random training window, then repeatedly
# predict the next character, print it, and slide the window forward by one.

# randint excludes its upper bound, so len(dataX) makes every window eligible.
start = np.random.randint(0, len(dataX))
# Work on a copy: appending to the dataX entry itself would lengthen that
# training example in place.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(10000):
    # Same preprocessing as the training inputs: [samples, time steps,
    # features], scaled by the vocabulary size.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_chars)
    prediction = model.predict(x, verbose=0)
    # The output columns follow label_encoder's class order, which differs
    # from the vocabulary positions whenever some character never occurs as
    # a training target; translate the winning column back to a character id.
    index = int(label_encoder.classes_[np.argmax(prediction)])
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: add the new id at the end and drop the oldest one.
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
" You are going to sign a contract to do a novel. What's more, we're 
going to give you a $50,000 adva "
 te the toe toe toe toe tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee tee toe tee t

KeyboardInterrupt: 