# THIS IS STILL IN PROGRESS

# Introduction

In this notebook, we're going to be looking at how you can generate text that is similar to the input text that you give the network. This work is inspired by the great [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) by Andrej Karpathy and most of the code is adapted from [this tutorial](http://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/). The network that we're going to be building is a **character level recurrent neural network**.

In [94]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Load in Input Text

We'll be loading in a book from Thomas Paine, obtained from this [website](http://www.textfiles.com/etext/NONFICTION/). We'll need to clean up the sentences in the file, mainly removing the extra spaces.

In [9]:
# Removes punctuation, parentheses, question marks, etc., and leaves only alphanumeric characters
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("       ", " ")
    string = string.lower().replace("     ", " ")
    string = string.lower().replace("    ", " ")
    string = string.lower().replace("   ", " ")
    string = string.lower().replace("  ", " ")
    string = string.lower().replace("   ", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [10]:
allText = ""
with open("Data/paine.txt", "r", encoding='utf-8') as f:
    lines=f.readlines()
    numWords = 0
    for line in lines:
        allText += (cleanSentences(line))
        numWords += len(line.split())
    chars = sorted(list(set(allText)))
    print (numWords)

7280


In [11]:
nChars = len(allText)
nVocab = len(chars)
seqLength = 100
print (nChars)
print (nVocab)

40064
37


In [12]:
chars

[' ',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [98]:
charToInt = dict((c, i) for i, c in enumerate(chars))

In [99]:
# prepare the dataset of input to output pairs encoded as integers
dataX = []
dataY = []
for i in range(0, nChars - seqLength, 1):
    seq_in = allText[i:i + seqLength]
    seq_out = allText[i + seqLength]
    dataX.append([charToInt[char] for char in seq_in])
    dataY.append(charToInt[seq_out])
nExamples = len(dataX)
print ("Total Examples: ", nExamples)

Total Examples:  39964


In [100]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (nExamples, seqLength, 1))
# normalize
X = X / float(nVocab)
# one hot encode the output variable
y = np.zeros([nExamples, nVocab])
for i, example in enumerate(dataY):
    lis = np.zeros(nVocab)
    lis[example] = 1
    y[i] = lis

# Tensorflow Model

In [101]:
batchSize = 24
lstmUnits = 24
iterations = 10000
numDimensions = 1
numClasses = nVocab

In [102]:
import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.float32, [batchSize, seqLength, numDimensions])

lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.85)
value, _ = tf.nn.dynamic_rnn(lstmCell, input_data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

# Training

In [103]:
import datetime

sess = tf.InteractiveSession()
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

In [104]:
from random import randint
def getTrainBatch():
    num = randint(0,nExamples - batchSize - 1)
    labels = y[num:num+batchSize]
    arr = X[num:num+batchSize]
    return arr, labels

In [105]:
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
    #Write summary to Tensorboard
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)

writer.close()

KeyboardInterrupt: 

# What You Can Do (IN PROGRESS)

Really cool stuff right? Now that you know about the model, and how it works, you can try this generate new text based on your own datasets. In order to try the model with your own datasets, save the txt file in the Data folder, and then 