In this notebook, we train an LSTM and a bidirectional LSTM to reproduce *the same text they are trained on*. Each network is trained on fixed-length sequences of characters, with the goal of predicting the character that follows each sequence. We then feed the networks those same sequences and hope that, in each case, they predict the correct next character. Joining the predicted characters should give a coherent body of text that resembles the original; with perfect accuracy, the output would be the same text as the original.

# Setting up the Notebook

In [6]:
# Import the necessary modules
import numpy as np
import string
import math
import random 

# NN
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional
from keras.utils import np_utils

# PDF manipulation
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# NLP
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Functions and Parameters

In [7]:
# --- Corpus source -----------------------------------------------------------
pdf_file = 'economics_textbook.pdf'  # default PDF file name used by load_data / preprocess_data
max_pages = 2                        # default page limit passed to the PDF page iterator

# --- Sequence construction ---------------------------------------------------
sequence_length = 100  # number of characters in each input sequence
window_size = 2        # NOTE(review): not referenced in the code visible here

# --- Token filtering ---------------------------------------------------------
stop_words = stopwords.words('english')  # NLTK English stop-word list
misc = ['...', "''", '--']               # NOTE(review): not referenced in the code visible here

def remove_non_ascii(text):
    """Return `text` with every character whose code point is 128 or above removed."""
    ascii_only = filter(lambda symbol: ord(symbol) < 128, text)
    return ''.join(ascii_only)

def load_data(raw_text=False, pdf_file=pdf_file, max_pages=max_pages, directory=None):
    """Extract the text of a PDF and return it as an ASCII-only string.

    Parameters
    ----------
    raw_text : bool
        If truthy, return the extracted text with only non-ASCII characters
        removed. Otherwise the text is additionally tokenized with
        `word_tokenize`, tokens found in `stop_words` are dropped (the
        comparison is case-sensitive), and the remaining tokens are joined
        with single spaces.
    pdf_file : str
        File name of the PDF; used to build the path when `directory` is
        not given.
    max_pages : int
        Passed to `PDFPage.get_pages` as `maxpages`.
    directory : str or None
        Full path of the PDF to open. Defaults to `'Data/' + pdf_file`,
        resolved at call time so that a caller-supplied `pdf_file` is
        actually honoured.

    Returns
    -------
    str
        The extracted (and, unless `raw_text`, stop-word-filtered) text.
    """
    if directory is None:
        # Resolved here rather than in the signature: a default expression is
        # evaluated once at definition time and would ignore the `pdf_file` argument.
        directory = 'Data/' + pdf_file

    return_string = StringIO()
    # The converter and the interpreter share a single resource manager.
    resource_manager = PDFResourceManager()
    device = TextConverter(resource_manager, return_string, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(resource_manager, device=device)
        with open(directory, 'rb') as pdf_handle:
            for page in PDFPage.get_pages(pdf_handle, set(), maxpages=max_pages, caching=True, check_extractable=True):
                interpreter.process_page(page)
        text_data = return_string.getvalue()
    finally:
        # Release the converter and the buffer even if parsing fails part-way.
        device.close()
        return_string.close()

    ascii_text = remove_non_ascii(text_data)
    if raw_text:
        return ascii_text

    excluded = set(stop_words)  # set membership instead of scanning the list for every token
    return ' '.join(word for word in word_tokenize(ascii_text) if word not in excluded)

def preprocess_data(sequence_length=sequence_length, max_pages=max_pages, pdf_file=pdf_file):
    """Turn the PDF text into next-character training arrays.

    The text returned by `load_data` is lowercased character by character and
    each character is replaced by its index in the sorted vocabulary. Every
    window of `sequence_length` consecutive characters becomes one input
    sample; the character immediately after the window is its target.

    Parameters
    ----------
    sequence_length : int
        Number of characters in each input sequence.
    max_pages : int
        Forwarded to `load_data`.
    pdf_file : str
        Forwarded to `load_data`.

    Returns
    -------
    x : numpy.ndarray
        Shape `(num_sequences, sequence_length, 1)`; character indices divided
        by `vocab_size`.
    y : numpy.ndarray
        One-hot targets of shape `(num_sequences, vocab_size)`.
    num_chars : int
        Length of the loaded text.
    vocab_size : int
        Number of distinct lowercased characters.
    int_dictionary : dict
        Maps an index back to its character.

    Raises
    ------
    ValueError
        If the text is not longer than `sequence_length`, so no
        (sequence, next character) pair can be formed.
    """
    text_data = load_data(max_pages=max_pages, pdf_file=pdf_file)

    # Sorted so the character <-> index mapping is the same on every run;
    # plain set iteration order for strings changes between interpreter sessions.
    characters = sorted(set(text_data.lower()))
    character_dict = {character: i for i, character in enumerate(characters)}
    int_dictionary = dict(enumerate(characters))
    num_chars, vocab_size = len(text_data), len(characters)

    if num_chars <= sequence_length:
        raise ValueError(
            'Text has %d characters; more than sequence_length=%d are needed '
            'to build at least one training example.' % (num_chars, sequence_length)
        )

    # Encode the whole text once, then slice windows out of the encoded list.
    encoded = [character_dict[character.lower()] for character in text_data]
    x = [encoded[i: i + sequence_length] for i in range(num_chars - sequence_length)]
    y = encoded[sequence_length:]  # the character that follows each window

    x = np.reshape(x, (len(x), sequence_length, 1))
    x = x / float(vocab_size)  # scale indices by the vocabulary size
    # num_classes is pinned so y always has vocab_size columns, even when the
    # highest-index character never occurs as a target.
    y = np_utils.to_categorical(y, num_classes=vocab_size)
    return x, y, num_chars, vocab_size, int_dictionary

In [8]:
# Build the training arrays and inspect their shapes and first examples
x, y, num_chars, vocab_size, int_dictionary = preprocess_data()
print('x shape', x.shape)
print('Example of a training example for x', x[0]) # So, we have 1604 sequences,
# each one containing 100 values, and each value is in brackets
print('y shape', y.shape) # Print the shape (as the label says), not the Python type of y
print('Example of a training example for y', y[0]) # One-hot encoding of the value of the next character to come in the sequence

x shape (1604, 100, 1)
Example of a training example for x [[0.73684211]
 [0.15789474]
 [0.13157895]
 [0.21052632]
 [0.68421053]
 [0.73684211]
 [0.92105263]
 [0.89473684]
 [0.73684211]
 [0.68421053]
 [0.47368421]
 [0.78947368]
 [0.47368421]
 [0.42105263]
 [0.73684211]
 [0.92105263]
 [0.78947368]
 [0.68421053]
 [0.73684211]
 [0.15789474]
 [0.92105263]
 [0.68421053]
 [0.21052632]
 [0.47368421]
 [0.97368421]
 [0.23684211]
 [0.07894737]
 [0.31578947]
 [0.68421053]
 [0.5       ]
 [0.07894737]
 [0.76315789]
 [0.84210526]
 [0.78947368]
 [0.47368421]
 [0.73684211]
 [0.13157895]
 [0.07894737]
 [0.84210526]
 [0.68421053]
 [0.05263158]
 [0.31578947]
 [0.92105263]
 [0.47368421]
 [0.73684211]
 [0.13157895]
 [0.        ]
 [0.92105263]
 [0.68421053]
 [0.05263158]
 [0.07894737]
 [0.60526316]
 [0.60526316]
 [0.07894737]
 [0.84210526]
 [0.21052632]
 [0.68421053]
 [0.47368421]
 [0.73684211]
 [0.73684211]
 [0.31578947]
 [0.13157895]
 [0.39473684]
 [0.76315789]
 [0.73684211]
 [0.13157895]
 [0.07894737]
 [0

# Neural Networks and Text Generation

## Building the RNN

In [9]:
   
def train_rnn_keras(epochs, activation, num_units):
    """Build and fit a single-layer LSTM next-character classifier.

    The training arrays are produced by calling `preprocess_data()` with its
    defaults. The network is one LSTM layer followed by a softmax Dense layer
    with one unit per column of `y`, compiled with categorical cross-entropy
    and the Adam optimizer. The model summary is printed before training.

    Parameters
    ----------
    epochs : int
        Number of training epochs.
    activation : str
        Activation passed to the LSTM layer.
    num_units : int
        Number of LSTM units.

    Returns
    -------
    (model, inputs)
        The fitted model and the reshaped input array it was trained on.

    NOTE(review): the inputs are reshaped to (samples, 1, x.shape[1]), i.e.
    each sample is one timestep carrying the whole sequence as features, so
    the LSTM does not step through the characters one by one. Confirm this
    is intended.
    """
    x, y, *_ = preprocess_data()
    n_samples, n_features = x.shape[0], x.shape[1]

    rnn_model = Sequential([
        LSTM(num_units, activation=activation, input_shape=(None, n_features)),
        Dense(y.shape[1], activation='softmax'),
    ])
    rnn_model.compile(loss='categorical_crossentropy', optimizer='adam')
    rnn_model.summary()

    _x = x.reshape(n_samples, 1, n_features)
    rnn_model.fit(_x, y, epochs=epochs, shuffle=True)

    return rnn_model, _x

## Training and Text Prediction

In [10]:
# Train the LSTM, then decode its predictions back into text

# Hyperparameters
epochs = 100
activation = 'relu'
num_units = 300

rnn_model, _x = train_rnn_keras(epochs=epochs, activation=activation, num_units=num_units)

# Generating text from neural network
# predict() gives one probability vector per input sequence; argmax picks the most likely class index
predictions = list(map(np.argmax, rnn_model.predict(_x[1:])))
# Translate every predicted index back into its character
text = [int_dictionary[index] for index in predictions]
# Concatenate the characters into a single string
print(''.join(text))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 300)               481200    
                                                                 
 dense (Dense)               (None, 38)                11438     
                                                                 
Total params: 492,638
Trainable params: 492,638
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100


## Building the BRNN

In [11]:
    

def train_brnn_keras(epochs, activation, num_units):
    """Build and fit a single-layer bidirectional LSTM next-character classifier.

    The training arrays are produced by calling `preprocess_data()` with its
    defaults. The network is one Bidirectional-wrapped LSTM layer followed by
    a softmax Dense layer with one unit per column of `y`, compiled with
    categorical cross-entropy and the Adam optimizer. The model summary is
    printed before training.

    Parameters
    ----------
    epochs : int
        Number of training epochs.
    activation : str
        Activation passed to the wrapped LSTM layer.
    num_units : int
        Number of units in the wrapped LSTM layer.

    Returns
    -------
    (model, inputs)
        The fitted model and the reshaped input array it was trained on.

    NOTE(review): the inputs are reshaped to (samples, 1, x.shape[1]), i.e.
    each sample is one timestep carrying the whole sequence as features.
    Confirm this is intended.
    """
    x, y, *_ = preprocess_data()
    n_samples, n_features = x.shape[0], x.shape[1]

    recurrent_layer = Bidirectional(LSTM(num_units, activation=activation),
                                    input_shape=(None, n_features))
    brnn_model = Sequential([
        recurrent_layer,
        Dense(y.shape[1], activation='softmax'),
    ])
    brnn_model.compile(loss='categorical_crossentropy', optimizer='adam')
    brnn_model.summary()

    _x = x.reshape(n_samples, 1, n_features)
    brnn_model.fit(_x, y, epochs=epochs, shuffle=True)

    return brnn_model, _x

## Training and Text Prediction

In [12]:
# Train the bidirectional LSTM, then decode its predictions back into text

# Hyperparameters
epochs = 100
activation = 'relu'
num_units = 300

brnn_model, _x = train_brnn_keras(epochs=epochs, activation=activation, num_units=num_units)

# Generating text from neural network
# predict() gives one probability vector per input sequence; argmax picks the most likely class index
predictions = list(map(np.argmax, brnn_model.predict(_x[1:])))
# Translate every predicted index back into its character
text = [int_dictionary[index] for index in predictions]
# Concatenate the characters into a single string
print(''.join(text))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 600)              962400    
 l)                                                              
                                                                 
 dense_1 (Dense)             (None, 38)                22838     
                                                                 
Total params: 985,238
Trainable params: 985,238
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/1