In [1]:
import random
import sys
import os

import re
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import LambdaCallback


from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional
from tensorflow.keras.layers import LSTM

%matplotlib inline

# import a custom text data preparation class
# !wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-4-Sprint-3-Deep-Learning/main/module1-rnn-and-lstm/data_cleaning_toolkit_class.py
# from data_cleaning_toolkit_class import data_cleaning_toolkit

## Adjusted toolkit code


In [2]:
class data_cleaning_toolkit(object):

    def __init__(self):
        """
        This class host several data cleaning and preperation methods that are used to prepare text data
        for a text generation model, specifically for the LSTM.
        """

        # TODO: provide descriptions of each variable
        self.sequences = None
        self.next_char = None
        self.n_features = None
        self.unique_chars = None
        self.maxlen = None
        self.char_int = None
        self.int_char = None
        self.sequences = None
        self.next_char = None

    def clean_data(self, doc):
        """
        Accepts a single text document and performs several regex substitutions in order to clean the document.

        Note
        ----
        Don't forget about online regex editors such as this one -  https://regex101.com/

        Parameters
        ----------
        doc: string or object

        Returns
        -------
        doc: string or object
        """

        # order of operations - apply the expression from top to bottom
        date_regex = r"\d+/\d+/\d+" # remove dates in the format 00/00/0000
        punct_regex = r"[^0-9a-zA-Z\s]" # any non-alphanumeric chars
        special_chars_regex = r"[\$\%\&\@\n+]" # any speical chars
        numerical_regex = r"\d+" # any remianing digits
        multiple_whitespace = " {2,}" # any 2 or more consecutive white spaces (don't strip single white spaces!)

        doc = re.sub(date_regex, "", doc)
        doc = re.sub(punct_regex, "", doc)
        doc = re.sub(special_chars_regex, " ", doc)
        doc = re.sub(numerical_regex, "", doc)
        doc = re.sub(multiple_whitespace, "", doc)

        # apply case normalization
        return doc.lower()

    def create_char_sequences(self, data, maxlen = 20, step = 5):
        """
        Creates numerically encoded text sequences for model input and encoded chars
        for what the model should predict next.

        This method needs to be used prior to calling def create_X_and_Y()

        Parameters
        ----------
        data: list of strings
            This is our list of documents

        maxlen: int
            This is the maximum length for the numerically encoded documents

        step: int
            Determines how many characters to skip before picking a starting index
            to generate the next input sequence.

            Example
            -------
            If the sequence is "I love big and fluffy dogs!"
            Then maxlen = 6 step = 5 would chop up the following sequences

            "I love", "ve big", " and f", "fluffy", and so on ...

            Notice that <maxlen> is the size of char seqeunce
            Notice that <step> is the starting index for creating the next char sequence

        Returns
        -------
        None
        """

        # this valueof maxlen will be used in def create_X_and_Y() method
        self.maxlen = maxlen

        # Encode Data as Chars

        # join all text data into a single string
        text = " ".join(data)

        # get unique characters
        self.unique_chars = list(set(text))

        # our text gen model will treat every unique char as a possible feature to predict
        self.n_features = len(self.unique_chars)

        # Lookup Tables
        # keys are chars
        # vals are integers
        self.char_int = {c:i for i, c in enumerate(self.unique_chars)}

        # keys are integers
        # vals are chars
        self.int_char = {i:c for i, c in enumerate(self.unique_chars)}

        # we will encore our text by taking a character and representing it by
        # the index that we have assigned to it in our char_int dictionary
        # we are transforming natural language into a numerical representation (similar to countvectorizer and tfidf)
        encoded = [self.char_int[char] for char in text]

        total_num_chars_in_text = len(encoded)

        sequences = [] # Each sequence in this list is maxlen chars long
        next_char = [] # One element for each sequence

        for i in range(0, total_num_chars_in_text - maxlen, step):

            # input sequence
            sequences.append(encoded[i : i + maxlen])
            # the very next char that a model should predict will follow the input sequence
            next_char.append(encoded[i + maxlen])

        # we know we have this many samples
        print('Created {0} sequences.'.format(len(sequences)))

        self.sequences = sequences
        self.next_char = next_char

    def create_X_and_Y(self):
        """
        Takes a sequence of chars and creates an input/output split (i.e. X and Y)

        Paremeters
        ----------
        None

        Returns
        -------
        x: array of Booleans (i.e. True and False)
        y: array of Booleans (i.e. True and False)
        """
        # this is the number of rows in the doc-term matrix that we are about to create (i.e. x)
        n_seqs = len(self.sequences)

        # this is the number of features in the doc-term matrix that we are about to create
        n_unique_chars = len(self.unique_chars)

        # Create shape for x and y
        x_dims = (len(self.sequences), self.maxlen, len(self.unique_chars))
        y_dims = (len(self.sequences),len(self.unique_chars))

        # create data containers for x and y
        # default values will all be zero ( i.e. look up docs for np.zeros() )
        # recall that a value of zero is equivalent to False in Python
        x = np.zeros(x_dims, dtype=bool)
        y = np.zeros(y_dims, dtype=bool)

        # populate x and y with 1 (from a Boolean perspective, 1 and True are the same thing)
        # iterative through the index and sequence
        for i, sequence in enumerate(self.sequences):
            # take tha sequence and iterate through the chars in the sequence
            for t, char in enumerate(sequence):
                # for row i, location in time series t, and feature char
                # assign a value of 1
                # recall we are using encoded chars from def create_char_sequenes()
                # meaning characters are now represented by a numerical value
                x[i,t,char] = 1

            # follow similar for the char that should be predicted by the model
            # given the corresponding sequence of chars in x
            y[i, self.next_char[i]] = 1

        return x, y

## Separator so youcan collapse toolkit

In [3]:
# download all Shakespeare Sonnets from Project Gutenberg
url = "https://www.gutenberg.org/cache/epub/1041/pg1041.txt"
data = requests.get(url)


In [4]:
# extract the downloaded text from the requests object and save it to `raw_text_data`
raw_text_data = data.text
# verify data type of `raw_text_data`
assert(type(raw_text_data)==str)

### Data Cleaning

In [5]:
# inspect data
raw_text_data[:3000]

"\ufeffThe Project Gutenberg eBook of Shakespeare's Sonnets\r\n    \r\nThis ebook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\nof the Project Gutenberg License included with this ebook or online\r\nat www.gutenberg.org. If you are not located in the United States,\r\nyou will have to check the laws of the country where you are located\r\nbefore using this eBook.\r\n\r\nTitle: Shakespeare's Sonnets\r\n\r\n\r\nAuthor: William Shakespeare\r\n\r\nRelease date: September 1, 1997 [eBook #1041]\r\n                Most recently updated: September 23, 2023\r\n\r\nLanguage: English\r\n\r\nCredits: the Project Gutenberg Shakespeare Team\r\n\r\n\r\n*** START OF THE PROJECT GUTENBERG EBOOK SHAKESPEARE'S SONNETS ***\r\n\r\n\r\n\r\n\r\nTHE SONNETS\r\n\r\nby William Shakespeare\r\n\r\n\r\n\r\n\r\nI\r\n\r\nFrom fairest creatures we desire

In [6]:
# split the text into **lines** and save the result to `split_data`
split_data = raw_text_data.split('\r\n')

In [7]:
# we need to drop all the boiler plate text (i.e. titles and descriptions) as well as extra white spaces
# so that we are left with only the sonnets themselves
split_data = [line for line in split_data if line.strip()]

# droping the text by index
for i, line in enumerate(split_data):
    if 'by William Shakespeare' in line:
        split_data = split_data[i:]
        break

print(split_data)
split_data[:80]



['by William Shakespeare',
 'I',
 'From fairest creatures we desire increase,',
 'That thereby beauty’s rose might never die,',
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou, contracted to thine own bright eyes,',
 'Feed’st thy light’s flame with self-substantial fuel,',
 'Making a famine where abundance lies,',
 'Thyself thy foe, to thy sweet self too cruel:',
 'Thou that art now the world’s fresh ornament,',
 'And only herald to the gaudy spring,',
 'Within thine own bud buriest thy content,',
 'And tender churl mak’st waste in niggarding:',
 '    Pity the world, or else this glutton be,',
 '    To eat the world’s due, by the grave and thee.',
 'II',
 'When forty winters shall besiege thy brow,',
 'And dig deep trenches in thy beauty’s field,',
 'Thy youth’s proud livery so gazed on now,',
 'Will be a tatter’d weed of small worth held:',
 'Then being asked, where all thy beauty lies,',
 'Where all the treasure of thy lusty days;',


In [8]:
split_data = [line for line in split_data if line != 'by William Shakespeare']

In [9]:
split_data[:80]

['I',
 'From fairest creatures we desire increase,',
 'That thereby beauty’s rose might never die,',
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou, contracted to thine own bright eyes,',
 'Feed’st thy light’s flame with self-substantial fuel,',
 'Making a famine where abundance lies,',
 'Thyself thy foe, to thy sweet self too cruel:',
 'Thou that art now the world’s fresh ornament,',
 'And only herald to the gaudy spring,',
 'Within thine own bud buriest thy content,',
 'And tender churl mak’st waste in niggarding:',
 '    Pity the world, or else this glutton be,',
 '    To eat the world’s due, by the grave and thee.',
 'II',
 'When forty winters shall besiege thy brow,',
 'And dig deep trenches in thy beauty’s field,',
 'Thy youth’s proud livery so gazed on now,',
 'Will be a tatter’d weed of small worth held:',
 'Then being asked, where all thy beauty lies,',
 'Where all the treasure of thy lusty days;',
 'To say, within thine own 

In [10]:
sonnets = split_data

In [11]:
# removing lines that are too short to be lines in the sonnets
sonnets[200:240]

['Nor can I fortune to brief minutes tell,',
 'Pointing to each his thunder, rain and wind,',
 'Or say with princes if it shall go well',
 'By oft predict that I in heaven find:',
 'But from thine eyes my knowledge I derive,',
 'And constant stars in them I read such art',
 'As ‘Truth and beauty shall together thrive,',
 'If from thyself, to store thou wouldst convert’;',
 '    Or else of thee this I prognosticate:',
 '    ‘Thy end is truth’s and beauty’s doom and date.’',
 'XV',
 'When I consider everything that grows',
 'Holds in perfection but a little moment,',
 'That this huge stage presenteth nought but shows',
 'Whereon the stars in secret influence comment;',
 'When I perceive that men as plants increase,',
 'Cheered and checked even by the self-same sky,',
 'Vaunt in their youthful sap, at height decrease,',
 'And wear their brave state out of memory;',
 'Then the conceit of this inconstant stay',
 'Sets you most rich in youth before my sight,',
 'Where wasteful Time debateth 

In [12]:
# setting length for removal to be 20
n_chars = 20
filtered_sonnets = [line.strip(' ') for line in sonnets if len(line) > n_chars]

In [13]:
filtered_sonnets

['From fairest creatures we desire increase,',
 'That thereby beauty’s rose might never die,',
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou, contracted to thine own bright eyes,',
 'Feed’st thy light’s flame with self-substantial fuel,',
 'Making a famine where abundance lies,',
 'Thyself thy foe, to thy sweet self too cruel:',
 'Thou that art now the world’s fresh ornament,',
 'And only herald to the gaudy spring,',
 'Within thine own bud buriest thy content,',
 'And tender churl mak’st waste in niggarding:',
 'Pity the world, or else this glutton be,',
 'To eat the world’s due, by the grave and thee.',
 'When forty winters shall besiege thy brow,',
 'And dig deep trenches in thy beauty’s field,',
 'Thy youth’s proud livery so gazed on now,',
 'Will be a tatter’d weed of small worth held:',
 'Then being asked, where all thy beauty lies,',
 'Where all the treasure of thy lusty days;',
 'To say, within thine own deep sunken eyes,',
 

### Use Custom Data Cleaning Tool

Useing one of the methods in the `data_cleaning_toolkit` from Bloom Tech to clean your data.

In [14]:
# instantiating the data_cleaning_toolkit class, saving to dctk
dctk = data_cleaning_toolkit()

In [15]:
# useing the data_cleaning_toolkit to remove punctuation and to case normalize - save results to `clean_sonnets`
clean_sonnets = [dctk.clean_data(line) for line in filtered_sonnets]

In [16]:
display(clean_sonnets)
print(len(clean_sonnets))

['from fairest creatures we desire increase',
 'that thereby beautys rose might never die',
 'but as the riper should by time decease',
 'his tender heir might bear his memory',
 'but thou contracted to thine own bright eyes',
 'feedst thy lights flame with selfsubstantial fuel',
 'making a famine where abundance lies',
 'thyself thy foe to thy sweet self too cruel',
 'thou that art now the worlds fresh ornament',
 'and only herald to the gaudy spring',
 'within thine own bud buriest thy content',
 'and tender churl makst waste in niggarding',
 'pity the world or else this glutton be',
 'to eat the worlds due by the grave and thee',
 'when forty winters shall besiege thy brow',
 'and dig deep trenches in thy beautys field',
 'thy youths proud livery so gazed on now',
 'will be a tatterd weed of small worth held',
 'then being asked where all thy beauty lies',
 'where all the treasure of thy lusty days',
 'to say within thine own deep sunken eyes',
 'were an alleating shame and thriftle

2437


### Useing The Data Tool to Create Character Sequences
for the LSTM model

In [17]:
def calc_stats(corpus):
    """
    Calculates statistics on the length of every line in the sonnets
    """
    doc_lens = [len(line) for line in corpus]
    return np.mean(doc_lens), np.median(doc_lens), np.std(doc_lens), np.max(doc_lens), np.min(doc_lens)

In [18]:
# sonnet line length statistics
mean, med, std, max_, min_ = calc_stats(clean_sonnets)
mean, med, std, max_, min_

(43.151826015592945, 41.0, 8.105475800095245, 74, 19)

In [19]:
# useing .create_char_sequences() to create sequences
maxlen = 40
dctk.create_char_sequences(clean_sonnets, maxlen)

Created 21512 sequences.


Take a look at the `data_cleaning_toolkit_class.py` file.

In the first 4 lines of code in the `create_char_sequences` method, class attributes `n_features` and `unique_chars` are created. <br>
Let's call them in the cells below.

In [20]:
# number of input features for our LSTM model
dctk.n_features

27

In [21]:
# unique characters that appear in our sonnets, this will be the features that our model predicts
dctk.unique_chars

['l',
 'x',
 'p',
 'm',
 's',
 'i',
 'o',
 'e',
 'q',
 'b',
 'c',
 'a',
 'k',
 'u',
 'n',
 'r',
 'w',
 'g',
 'j',
 'f',
 'v',
 ' ',
 'y',
 'z',
 'd',
 't',
 'h']

----

### Use Our Data Tool to Create X and Y Splits

Creating `create_X_and_Y` method to do this

In [22]:
X, y = dctk.create_X_and_Y()

In [23]:
# notice that our input array isn't a matrix - it's a rank three tensor
X.shape

(21512, 40, 27)

In $X$.shape, we see three numbers (*n1*, *n2*, *n3*). What do these numbers mean?

Well, *n1* tells us the number of samples that we have. But what about the other two?

In [24]:
# first index returns a signle sample, which we can see is a sequence
first_sample_index = 0
X[first_sample_index]

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

Notice that each sequence (i.e., $X[i]$ where $i$ is some index value) is `maxlen` long and <br>
has a number of features equal to `dctk.n_features`. <br>Let's try to understand this shape.

In [25]:
# each sequence is maxlen long and has dctk.n_features number of features
X[first_sample_index].shape

(40, 27)

**Each row corresponds to a character vector,** and there is `maxlen` number of character vectors.

**Each column corresponds to a unique character,** and there are `dctk.n_features` number of features.


In [26]:
# let's index for a single character vector
first_char_vect_index = 0
X[first_sample_index][first_char_vect_index]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False])

Notice that there is a single `True` value, and all the rest of the values are `False`.

This is a one-hot encoding for which character appears at each index within a sequence. Specifically, the cell above is looking at the first character in the sequence.

Only a single character can appear as the first character in a sequence, so there will be a single `True` value, and the rest will be `False`.

Let's say that `True` appears in the $ith$ index; by  $ith$ index we mean some index in the general case. So how can we find out which character corresponds to?

To answer this question, we need to use the character-to-integer look-up dictionary.

In [27]:
dctk.int_char

{0: 'l',
 1: 'x',
 2: 'p',
 3: 'm',
 4: 's',
 5: 'i',
 6: 'o',
 7: 'e',
 8: 'q',
 9: 'b',
 10: 'c',
 11: 'a',
 12: 'k',
 13: 'u',
 14: 'n',
 15: 'r',
 16: 'w',
 17: 'g',
 18: 'j',
 19: 'f',
 20: 'v',
 21: ' ',
 22: 'y',
 23: 'z',
 24: 'd',
 25: 't',
 26: 'h'}

In [28]:
# setting up seq_len_counter
seq_len_counter = 0

# index for a single sample
for seq_of_char_vects in X[first_sample_index]:

    # get index with max value, which will be the one TRUE value
    index_with_TRUE_val = np.argmax(seq_of_char_vects)

    print (dctk.int_char[index_with_TRUE_val])

    seq_len_counter+=1

print ("Sequence length: {}".format(seq_len_counter))

f
r
o
m
 
f
a
i
r
e
s
t
 
c
r
e
a
t
u
r
e
s
 
w
e
 
d
e
s
i
r
e
 
i
n
c
r
e
a
s
Sequence length: 40


----


### Building the Shakespeare Sonnet Text Generation Model

In [29]:
def sample(preds, temperature=1.0):
    """
    Helper function to generate a sample character
    Input is a predictions vector from our model, for example a set of 27 character probabilities
    Output is the index of the generated character
    """
    # convert predictions to an array
    preds = np.asarray(preds).astype('float64')

    # use the temperature hyper-parameter to "warp" (sharpen or spread out) the probability distribution
    preds = np.log(preds) / temperature

    # use the softmax activation function to create a new list of probabilities
    #   corresponding to the "warped" probability distribution
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    # Draw a single sample from a multinomial distribution, given these probabilities
    #   The sample will be a one-hot encoded character
    """ Notes on the np.random.multinomial() function
       The first argument is the number of "trials" we want: 1 in this case
       The second argument is the list of probabilities for each character
       The third argument is number of sets of "trials" we want: again, 1 in this case
       By analogy with a dice-rolling experiment:
          This "trial" consists of generating a single "throw" of a die with 27 faces;
             each face corresponds to a character and its associated probability
    """

    probas = np.random.multinomial(1, preds, 1)

    # return the index that corresponds to the max probability
    return np.argmax(probas)


In [30]:
# Creating the `on_epoch_end` function to be passed into `LambdaCallback()`
def on_epoch_end(epoch, _):
    """"
    Function invoked at the end of each epoch. Prints the text generated by our model.
    """

    print()
    print('----- Generating text after Epoch: %d' % epoch)


    # randomly pick a starting index
    # will be used to take a random sequence of chars from `text`
    start_index = random.randint(0, len(text) - dctk.maxlen - 1)

    # this is our seed string (i.e. input seqeunece into the model)
    generated = ''

    # start the sentence at index `start_index` and include the next` dctk.maxlen` number of chars
    sentence = text[start_index: start_index + dctk.maxlen]

    # add to generated
    generated += sentence


    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    # use model to predict what the next maxlen chars should be that follow the seed string
    for i in range(maxlen):

        # shape of a single sample in a rank 3 tensor
        x_dims = (1, dctk.maxlen, dctk.n_features)
        # create an array of zeros with shape x_dims
        # recall that python considers zeros and boolean FALSE as the same
        x_pred = np.zeros(x_dims)

        # create a seq vector for our randomly select sequence
        # i.e. create a numerical encoding for each char in the sequence
        for t, char in enumerate(sentence):
            # for sample 0 in seq index t and character `char` encode a 1 (which is the same as a TRUE)
            x_pred[0, t, dctk.char_int[char]] = 1

        # next, take the seq vector and pass into model to get a prediction of what the next char should be
        preds = model.predict(x_pred, verbose=0)[0]
        # use the sample helper function to get index for next char
        next_index = sample(preds)
        # use look up dict to get next char
        next_char = dctk.int_char[next_index]

        # append next char to sequence
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

In [31]:
# need this for on_epoch_end()
text = " ".join(clean_sonnets)
print(f'All of Shakespeare\'s sonnets comprise about {len(text)} characters')

All of Shakespeare's sonnets comprise about 107597 characters


In [32]:
# create callback object that will print out text generation at the end of each epoch
# use for real-time monitoring of model performance
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [43]:
# build text generation model layer by layer
# fit model
model = Sequential()

model.add(LSTM(264, input_shape=(dctk.maxlen, dctk.n_features), return_sequences=True, activation='relu'))

model.add(LSTM(128, activation='relu'))

model.add(Dense(dctk.n_features, activation='Softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit(X, y, batch_size=256, epochs=70, callbacks=[print_callback], workers=10)





Epoch 1/70
----- Generating text after Epoch: 0
----- Generating with seed: "me doth point out thee as his triumphant"
me doth point out thee as his triumphante iipttghgjtrt dhddas nf nphhheiswssbdpe
Epoch 2/70
----- Generating text after Epoch: 1
----- Generating with seed: "and your true rights be termd a poets ra"
and your true rights be termd a poets rabiseoeabte nnevre aehrh mdbtehsset uott 
Epoch 3/70
----- Generating text after Epoch: 2
----- Generating with seed: " put beside his part or some fierce thin"
 put beside his part or some fierce thinrfmleysseip lfrneigrchdtlthe osl hkly gn
Epoch 4/70
----- Generating text after Epoch: 3
----- Generating with seed: "f derivative works reports performances "
f derivative works reports performances utoou vtuot oh s loos otht raw orts d to
Epoch 5/70
----- Generating text after Epoch: 4
----- Generating with seed: "survey these poor rude lines of thy dece"
survey these poor rude lines of thy decew ths aire fhaisr fe com bnlo sor simd s


<keras.src.callbacks.History at 0x7d750816ea10>

In [44]:
# save trained model to file
model.save("trained_text_gen_model.keras")