In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.callbacks import LambdaCallback
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional, LSTM

%matplotlib inline


## Adjusted toolkit code


In [None]:
class DataCleaningToolkit:
    def __init__(self):
        self.sequences = None
        self.next_char = None
        self.n_features = None
        self.unique_chars = None
        self.maxlen = None
        self.char_int = None
        self.int_char = None

    def clean_data(self, doc):
        date_regex = r"\d+/\d+/\d+"
        punct_regex = r"[^0-9a-zA-Z\s]"
        special_chars_regex = r"[\$\%\&\@\n+]"
        numerical_regex = r"\d+"
        multiple_whitespace = " {2,}"

        doc = re.sub(date_regex, "", doc)
        doc = re.sub(special_chars_regex, " ", doc)
        doc = re.sub(numerical_regex, "", doc)
        doc = re.sub(multiple_whitespace, "", doc)

        return doc.lower()

    def create_char_sequences(self, data, maxlen=20, step=5):
        self.maxlen = maxlen
        text = " ".join(data)
        self.unique_chars = list(set(text))
        self.n_features = len(self.unique_chars)
        self.char_int = {c: i for i, c in enumerate(self.unique_chars)}
        self.int_char = {i: c for i, c in enumerate(self.unique_chars)}
        encoded = [self.char_int[char] for char in text]
        total_num_chars_in_text = len(encoded)
        sequences = []
        next_char = []

        for i in range(0, total_num_chars_in_text - maxlen, step):
            sequences.append(encoded[i: i + maxlen])
            next_char.append(encoded[i + maxlen])

        print('Created {0} sequences.'.format(len(sequences)))

        self.sequences = sequences
        self.next_char = next_char

    def create_X_and_Y(self):
        n_seqs = len(self.sequences)
        n_unique_chars = len(self.unique_chars)
        x_dims = (len(self.sequences), self.maxlen, len(self.unique_chars))
        y_dims = (len(self.sequences), len(self.unique_chars))
        x = np.zeros(x_dims, dtype=bool)
        y = np.zeros(y_dims, dtype=bool)

        for i, sequence in enumerate(self.sequences):
            for t, char in enumerate(sequence):
                x[i, t, char] = 1
            y[i, self.next_char[i]] = 1

        return x, y


In [3]:
import requests

# download all Shakespeare Sonnets from Project Gutenberg
url = "https://www.gutenberg.org/cache/epub/1041/pg1041.txt"
response = requests.get(url)
data = response.text


In [4]:
# extract the downloaded text from the requests object and save it to `raw_text_data`
raw_text_data = data
# verify data type of `raw_text_data`
assert isinstance(raw_text_data, str)


### Data Cleaning

In [5]:
# inspect data
print(raw_text_data[:3000])


"\ufeffThe Project Gutenberg eBook of Shakespeare's Sonnets\r\n    \r\nThis ebook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\nof the Project Gutenberg License included with this ebook or online\r\nat www.gutenberg.org. If you are not located in the United States,\r\nyou will have to check the laws of the country where you are located\r\nbefore using this eBook.\r\n\r\nTitle: Shakespeare's Sonnets\r\n\r\nAuthor: William Shakespeare\r\n\r\nRelease date: September 1, 1997 [eBook #1041]\r\n                Most recently updated: March 10, 2024\r\n\r\nLanguage: English\r\n\r\nCredits: the Project Gutenberg Shakespeare Team\r\n\r\n\r\n*** START OF THE PROJECT GUTENBERG EBOOK SHAKESPEARE'S SONNETS ***\r\nTHE SONNETS\r\n\r\nby William Shakespeare\r\n\r\n\r\n\r\n\r\nI\r\n\r\nFrom fairest creatures we desire increase,\r\nThat there

In [6]:
# split the text into lines and save the result to `split_data`
split_data = raw_text_data.split('\r\n')


In [7]:
# Drop all the boilerplate text (titles and descriptions) as well as extra white spaces
# so that we are left with only the sonnets themselves
split_data = [line for line in split_data if line.strip()]

# Drop the text by index
for i, line in enumerate(split_data):
    if 'by William Shakespeare' in line:
        split_data = split_data[i:]
        break

print(split_data[:80])




['by William Shakespeare',
 'I',
 'From fairest creatures we desire increase,',
 'That thereby beauty’s rose might never die,',
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou, contracted to thine own bright eyes,',
 'Feed’st thy light’s flame with self-substantial fuel,',
 'Making a famine where abundance lies,',
 'Thyself thy foe, to thy sweet self too cruel:',
 'Thou that art now the world’s fresh ornament,',
 'And only herald to the gaudy spring,',
 'Within thine own bud buriest thy content,',
 'And tender churl mak’st waste in niggarding:',
 '    Pity the world, or else this glutton be,',
 '    To eat the world’s due, by the grave and thee.',
 'II',
 'When forty winters shall besiege thy brow,',
 'And dig deep trenches in thy beauty’s field,',
 'Thy youth’s proud livery so gazed on now,',
 'Will be a tatter’d weed of small worth held:',
 'Then being asked, where all thy beauty lies,',
 'Where all the treasure of thy lusty days;',


In [8]:
split_data = [line for line in split_data if 'by William Shakespeare' not in line]


In [9]:
print(split_data[:10])


['I',
 'From fairest creatures we desire increase,',
 'That thereby beauty’s rose might never die,',
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou, contracted to thine own bright eyes,',
 'Feed’st thy light’s flame with self-substantial fuel,',
 'Making a famine where abundance lies,',
 'Thyself thy foe, to thy sweet self too cruel:',
 'Thou that art now the world’s fresh ornament,']

In [10]:
sonnets = [line for line in split_data if 'by William Shakespeare' not in line]


In [11]:
sonnets = [line for line in sonnets if len(line) > 10]
sonnets[200:240]


['Nor can I fortune to brief minutes tell,',
 'Pointing to each his thunder, rain and wind,',
 'Or say with princes if it shall go well',
 'By oft predict that I in heaven find:',
 'But from thine eyes my knowledge I derive,',
 'And constant stars in them I read such art',
 'As ‘Truth and beauty shall together thrive,',
 'If from thyself, to store thou wouldst convert’;',
 '    Or else of thee this I prognosticate:',
 '    ‘Thy end is truth’s and beauty’s doom and date.’',
 'XV',
 'When I consider everything that grows',
 'Holds in perfection but a little moment,',
 'That this huge stage presenteth nought but shows',
 'Whereon the stars in secret influence comment;',
 'When I perceive that men as plants increase,',
 'Cheered and checked even by the self-same sky,',
 'Vaunt in their youthful sap, at height decrease,',
 'And wear their brave state out of memory;',
 'Then the conceit of this inconstant stay',
 'Sets you most rich in youth before my sight,',
 'Where wasteful Time debateth 

In [12]:
n_chars = 20
filtered_sonnets = [line.strip() for line in sonnets if len(line) > n_chars]


In [13]:
filtered_sonnets[:20]

['From fairest creatures we desire increase,',
 'That thereby beauty’s rose might never die,',
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou, contracted to thine own bright eyes,',
 'Feed’st thy light’s flame with self-substantial fuel,',
 'Making a famine where abundance lies,',
 'Thyself thy foe, to thy sweet self too cruel:',
 'Thou that art now the world’s fresh ornament,',
 'And only herald to the gaudy spring,',
 'Within thine own bud buriest thy content,',
 'And tender churl mak’st waste in niggarding:',
 'Pity the world, or else this glutton be,',
 'To eat the world’s due, by the grave and thee.',
 'When forty winters shall besiege thy brow,',
 'And dig deep trenches in thy beauty’s field,',
 'Thy youth’s proud livery so gazed on now,',
 'Will be a tatter’d weed of small worth held:',
 'Then being asked, where all thy beauty lies,',
 'Where all the treasure of thy lusty days;']

### Use Custom Data Cleaning Tool

using one of the methods in the `data_cleaning_toolkit` from Bloom teach to clean your data.

In [14]:
# Instantiating the DataCleaningToolkit class and saving it to dctk
dctk = DataCleaningToolkit()


In [15]:
clean_sonnets = filtered_sonnets


In [16]:
print(clean_sonnets)
print(len(clean_sonnets[:10]))


['From fairest creatures we desire increase,',
 'That thereby beauty’s rose might never die,',
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou, contracted to thine own bright eyes,',
 'Feed’st thy light’s flame with self-substantial fuel,',
 'Making a famine where abundance lies,',
 'Thyself thy foe, to thy sweet self too cruel:',
 'Thou that art now the world’s fresh ornament,',
 'And only herald to the gaudy spring,',
 'Within thine own bud buriest thy content,',
 'And tender churl mak’st waste in niggarding:',
 'Pity the world, or else this glutton be,',
 'To eat the world’s due, by the grave and thee.',
 'When forty winters shall besiege thy brow,',
 'And dig deep trenches in thy beauty’s field,',
 'Thy youth’s proud livery so gazed on now,',
 'Will be a tatter’d weed of small worth held:',
 'Then being asked, where all thy beauty lies,',
 'Where all the treasure of thy lusty days;',
 'To say, within thine own deep sunken eyes,',
 

10


In [17]:
import numpy as np

def calc_stats(corpus):
    """
    Calculates statistics on the length of every line in the sonnets
    """
    doc_lens = [len(line) for line in corpus]
    return np.mean(doc_lens), np.median(doc_lens), np.std(doc_lens), np.max(doc_lens), np.min(doc_lens)


In [18]:
# Sonnet line length statistics
mean, median, std, max_len, min_len = calc_stats(clean_sonnets)
mean, median, std, max_len, min_len


(44.89946655724251, 43.0, 8.343477135617167, 78, 20)

In [19]:
# Using .create_char_sequences() to create sequences
maxlen = 100
dctk.create_char_sequences(clean_sonnets, maxlen)


Created 22352 sequences.


Take a look at the `data_cleaning_toolkit_class.py` file.

In the first 4 lines of code in the `create_char_sequences` method, class attributes `n_features` and `unique_chars` are created. <br>
Let's call them in the cells below.

In [20]:
# Number of input features for our LSTM model
dctk.n_features


83

In [21]:
# Unique characters that appear in our sonnets, which will be the features that our model predicts
dctk.unique_chars


['G',
 'n',
 'y',
 '3',
 '”',
 '™',
 'u',
 'F',
 'J',
 'D',
 'R',
 's',
 ':',
 'q',
 ' ',
 'X',
 'Y',
 'z',
 'S',
 'W',
 '‘',
 '“',
 'x',
 'c',
 '6',
 'r',
 '8',
 'Q',
 '-',
 '4',
 'e',
 '.',
 'E',
 'o',
 '2',
 '7',
 'd',
 'P',
 '’',
 '%',
 'k',
 'v',
 "'",
 '5',
 '(',
 '/',
 'B',
 '*',
 'N',
 '!',
 'O',
 'm',
 'M',
 'A',
 '$',
 '•',
 'L',
 ';',
 'V',
 '9',
 ')',
 'H',
 ',',
 'h',
 'j',
 '0',
 'f',
 '1',
 'T',
 '?',
 'U',
 'i',
 '—',
 'p',
 'K',
 'C',
 'l',
 'I',
 't',
 'b',
 'w',
 'a',
 'g']

----

### Use Our Data Tool to Create X and Y Splits

Creating `create_X_and_Y` method to do this

In [22]:
X, y = dctk.create_X_and_Y()

In [23]:
# Check the shape of the input array
X.shape


(22352, 100, 83)

In [24]:
# Return the first sample from the input array
first_sample = X[0]


array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [25]:
X[first_sample_index].shape


(100, 83)

In [26]:
# Index for a single character vector
first_char_vect_index = 0
X[first_sample_index, first_char_vect_index]


array([False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False])

In [27]:
dctk.int_char

{0: 'G',
 1: 'n',
 2: 'y',
 3: '3',
 4: '”',
 5: '™',
 6: 'u',
 7: 'F',
 8: 'J',
 9: 'D',
 10: 'R',
 11: 's',
 12: ':',
 13: 'q',
 14: ' ',
 15: 'X',
 16: 'Y',
 17: 'z',
 18: 'S',
 19: 'W',
 20: '‘',
 21: '“',
 22: 'x',
 23: 'c',
 24: '6',
 25: 'r',
 26: '8',
 27: 'Q',
 28: '-',
 29: '4',
 30: 'e',
 31: '.',
 32: 'E',
 33: 'o',
 34: '2',
 35: '7',
 36: 'd',
 37: 'P',
 38: '’',
 39: '%',
 40: 'k',
 41: 'v',
 42: "'",
 43: '5',
 44: '(',
 45: '/',
 46: 'B',
 47: '*',
 48: 'N',
 49: '!',
 50: 'O',
 51: 'm',
 52: 'M',
 53: 'A',
 54: '$',
 55: '•',
 56: 'L',
 57: ';',
 58: 'V',
 59: '9',
 60: ')',
 61: 'H',
 62: ',',
 63: 'h',
 64: 'j',
 65: '0',
 66: 'f',
 67: '1',
 68: 'T',
 69: '?',
 70: 'U',
 71: 'i',
 72: '—',
 73: 'p',
 74: 'K',
 75: 'C',
 76: 'l',
 77: 'I',
 78: 't',
 79: 'b',
 80: 'w',
 81: 'a',
 82: 'g'}

In [28]:
seq_len_counter = 0

for seq_of_char_vects in X[first_sample_index]:
    index_with_TRUE_val = np.argmax(seq_of_char_vects)
    print(dctk.int_char[index_with_TRUE_val])
    seq_len_counter += 1

print("Sequence length: {}".format(seq_len_counter))


F
r
o
m
 
f
a
i
r
e
s
t
 
c
r
e
a
t
u
r
e
s
 
w
e
 
d
e
s
i
r
e
 
i
n
c
r
e
a
s
e
,
 
T
h
a
t
 
t
h
e
r
e
b
y
 
b
e
a
u
t
y
’
s
 
r
o
s
e
 
m
i
g
h
t
 
n
e
v
e
r
 
d
i
e
,
 
B
u
t
 
a
s
 
t
h
e
 
r
i
Sequence length: 100


----


### Building the Shakespeare Sonnet Text Generation Model

In [29]:
def sample(preds, temperature=0.5, top_k=5):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    top_k_idxs = np.argsort(preds)[-top_k:]
    top_k_probs = preds[top_k_idxs]
    top_k_probs /= np.sum(top_k_probs)

    chosen_idx = np.random.choice(top_k_idxs, p=top_k_probs)

    return chosen_idx


In [30]:
def on_epoch_end(epoch, _):
    """
    Function invoked at the end of each epoch. Prints the text generated by our model.
    """

    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - dctk.maxlen - 1)
    generated = ''
    sentence = text[start_index: start_index + dctk.maxlen]
    generated += sentence

    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for _ in range(dctk.maxlen):
        x_dims = (1, dctk.maxlen, dctk.n_features)
        x_pred = np.zeros(x_dims)

        for t, char in enumerate(sentence):
            x_pred[0, t, dctk.char_int[char]] = 1

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds)
        next_char = dctk.int_char[next_index]

        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()


In [31]:
# Import necessary libraries

# Calculate total number of characters in the text
text = " ".join(clean_sonnets)
print(f'All of Shakespeare\'s sonnets comprise about {len(text)} characters')

All of Shakespeare's sonnets comprise about 111856 characters


In [32]:
# Create a callback object that will print out text generation at the end of each epoch
# Use for real-time monitoring of model performance
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)


In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional

model = Sequential([
    Bidirectional(LSTM(256, return_sequences=True, input_shape=(dctk.maxlen, dctk.n_features))),
    Dropout(0.2),
    Bidirectional(LSTM(128)),
    Dropout(0.2),
    Dense(dctk.n_features, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=256, epochs=25, callbacks=[print_callback])


Epoch 1/90


  super().__init__(**kwargs)


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 422ms/step - loss: 3.4627
----- Generating text after Epoch: 0
----- Generating with seed: "any gazers mightst thou lead away, If thou wouldst use the strength of all thy state! But do not so;"
any gazers mightst thou lead away, If thou wouldst use the strength of all thy state! But do not so;i t oet eeete t      t     e  o o     tiet   et e    o ei e   ee   e  ite e  ee  e  ee    e eee  o o
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 460ms/step - loss: 3.4601
Epoch 2/90
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 464ms/step - loss: 3.0386
----- Generating text after Epoch: 1
----- Generating with seed: "thou lov’st me for my name is ‘Will.’ Thou blind fool, Love, what dost thou to mine eyes, That they "
thou lov’st me for my name is ‘Will.’ Thou blind fool, Love, what dost thou to mine eyes, That they the nen he the the on no th non so to ntr te he ne on nte ntoe no sn heer an an s

<keras.src.callbacks.history.History at 0x1ac7fe35880>

In [34]:
The current cell is fine as it is.


[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\synth\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [35]:
from nltk.corpus import cmudict

# Load the CMU Pronouncing Dictionary
d = cmudict.dict()

# Function to convert text to phonemes
def text_to_phonemes(text):
    words = text.lower().split()
    phonemes = [d[word][0] for word in words if word in d]
    return phonemes

# Example usage
text = "hello world"
phonemes = text_to_phonemes(text)
print(phonemes)


[['HH', 'AH0', 'L', 'OW1'], ['W', 'ER1', 'L', 'D']]


In [36]:
def map_phonemes(text_list):
    phonemes_map = {}
    missed_words = []
    for text in text_list:
        word_list = text.split()
        for word in word_list:
            try:
                phonemes = text_to_phonemes(word)[0]
                phonemes_map[word] = phonemes
            except:
                print(f"missed word: {word}")
                missed_words.append(word)
    return phonemes_map, missed_words


In [37]:
The current cell is fine as it is.


missed word: increase,
missed word: beauty’s
missed word: die,
missed word: riper
missed word: decease,
missed word: memory:
missed word: thou,
missed word: eyes,
missed word: Feed’st
missed word: light’s
missed word: self-substantial
missed word: fuel,
missed word: lies,
missed word: Thyself
missed word: foe,
missed word: cruel:
missed word: world’s
missed word: ornament,
missed word: spring,
missed word: buriest
missed word: content,
missed word: churl
missed word: mak’st
missed word: niggarding:
missed word: world,
missed word: glutton
missed word: be,
missed word: world’s
missed word: due,
missed word: thee.
missed word: brow,
missed word: beauty’s
missed word: field,
missed word: youth’s
missed word: now,
missed word: tatter’d
missed word: held:
missed word: asked,
missed word: lies,
missed word: days;
missed word: say,
missed word: eyes,
missed word: all-eating
missed word: shame,
missed word: thriftless
missed word: praise.
missed word: deserv’d
missed word: beauty’s
missed word

In [38]:
print("Phonemes Map:", phonemes_map)
print("Number of Missed Words:", len(missed_words))
print("Number of Words in Phonemes Map:", len(phonemes_map))


4103
2832


### saving the trained model to a file

In [39]:
# Save the trained model to a file
model.save("trained_text_gen_drop_model-1.h5")


