# Generating Code example

The following is an example of how to generate code from a trained keras model.

This is just an example, but if you want to run it, you can download `prideandprejudice.txt` and `exampmodel.zip` from Canvas, upload it here, and run.

## Load packages

In [14]:
import keras as kb
import tensorflow as tf
import numpy as np
import string
from random import randint
from pickle import load
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Input
from tensorflow.keras import Model

## Load File and Generate Sequences

In [15]:
# load doc into memory
def load_doc(filename):
 # open the file as read only
 file = open(filename, 'r', encoding='utf-8')
 # read all text
 text = file.read()
 # close the file
 file.close()
 return text

# turn a doc into clean tokens
def clean_doc(doc):
 # replace '--' with a space ' '
 doc = doc.replace('--', ' ')
 # split into tokens by white space
 tokens = doc.split()
 # remove punctuation from each token
 table = str.maketrans('', '', string.punctuation)
 tokens = [w.translate(table) for w in tokens]
 # remove remaining tokens that are not alphabetic
 tokens = [word for word in tokens if word.isalpha()]
 # make lower case
 tokens = [word.lower() for word in tokens]
 return tokens

# save tokens to file, one dialog per line
def save_doc(lines, filename):
 data = '\n'.join(lines)
 file = open(filename, 'w')
 file.write(data)
 file.close()

# load document
in_filename = 'Shake.txt'
doc = load_doc(in_filename)
print(doc[:200])

# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# organize into sequences of tokens
length = 20 + 1
sequences = list()
for i in range(length, len(tokens)):
 # select sequence of tokens
 seq = tokens[i-length:i]
 # convert into a line
 line = ' '.join(seq)
 # store
 sequences.append(line)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'Shake_sequences.txt'
save_doc(sequences, out_filename)


Title: Romeo and Juliet

Author: William Shakespeare

Release date: November 1, 1998 [eBook #1513]
                Most recently updated: June 27, 2023

Language: English

Credits: the PG Shakespeare
['title', 'romeo', 'and', 'juliet', 'author', 'william', 'shakespeare', 'release', 'date', 'november', 'ebook', 'most', 'recently', 'updated', 'june', 'language', 'english', 'credits', 'the', 'pg', 'shakespeare', 'team', 'a', 'team', 'of', 'about', 'twenty', 'project', 'gutenberg', 'volunteers', 'start', 'of', 'the', 'project', 'gutenberg', 'ebook', 'romeo', 'and', 'juliet', 'the', 'tragedy', 'of', 'romeo', 'and', 'juliet', 'by', 'william', 'shakespeare', 'contents', 'the', 'prologue', 'act', 'i', 'scene', 'i', 'a', 'public', 'place', 'scene', 'ii', 'a', 'street', 'scene', 'iii', 'room', 'in', 'house', 'scene', 'iv', 'a', 'street', 'scene', 'v', 'a', 'hall', 'in', 'house', 'act', 'ii', 'chorus', 'scene', 'i', 'an', 'open', 'place', 'adjoining', 'garden', 'scene', 'ii', 'garden', 'scene', 

In [16]:
# load
in_filename = 'Shake_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [17]:
# integer encode sequences of words
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [18]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# create sequences
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = kb.utils.to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [20]:
# My initial LSTM model
inputs = Input(shape=(X.shape[1], 1))
x = LSTM(256)(inputs)
x = Dropout(0.2)(x)
output = Dense(vocab_size, activation='softmax')(x)
model = Model(inputs=inputs, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics='accuracy')

In [21]:
model.fit(X, y, epochs=40, batch_size=128,verbose=0)

<keras.src.callbacks.History at 0x7a25439cafb0>

In [22]:
# After training, evaluate the model on the same dataset to get the loss and metrics
loss, accuracy = model.evaluate(X, y, verbose=0)

# Print the loss and accuracy
print(f'Final loss: {loss}')
print(f'Final accuracy: {accuracy}')

Final loss: 2.6925363540649414
Final accuracy: 0.45420411229133606


## Generate Text

This loops through to generate long strings of text using the seed text + any generated tokens to predict the next token.

In [23]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    for _ in range(n_words):
        # Encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # Pad sequences to a fixed length
        encoded = kb.preprocessing.sequence.pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # Predict probabilities for each word
        yhat = model.predict(encoded, verbose=0)
        # Get the highest probability word index
        yhat = np.argmax(yhat, axis=-1)
        # Map predicted word index to word
        out_word = tokenizer.index_word.get(yhat[0], '')  # Use .get to avoid KeyErrors and provide a default
        # If no word is found, break the loop
        if out_word == '':
            print('Unable to find a word for the index predicted.')
            break
        # Append to the result
        result.append(out_word)
        # Update the seed text with the new word
        in_text += ' ' + out_word
        # Use only the last 'seq_length' words to generate the next word
        in_text = ' '.join(in_text.split()[-seq_length:])
    return ' '.join(result)

# Generate new text 10x
for i in range(10):
  seed_text = lines[randint(0, len(lines))]
  print("Seed text: ", seed_text)
  generated = generate_seq(model, tokenizer, seq_length, seed_text, 20)
  print("GENDERATED TEXT:", generated + '\n\n')


Seed text:  send to romeo but when i came some minute ere the time of her awaking here untimely lay the noble paris
GENDERATED TEXT: come i prince servant romeo romeo romeo madam that thou death not that the montague the may so nurse groan


Seed text:  and his brother valentine mine uncle capulet his wife and daughters my fair niece rosaline and livia signior valentio and his
GENDERATED TEXT: cousin and daughters of his moved and the lively helena of the unseen button of the watery flask of duellist


Seed text:  of substance as the air and more inconstant than the wind who woos even now the frozen bosom of the north
GENDERATED TEXT: of the capulets of the capulets sampson the the capulets place and immortal passado of the capulets came the immortal


Seed text:  her silver why why with her silver what say you simon catling first musician marry sir because silver hath a sweet
GENDERATED TEXT: sound the then the me that both one that a comfort i tell i will for civil within to heads




In [24]:
#New model architecture with the mulitple recurrent layers
newmodel = Sequential()
#Three recurrent layers
newmodel.add(LSTM(128, return_sequences=True))
newmodel.add(LSTM(64, return_sequences=True))
newmodel.add(LSTM(32,return_sequences=True))

newmodel.add(Dense(1, activation='sigmoid'))

newmodel = Model(inputs=inputs, outputs=output)


# Compile the model
newmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
newmodel.fit(X, y, epochs=40, batch_size=128, verbose=0)

<keras.src.callbacks.History at 0x7a25b2c57280>

In [25]:
# After training, evaluate the model on the same dataset to get the loss and metrics
loss, accuracy = newmodel.evaluate(X, y, verbose=0)

# Print the loss and accuracy
print(f'Final loss: {loss}')
print(f'Final accuracy: {accuracy}')

Final loss: 1.3925262689590454
Final accuracy: 0.7179948091506958


In [26]:
# Generate new text 10x
for i in range(10):
  seed_text = lines[randint(0, len(lines))]
  print("Seed text: ", seed_text)
  generated = generate_seq(newmodel, tokenizer, seq_length, seed_text, 20)
  print("GENDERATED TEXT:", generated + '\n\n')

Seed text:  lawrence now must i to the monument alone within this three hours will fair juliet wake she will beshrew me much
GENDERATED TEXT: the romeo my night o was be of a be that a grief that a which love as what is


Seed text:  men either withdraw unto some private place and reason coldly of your grievances or else depart here all eyes gaze on
GENDERATED TEXT: us and to peace and balthasar of death thou in can word of young me tell be tell be tell


Seed text:  doth give nor aught so good but from that fair use revolts from true birth stumbling on abuse virtue itself turns
GENDERATED TEXT: vice vice vice the vice of the beast of the house and bakes of the watery beams of the watery


Seed text:  it to my face paris thy face is mine and thou hast it juliet it may be so for it is
GENDERATED TEXT: not mine own you life you that my lady o thou her thou my her love your that man price


Seed text:  is paris have i thought long to see this face and doth it give me such a sight as this 