In [None]:
pip install tensorflow



In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the UTF-8 text corpus and convert it to lowercase so the vocabulary
# does not contain separate upper- and lower-case entries.
filename = "data_extract.txt"
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open until garbage collection.
with open(filename, 'r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()
raw_text = raw_text.lower()

In [None]:

# Build the character -> integer lookup: every distinct character in the
# text, in sorted order, is assigned its position as its integer code.
chars = sorted(set(raw_text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}

In [None]:

# Summarize the loaded data: number of distinct characters (vocabulary size)
# and total number of characters in the text.
n_vocab = len(chars)
n_chars = len(raw_text)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  482999
Total Vocab:  59


In [None]:
# Build the integer-encoded training pairs with a sliding window of step 1:
# each window of `seq_length` characters is one input pattern, and the
# character immediately after the window is its target.
seq_length = 100
# Encode the whole text once; windows are then sliced from the integer sequence.
encoded_text = [char_to_int[ch] for ch in raw_text]
window_starts = range(n_chars - seq_length)
dataX = [encoded_text[pos:pos + seq_length] for pos in window_starts]
dataY = [encoded_text[pos + seq_length] for pos in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  482899


In [None]:

# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize: divide the integer codes by the vocabulary size
X = X / float(n_vocab)
# one hot encode the output variable.
# num_classes is pinned to the vocabulary size: by default to_categorical
# infers the width from the largest label present in dataY, which would make
# the output layer narrower than the vocabulary if the highest-coded
# character never occurs as a target.
y = to_categorical(dataY, num_classes=n_vocab)

In [None]:

# Define the network: one 256-unit LSTM layer over the input windows,
# 20% dropout, and a softmax layer with one output per one-hot class in y.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:

# Checkpoint callback: monitors the training loss and, because
# save_best_only=True, writes a file only when the loss improves.
# The epoch number and loss value are formatted into the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train for 20 epochs in batches of 128, with the checkpoint callback attached.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=20,
    callbacks=callbacks_list,
)

Epoch 1/20
Epoch 1: loss improved from inf to 2.88035, saving model to weights-improvement-01-2.8803.hdf5
Epoch 2/20
   9/3773 [..............................] - ETA: 49s - loss: 2.7781

  saving_api.save_model(


Epoch 2: loss improved from 2.88035 to 2.73981, saving model to weights-improvement-02-2.7398.hdf5
Epoch 3/20
Epoch 3: loss improved from 2.73981 to 2.65709, saving model to weights-improvement-03-2.6571.hdf5
Epoch 4/20
Epoch 4: loss improved from 2.65709 to 2.56603, saving model to weights-improvement-04-2.5660.hdf5
Epoch 5/20
Epoch 5: loss improved from 2.56603 to 2.48206, saving model to weights-improvement-05-2.4821.hdf5
Epoch 6/20
Epoch 6: loss improved from 2.48206 to 2.41418, saving model to weights-improvement-06-2.4142.hdf5
Epoch 7/20
Epoch 7: loss improved from 2.41418 to 2.36124, saving model to weights-improvement-07-2.3612.hdf5
Epoch 8/20
Epoch 8: loss improved from 2.36124 to 2.31886, saving model to weights-improvement-08-2.3189.hdf5
Epoch 9/20
Epoch 9: loss improved from 2.31886 to 2.28276, saving model to weights-improvement-09-2.2828.hdf5
Epoch 10/20
Epoch 10: loss improved from 2.28276 to 2.25029, saving model to weights-improvement-10-2.2503.hdf5
Epoch 11/20
Epoch 1

<keras.src.callbacks.History at 0x7f0336f9bf40>

In [None]:
# Small LSTM network to generate text, trained on the corpus in data_extract.txt.
# This cell repeats the whole pipeline in one place (load, encode, build the
# model, train); running it creates a fresh `model` and trains it from scratch.
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
# load the UTF-8 text and convert to lowercase
filename = "data_extract.txt"
# The context manager closes the file handle as soon as the text is read.
with open(filename, 'r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()
raw_text = raw_text.lower()
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)
# prepare the dataset of input to output pairs encoded as integers:
# each window of seq_length characters is an input, the next character is the target
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize: divide the integer codes by the vocabulary size
X = X / float(n_vocab)
# one hot encode the output variable; num_classes is pinned to the vocabulary
# size because to_categorical otherwise infers the width from the largest
# label present in dataY
y = to_categorical(dataY, num_classes=n_vocab)
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# define the checkpoint: a file is written only when the training loss improves
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# fit the model
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Total Characters:  482999
Total Vocab:  59
Total Patterns:  482899
Epoch 1/20
Epoch 1: loss improved from inf to 2.87315, saving model to weights-improvement-01-2.8731.hdf5
Epoch 2/20
Epoch 2: loss improved from 2.87315 to 2.72707, saving model to weights-improvement-02-2.7271.hdf5
Epoch 3/20
Epoch 3: loss improved from 2.72707 to 2.63969, saving model to weights-improvement-03-2.6397.hdf5
Epoch 4/20
Epoch 4: loss improved from 2.63969 to 2.54658, saving model to weights-improvement-04-2.5466.hdf5
Epoch 5/20
Epoch 5: loss improved from 2.54658 to 2.46594, saving model to weights-improvement-05-2.4659.hdf5
Epoch 6/20
Epoch 6: loss improved from 2.46594 to 2.40331, saving model to weights-improvement-06-2.4033.hdf5
Epoch 7/20
Epoch 7: loss improved from 2.40331 to 2.35292, saving model to weights-improvement-07-2.3529.hdf5
Epoch 8/20
Epoch 8: loss improved from 2.35292 to 2.31284, saving model to weights-improvement-08-2.3128.hdf5
Epoch 9/20
Epoch 9: loss improved from 2.31284 to 2.27790

<keras.src.callbacks.History at 0x7f0336f6f490>

## Generating Text with an LSTM Network

In [None]:
# Restore weights from a saved checkpoint file and compile the model again.
# NOTE: checkpoint names encode the epoch and loss, so this name has to match
# a file that a training run actually wrote.
filename = "weights-improvement-20-2.0581.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Reverse lookup (integer code -> character), used to turn predictions back into text.
int_to_char = dict(enumerate(chars))

In [None]:
# Pick a random seed pattern and generate 1000 characters from it.
# np.random.randint treats `high` as exclusive, so len(dataX) (not
# len(dataX) - 1) is needed for the last pattern to be selectable.
start = np.random.randint(0, len(dataX))
# Copy the seed: the window is appended to below, and appending to the list
# stored inside dataX would leave that training pattern one element too long.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    # Shape the window as [1 sample, time steps, 1 feature] and scale it by
    # the vocabulary size, the same way the training input was scaled.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = int(np.argmax(prediction))
    result = int_to_char[index]
    # Stream the character with print(); the previous sys.stdout.write call
    # needed the `sys` module, which is not imported in the visible cells.
    print(result, end="", flush=True)
    # Slide the window: add the predicted code and drop the oldest one.
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
" ild!” observed hester, aside to the
minister. “o, i have much to tell thee about her! but, in
very t "
ooet, and the mani of the soaee of the maniet-place, and the mani of the pani of the pane of the pane oo her breosi was the sore of the searet of the pare ofniler, whth a sore of the searet of the saarlet letter, and the mani of the pane of the saarlet letter was the sore of the searet of the pare ofniler, whth a sore of the searet of the saarlet letter, and the mani of the pane of the saarlet letter was the sore of the searet of the pare ofniler, whth a sore of the searet of the saarlet letter, and the mani of the pane of the saarlet letter was the sore of the searet of the pare ofniler, whth a sore of the searet of the saarlet letter, and the mani of the pane of the saarlet letter was the sore of the searet of the pare ofniler, whth a sore of the searet of the saarlet letter, and the mani of the pane of the saarlet letter was the sore of the searet of the pare ofniler, whth 