In [None]:
pip install tensorflow



In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [9]:
# load the text (decoded as UTF-8) and convert to lowercase
filename = "data_extract.txt"
# the context manager closes the file handle as soon as the text is read
with open(filename, 'r', encoding='utf-8') as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

In [10]:

# Assign every distinct character an integer id, numbered in sorted order.
chars = sorted(set(raw_text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}

In [11]:

# Report the corpus length and the number of distinct characters.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  482999
Total Vocab:  59


In [12]:
# Slide a fixed-width window over the text: each input is seq_length
# consecutive characters (as integer codes) and the target is the code of
# the character that immediately follows the window.
seq_length = 100
window_starts = range(n_chars - seq_length)
dataX = [
    [char_to_int[ch] for ch in raw_text[begin:begin + seq_length]]
    for begin in window_starts
]
dataY = [char_to_int[raw_text[begin + seq_length]] for begin in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  482899


In [13]:

# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer codes (0 .. n_vocab - 1) into the range [0, 1)
X = X / float(n_vocab)
# one hot encode the output variable; num_classes is pinned to the vocabulary
# size so the output width always equals n_vocab, even when the
# highest-indexed character never occurs as a target
y = to_categorical(dataY, num_classes=n_vocab)

In [14]:

# Build the network: one 256-unit LSTM over the (time steps, features) input,
# 20% dropout, and a softmax layer with one output per column of y.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [15]:

# Checkpoint callback: monitors the training loss (lower is better) and only
# saves when it improves; the epoch number and loss go into the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [16]:
# Train for 40 epochs in batches of 128, passing the checkpoint callbacks.
model.fit(x=X, y=y, batch_size=128, epochs=40, callbacks=callbacks_list)

Epoch 1/40
Epoch 1: loss improved from inf to 2.87977, saving model to weights-improvement-01-2.8798.hdf5
Epoch 2/40
  10/3773 [..............................] - ETA: 48s - loss: 2.7989

  saving_api.save_model(


Epoch 2: loss improved from 2.87977 to 2.74391, saving model to weights-improvement-02-2.7439.hdf5
Epoch 3/40
Epoch 3: loss improved from 2.74391 to 2.66742, saving model to weights-improvement-03-2.6674.hdf5
Epoch 4/40
Epoch 4: loss improved from 2.66742 to 2.58670, saving model to weights-improvement-04-2.5867.hdf5
Epoch 5/40
Epoch 5: loss improved from 2.58670 to 2.50847, saving model to weights-improvement-05-2.5085.hdf5
Epoch 6/40
Epoch 6: loss improved from 2.50847 to 2.44498, saving model to weights-improvement-06-2.4450.hdf5
Epoch 7/40
Epoch 7: loss improved from 2.44498 to 2.39438, saving model to weights-improvement-07-2.3944.hdf5
Epoch 8/40
Epoch 8: loss improved from 2.39438 to 2.35432, saving model to weights-improvement-08-2.3543.hdf5
Epoch 9/40
Epoch 9: loss improved from 2.35432 to 2.31767, saving model to weights-improvement-09-2.3177.hdf5
Epoch 10/40
Epoch 10: loss improved from 2.31767 to 2.28791, saving model to weights-improvement-10-2.2879.hdf5
Epoch 11/40
Epoch 1

<keras.src.callbacks.History at 0x7a806a379fc0>

In [19]:
from tensorflow.keras.models import load_model
from google.colab import files
# Write the trained model to disk, then ask Colab to download that file.
final_model_path = "final_model.h5"
model.save(final_model_path)
files.download(final_model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Generating Text with an LSTM Network

In [22]:
# Restore the weights stored in a checkpoint file, then compile the model
# again with the categorical cross-entropy loss and the adam optimizer.
filename = "weights-improvement-40-1.9462.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [23]:
# Reverse lookup: integer id -> character, in the same order as chars.
int_to_char = dict(enumerate(chars))

In [25]:
import sys

In [26]:
# pick a random seed pattern; randint's upper bound is exclusive, so passing
# len(dataX) lets every pattern (including the last one) be chosen
start = np.random.randint(0, len(dataX))
# copy the seed so that extending it below does not modify dataX[start]
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    # shape the current window as one sample and scale it like the training X
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # greedy decoding: take the most probable next character
    index = int(np.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # slide the window: append the prediction and drop the oldest character
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
" what
resembled that of a person who should entertain an idea
of committing suicide, and, although be "
leriige the sore of her monher, and to the searen thet were a saaredld to the searen that saeked to be aerere herd and the siale whth the sore of her mother’s siale and searet the searen that sae to meter to the searet that whuh the sore of her mother’s siale and searen thet teeree to be the searet that he had been doen to the sooe of her mother’s siale and searen thet teeree to be the searet that hed mote the sooe of the sore of her mother’s siale and searen thet tee hor to the searen thet was a coefty ceal if hed mote the soaee tfat had been soeer of the sooe and searet and searet of her mother’s sorl, and the searen benne the sooe of the sore of her mother’s siale and searen thet teemed to be the searet that hed mote the sooe of the sore of her mother’s siale and searen thet tee hor to the searen thet was a coefty ceal if hed mote the soaee tfat had been soeer of the sooe a

In [27]:
def calculate_perplexity(model, X, y):
    """Return the perplexity of ``model`` on inputs ``X`` against targets ``y``.

    The model's predicted probabilities and the targets are each flattened to
    two dimensions (rows x last axis). The cross-entropy is the negated sum of
    ``y * log(p + 1e-10)`` divided by the number of prediction rows, and the
    perplexity is the exponential of that value.

    Args:
        model: object exposing ``predict(X, verbose=0)`` that returns an array
            of probabilities.
        X: inputs forwarded unchanged to ``model.predict``.
        y: target array whose last axis lines up with the last axis of the
            predictions (one-hot in this notebook).

    Returns:
        The perplexity, ``exp(cross_entropy)``.
    """
    probs = model.predict(X, verbose=0)

    # Collapse any leading axes so each row is one prediction / one target.
    probs_2d = probs.reshape(-1, probs.shape[-1])
    targets_2d = y.reshape(-1, y.shape[-1])

    # The small constant keeps log() finite when a probability is exactly 0.
    log_probs = np.log(probs_2d + 1e-10)
    total_nll = -np.sum(targets_2d * log_probs)
    mean_nll = total_nll / len(probs_2d)

    return np.exp(mean_nll)

# Compute the perplexity from the tensors X and y, then report it.
model_perplexity = calculate_perplexity(model=model, X=X, y=y)
print(f"Model Perplexity: {model_perplexity}")

Model Perplexity: 6.201847939737958
