Import libraries

In [15]:
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Input
from keras.layers import LSTM
from keras.models import Sequential
from tensorflow.keras.utils import to_categorical

Load ascii text and convert to lowercase

In [4]:
# Read the whole corpus into memory and lowercase it so the vocabulary stays small.
# NOTE(review): this is a machine-specific absolute path; point it at your local copy of data.txt.
filename = "C:\\Users\\rmct2\\OneDrive - Sri Lanka Institute of Information Technology\\Desktop\\SLIIT\\MLOM lab\\data.txt"
# The context manager closes the file handle as soon as the text has been read.
with open(filename, 'r', encoding='utf-8') as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

In [5]:
# Preview the first 100 characters of the lowercased text
raw_text[:100]

"project gutenberg's alice's adventures in wonderland, by lewis carroll\n\nthis ebook is for the use of"

We must prepare the data for modeling by the neural network. We cannot model the characters directly. Instead, we must convert each character to an integer value.

We can do this easily by first creating a set of all the distinct characters in the book, then creating a map of each character to a unique integer

In [35]:
# Build the lookup tables between each distinct character and a unique integer
chars = sorted(set(raw_text))
print(chars)
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_char = {idx: ch for idx, ch in enumerate(chars)}

['\n', ' ', '!', '"', '#', '$', '%', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [9]:
# Show the character -> integer mapping built from the sorted distinct characters
print(char_to_int)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, "'": 7, '(': 8, ')': 9, '*': 10, ',': 11, '-': 12, '.': 13, '/': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, ';': 26, '?': 27, '@': 28, '[': 29, ']': 30, '_': 31, 'a': 32, 'b': 33, 'c': 34, 'd': 35, 'e': 36, 'f': 37, 'g': 38, 'h': 39, 'i': 40, 'j': 41, 'k': 42, 'l': 43, 'm': 44, 'n': 45, 'o': 46, 'p': 47, 'q': 48, 'r': 49, 's': 50, 't': 51, 'u': 52, 'v': 53, 'w': 54, 'x': 55, 'y': 56, 'z': 57}


Getting the details of the dataset

The text contains roughly 164,000 characters (163,780), and once converted to lowercase there are only 58 distinct characters in the vocabulary

In [7]:
# Corpus length and vocabulary size; both are reused when shaping the training data
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab(Unique characters): ", n_vocab)

Total Characters:  163780
Total Vocab(Unique characters):  58


Prepare the dataset of input to output pairs encoded as integers

In [12]:
# Slide a fixed-size window over the text: every window is one input pattern and
# the character immediately after it is the target, both encoded as integers
seq_length = 15  # window length; can be changed
window_starts = range(n_chars - seq_length)
dataX = [
    [char_to_int[ch] for ch in raw_text[start:start + seq_length]]
    for start in window_starts
]
dataY = [char_to_int[raw_text[start + seq_length]] for start in window_starts]
n_patterns = len(dataY)
print("Total Patterns: ", n_patterns)

Total Patterns:  163765


Transform the list of input sequences into the form [samples, time steps, features] that is expected by an LSTM
network, and rescale the integers to the range [0,1] to make the patterns easier to learn by the LSTM network,
whose gates use the sigmoid activation function by default.

In [13]:
# Reshape X to [samples, time steps, features]: one feature per time step.
# numpy is imported as `np`; the bare name `numpy` is undefined on a fresh kernel.
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# Normalize: dividing the integer codes by the vocabulary size puts every value in [0, 1)
X = X / float(n_vocab)
print(X)

[[[0.81034483]
  [0.84482759]
  [0.79310345]
  ...
  [0.77586207]
  [0.56896552]
  [0.62068966]]

 [[0.84482759]
  [0.79310345]
  [0.70689655]
  ...
  [0.56896552]
  [0.62068966]
  [0.84482759]]

 [[0.79310345]
  [0.70689655]
  [0.62068966]
  ...
  [0.62068966]
  [0.84482759]
  [0.65517241]]

 ...

 [[0.55172414]
  [0.56896552]
  [0.79310345]
  ...
  [0.79310345]
  [0.79310345]
  [0.72413793]]

 [[0.56896552]
  [0.79310345]
  [0.89655172]
  ...
  [0.79310345]
  [0.72413793]
  [0.86206897]]

 [[0.79310345]
  [0.89655172]
  [0.87931034]
  ...
  [0.72413793]
  [0.86206897]
  [0.22413793]]]


**Note** - We use dropout so that the model generalizes instead of perfectly overfitting the training dataset

Convert the output values (single characters converted to integers) into a one hot encoding.

In [18]:
# One-hot encode the target characters. num_classes is pinned to the vocabulary
# size so y always has n_vocab columns, rather than a width inferred from the
# largest integer that happens to occur in dataY.
y = to_categorical(dataY, num_classes=n_vocab)
#print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


Define the LSTM model

In [24]:
# Define the LSTM model: a single 256-unit LSTM layer, dropout, and a softmax
# output with one unit per column of the one-hot encoded targets
model = Sequential()
# Declare the per-sample input shape (time steps, features) with an explicit Input
# layer; passing input_shape to the first layer makes Keras emit a UserWarning
model.add(Input(shape=(X.shape[1], X.shape[2])))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# Checkpoint callback: save the model each time the training loss reaches a new minimum.
# The epoch number and loss are formatted into the file name.
filepath = "weights-improvement-{epoch:02d} - {loss:.2f}.keras"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callback_list = [checkpoint]

  super().__init__(**kwargs)


Fitting model to data

In [25]:
#Chang the hyperparameter values and train model
epochs = 10
batch_size = 128

In [26]:
# Train the network; the checkpoint callback saves the model whenever the loss improves
model.fit(
    x=X,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callback_list,
)

Epoch 1/10
[1m1279/1280[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - loss: 3.0795
Epoch 1: loss improved from inf to 2.98125, saving model to weights-improvement-01 - 2.98.keras
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 27ms/step - loss: 3.0793
Epoch 2/10
[1m1279/1280[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - loss: 2.8229
Epoch 2: loss improved from 2.98125 to 2.79872, saving model to weights-improvement-02 - 2.80.keras
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 27ms/step - loss: 2.8228
Epoch 3/10
[1m1278/1280[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - loss: 2.7325
Epoch 3: loss improved from 2.79872 to 2.71531, saving model to weights-improvement-03 - 2.72.keras
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 27ms/step - loss: 2.7324
Epoch 4/10
[1m1279/1280[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - loss: 2.6681
Epoch 

<keras.src.callbacks.history.History at 0x270186e3580>

Generate Text with the trained LSTM model

In [36]:
# Load the network weights from a checkpoint written during training.
# NOTE(review): the file name embeds the epoch and loss of one particular run;
# update it to match a checkpoint that exists after your own training.
filename = "weights-improvement-10 - 2.40.keras"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy',optimizer='adam')

# Pick a random pattern from the training data to use as the seed.
print(len(dataX))
# numpy is imported as `np`. randint's upper bound is exclusive, so passing
# len(dataX) makes every index, including the last pattern, selectable.
start = np.random.randint(0, len(dataX))
print(start)
pattern = dataX[start] #dataX contains list of patterns
print("Seed: ")
# Decode the integer pattern back into characters for display
print("\"",''.join([int_to_char[value] for value in pattern]),"\"")

163765
144909
Seed: 
" a pleasure in a "
