In [1]:
import tensorflow as tf
import numpy

from keras.models import Sequential
from keras.layers import Dense                #final layers to predict next word
from keras.layers import Dropout              #to prevent overfit
from keras.layers import LSTM                 #type of RNN to keep long term memory
from keras.callbacks import ModelCheckpoint   #to save model
from keras.utils import to_categorical     #to utility function

In [2]:
filename = "/content/drive/MyDrive/Practical Materials - Lab 6/data.txt"
raw_text = open(filename,'r', encoding='utf-8').read()
raw_text = raw_text.lower()

In [3]:
raw_text[0:100]

"project gutenberg's alice's adventures in wonderland, by lewis carroll\n\nthis ebook is for the use of"

In [4]:
#create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c,i) for i,c in enumerate(chars))

In [5]:
print(char_to_int)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, "'": 7, '(': 8, ')': 9, '*': 10, ',': 11, '-': 12, '.': 13, '/': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, ';': 26, '?': 27, '@': 28, '[': 29, ']': 30, '_': 31, 'a': 32, 'b': 33, 'c': 34, 'd': 35, 'e': 36, 'f': 37, 'g': 38, 'h': 39, 'i': 40, 'j': 41, 'k': 42, 'l': 43, 'm': 44, 'n': 45, 'o': 46, 'p': 47, 'q': 48, 'r': 49, 's': 50, 't': 51, 'u': 52, 'v': 53, 'w': 54, 'x': 55, 'y': 56, 'z': 57}


In [6]:
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  163780
Total Vocab:  58


In [7]:
#prepare the dataset of input to output pairs encode as integers
seq_length = 100 #can be changed
dataX = []
dataY = []

for i in range(0, n_chars - seq_length, 1):
  seq_in = raw_text[i:i + seq_length]
  seq_out = raw_text[i + seq_length]
  dataX.append([char_to_int[char] for char in seq_in])
  dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  163680


In [8]:
#checking dataX and dataY
print(dataX[163679])
print(dataY[163679])

[1, 39, 36, 43, 47, 1, 47, 49, 46, 35, 52, 34, 36, 1, 46, 52, 49, 1, 45, 36, 54, 1, 36, 33, 46, 46, 42, 50, 11, 1, 32, 45, 35, 1, 39, 46, 54, 1, 51, 46, 0, 50, 52, 33, 50, 34, 49, 40, 33, 36, 1, 51, 46, 1, 46, 52, 49, 1, 36, 44, 32, 40, 43, 1, 45, 36, 54, 50, 43, 36, 51, 51, 36, 49, 1, 51, 46, 1, 39, 36, 32, 49, 1, 32, 33, 46, 52, 51, 1, 45, 36, 54, 1, 36, 33, 46, 46, 42, 50, 13]
0


In [9]:
#reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
#normalize
X = X / float(n_vocab)
#one hot encode the output variable
y = to_categorical(dataY)

In [10]:
#define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))  # (timesteps, features)
model.add(Dropout(0.2))
model.add(Dense(n_vocab, activation='softmax'))  # n_vocab for vocabulary size
model.compile(loss='categorical_crossentropy', optimizer='adam')

#define the checkpoint to save only weights
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.weights.h5"  # .weights.h5 extension for weights
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min')
callbacks_list = [checkpoint]


  super().__init__(**kwargs)


In [11]:
#change the hyperparameter values and train the model
epochs = 10
batch_size = 128

In [12]:
model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

Epoch 1/10
[1m1278/1279[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 3.0685
Epoch 1: loss improved from inf to 2.97283, saving model to weights-improvement-01-2.9728.weights.h5
[1m1279/1279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - loss: 3.0684
Epoch 2/10
[1m1276/1279[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 2.8259
Epoch 2: loss improved from 2.97283 to 2.79729, saving model to weights-improvement-02-2.7973.weights.h5
[1m1279/1279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 13ms/step - loss: 2.8258
Epoch 3/10
[1m1277/1279[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 2.7307
Epoch 3: loss improved from 2.79729 to 2.71855, saving model to weights-improvement-03-2.7185.weights.h5
[1m1279/1279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 13ms/step - loss: 2.7307
Epoch 4/10
[1m1279/1279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss

<keras.src.callbacks.history.History at 0x79a1b6c97820>

Generating text with trained LSTM model

In [19]:
#Load the network weights
import os
filename =  "/content/drive/MyDrive/Practical Materials - Lab 6"
model.save_weights(os.path.join(filename,"weight-improvement-10-2.3517.weights.h5"))

In [20]:

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [21]:
int_to_char = dict((i,c) for i,c in enumerate(chars))

In [22]:
#generate a random seed
start = numpy.random.randint(0, len(dataX)-1)
print("Seed Start Index:", start)
pattern = dataX[start]
print("Seed Pattern:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

Seed Start Index: 92595
Seed Pattern:
" ing to find that the hedgehog had unrolled
itself, and was in the act of crawling away: besides all  "


In [23]:
pattern

[40,
 45,
 38,
 1,
 51,
 46,
 1,
 37,
 40,
 45,
 35,
 1,
 51,
 39,
 32,
 51,
 1,
 51,
 39,
 36,
 1,
 39,
 36,
 35,
 38,
 36,
 39,
 46,
 38,
 1,
 39,
 32,
 35,
 1,
 52,
 45,
 49,
 46,
 43,
 43,
 36,
 35,
 0,
 40,
 51,
 50,
 36,
 43,
 37,
 11,
 1,
 32,
 45,
 35,
 1,
 54,
 32,
 50,
 1,
 40,
 45,
 1,
 51,
 39,
 36,
 1,
 32,
 34,
 51,
 1,
 46,
 37,
 1,
 34,
 49,
 32,
 54,
 43,
 40,
 45,
 38,
 1,
 32,
 54,
 32,
 56,
 25,
 1,
 33,
 36,
 50,
 40,
 35,
 36,
 50,
 1,
 32,
 43,
 43,
 1]

In [25]:
#generate characters
length = 100
final = []

for i in range(length):
  # reshaping the seed sequence before passing it into the LSTM model
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  # normalizing the integer value
  x = x / float(n_vocab)
  # making prediction
  prediction = model.predict(x, verbose=0)
  #get the predicted value with maximum probability
  index = numpy.argmax(prediction)
  #get the predicted integer to char
  result = int_to_char[index]
  final.append(result)
  # adding the predicted character to the sequence
  pattern.append(index)
  # removing the first character from the sequence
  pattern = pattern[1:len(pattern)]
print(final)

['t', 'h', 'e', ' ', 'w', 'o', 'i', 'l', 'e', ' ', 't', 'h', 'e', ' ', 'w', 'a', 'i', ' ', 'i', 'o', 't', ' ', 't', 'o', ' ', 't', 'h', 'e', ' ', 't', 'o', 'i', 'e', 'e', ' ', 't', 'h', ' ', 't', 'h', 'e', ' ', 'c', 'a', 'r', ' ', 'h', 'f', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'u', 'r', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'u', 'r', 'e', ' ', 't', 'h', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'u', 'r', 'e', ' ', 't', 'h', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'u', 'r', 'd', ' ', 't', 'h', ' ']
