In [1]:
!pip install tensorflow



In [2]:
#Importing Librearies

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [3]:
#Load the dataset
with open('blue_castle.txt','r') as file:
    data=file.read().lower()

In [4]:
# Tokenize the text-creating indexes for words
tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
total_words=len(tokenizer.word_index)+1
total_words

8727

In [5]:
tokenizer.word_index

{'the': 1,
 'to': 2,
 'and': 3,
 'of': 4,
 'a': 5,
 'she': 6,
 '”': 7,
 'her': 8,
 'was': 9,
 'in': 10,
 'had': 11,
 'it': 12,
 'valancy': 13,
 'that': 14,
 'you': 15,
 'i': 16,
 'he': 17,
 'with': 18,
 'not': 19,
 'for': 20,
 'but': 21,
 'said': 22,
 'be': 23,
 'at': 24,
 'on': 25,
 'his': 26,
 'as': 27,
 'all': 28,
 'have': 29,
 'would': 30,
 'barney': 31,
 'if': 32,
 'one': 33,
 'so': 34,
 'there': 35,
 'been': 36,
 'they': 37,
 'up': 38,
 'never': 39,
 'him': 40,
 'little': 41,
 'when': 42,
 'them': 43,
 'or': 44,
 'me': 45,
 'is': 46,
 'uncle': 47,
 'like': 48,
 'were': 49,
 'could': 50,
 'this': 51,
 'cousin': 52,
 'out': 53,
 'from': 54,
 'what': 55,
 'did': 56,
 'old': 57,
 'no': 58,
 'by': 59,
 'any': 60,
 'an': 61,
 'over': 62,
 'about': 63,
 'thought': 64,
 '“i': 65,
 'do': 66,
 'my': 67,
 'mrs': 68,
 'always': 69,
 'back': 70,
 'know': 71,
 'who': 72,
 'frederick': 73,
 'went': 74,
 'must': 75,
 'go': 76,
 'your': 77,
 'dr': 78,
 'only': 79,
 'are': 80,
 'come': 81,
 'now':

In [6]:
# Create inpput sequences
input_sequences=[]
for line in data.split('\n'):
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [7]:
input_sequences

[[3764, 110],
 [3764, 110, 97],
 [3764, 110, 97, 608],
 [3764, 110, 97, 608, 4],
 [3764, 110, 97, 608, 4, 1],
 [3764, 110, 97, 608, 4, 1, 94],
 [3764, 110, 97, 608, 4, 1, 94, 143],
 [3764, 110, 97, 608, 4, 1, 94, 143, 59],
 [3764, 110, 97, 608, 4, 1, 94, 143, 59, 2456],
 [3764, 110, 97, 608, 4, 1, 94, 143, 59, 2456, 2457],
 [3764, 110, 97, 608, 4, 1, 94, 143, 59, 2456, 2457, 1884],
 [51, 608],
 [51, 608, 46],
 [51, 608, 46, 20],
 [51, 608, 46, 20, 1],
 [51, 608, 46, 20, 1, 250],
 [51, 608, 46, 20, 1, 250, 4],
 [51, 608, 46, 20, 1, 250, 4, 1293],
 [51, 608, 46, 20, 1, 250, 4, 1293, 880],
 [51, 608, 46, 20, 1, 250, 4, 1293, 880, 10],
 [51, 608, 46, 20, 1, 250, 4, 1293, 880, 10, 1],
 [51, 608, 46, 20, 1, 250, 4, 1293, 880, 10, 1, 534],
 [51, 608, 46, 20, 1, 250, 4, 1293, 880, 10, 1, 534, 446],
 [51, 608, 46, 20, 1, 250, 4, 1293, 880, 10, 1, 534, 446, 3],
 [275, 150],
 [275, 150, 2458],
 [275, 150, 2458, 4],
 [275, 150, 2458, 4, 1],
 [275, 150, 2458, 4, 1, 212],
 [275, 150, 2458, 4, 1, 212

In [8]:
max([len(x) for x in input_sequences])

18

In [9]:
# Pad Sequences
max_sequence_len=max([len(x) for x in input_sequences])
max_sequence_len

18

In [10]:
input_sequences=np.array(pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0, 3764,  110],
       [   0,    0,    0, ..., 3764,  110,   97],
       [   0,    0,    0, ...,  110,   97,  608],
       ...,
       [   0,    0,    0, ...,    2,  401,   63],
       [   0,    0,    0, ...,  401,   63,  213],
       [   0,    0,    0, ...,   63,  213, 1081]], dtype=int32)

In [11]:
# Create predicitors and label
import tensorflow as tf
x,y=input_sequences[:,:-1],input_sequences[:,-1]

In [12]:
x

array([[   0,    0,    0, ...,    0,    0, 3764],
       [   0,    0,    0, ...,    0, 3764,  110],
       [   0,    0,    0, ..., 3764,  110,   97],
       ...,
       [   0,    0,    0, ..., 8726,    2,  401],
       [   0,    0,    0, ...,    2,  401,   63],
       [   0,    0,    0, ...,  401,   63,  213]], dtype=int32)

In [13]:
y

array([ 110,   97,  608, ...,   63,  213, 1081], dtype=int32)

In [14]:
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))


In [15]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [16]:
#Create LSTM Model
lstm_model = Sequential()
lstm_model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
lstm_model.add(LSTM(150))
lstm_model.add(Dense(total_words, activation='softmax'))
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 100)           872700    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 8727)              1317777   
                                                                 
Total params: 2341077 (8.93 MB)
Trainable params: 2341077 (8.93 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(x,y, epochs=50,verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7992a00f2d70>

In [18]:
# Save the model
lstm_model.save("next_word_lstm.h5")

  saving_api.save_model(


In [19]:
# Save the tokenizer
import pickle
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
#  Prediction next words based on input text and next words

seed_text = "Produced by"
next_words = 3

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(lstm_model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

print(seed_text)

Produced by laura natal rodrigues


In [21]:
#Create GRU Model

gru_model = Sequential()
gru_model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
gru_model.add(GRU(150))
gru_model.add(Dense(total_words, activation='softmax'))
gru_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 17, 100)           872700    
                                                                 
 gru (GRU)                   (None, 150)               113400    
                                                                 
 dense_1 (Dense)             (None, 8727)              1317777   
                                                                 
Total params: 2303877 (8.79 MB)
Trainable params: 2303877 (8.79 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
gru_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
gru_model.fit(x,y, epochs=50,verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7991a0374760>

In [23]:
## Save the model
gru_model.save("next_word_gru.h5")