In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#sample traning text
data="""
Once upon a time in a land far away, there lived a young prince.
The prince was brave , strong , and kind . One day, the princes set out
on an adventure to discover new lands and find hidden treasures.
"""



Data Preparation:
The sample text is tokenized using Keras's Tokenizer. Input sequences are created with an increasing number of tokens to predict the next word in each sequence, tokenizer Tokenizer(): Creates an instance of the Tokenizer, which will be used to convert the text into sequences of numbers, tokenizer.fit_on_texts([data]): Fits the tokenizer on the input text (data), creating a dictionary that maps each word to a unique integer, total_words = len(tokenizer.word_index) + 1: tokenizer.word_index is a dictionary that maps words to indices. We add 1 to the total number of words because indices typically start from 1, and we need to account for a padding token (if used).

In [5]:
#preprocess the Text
tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
total_words=len(tokenizer.word_index) + 1

tokenizer.texts_to_sequences([line]) takes the current sentence (line) and converts it into a list of tokens (or integers). This tokenization step assumes you have a tokenizer object (likely a Tokenizer from Keras or a similar library) that maps words to unique integers. Since texts_to_sequences returns a list of lists (because it processes batches of sentences), the [0] index is used to extract the token list for the current sentence. This for loop iterates over the tokenized sentence, starting from the second token (i=1). In each iteration, it creates an n-gram sequence by taking the first i+1 tokens from the token_list using slicing:token_list[i+1]. This gives a Sequence of increasing length. Each n_gram_sequence is added to the input_sequence list

In [6]:
#convert the text into sequence of tokens
input_sequence=[]

for line in data.split("."):
  token_list = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(token_list)):
    n_gram_sequence=token_list[: i+1]
    input_sequence.append(n_gram_sequence)


max sequence len: Finds the length of the longest sequence in input sequences so that all sequences can be padded to the same length, pad sequences (input sequences, maxlen=max_sequence_len, padding='pre") Pads shorter sequences with zeros at the beginning (pre) so that all sequences are of the same length.



In [15]:
#Pad sequences for consistent input size
max_sequence_len = max([len(x) for x in input_sequence])
input_sequence= np.array(pad_sequences(input_sequence, maxlen=max_sequence_len, padding='pre'))


X = input_sequences[::1]: Takes all but the last word of each sequence as the input (features). This is what the model will use to predict the next word. y= input_sequences[:-1]: The last word in the sequence is treated as the label (the word to be predicted). y = np.eye(total_words)[y]: Converts y into a one-hot encoded format, which is needed for classification If total worde is 100 the output will ba a vector of size 100 where only one position (corresepondingg  to the correct word) is 1 and the rest are 0

In [16]:
#create predictors and labels
X, y=input_sequence[:, :-1] , input_sequence[:,-1]
y= np.eye(total_words)[y] #one hot encode the labels

#model Architecture
An embedding layers to represent words in vectors. A simple RNN layer to learn the sequence of words . A dense layers with softmax activation to predict the next word based on the input sequence

In [18]:
#bulid the RNN model
model=Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(SimpleRNN(150, return_sequences=False))
model.add(Dense(total_words, activation='softmax'))



In [19]:
#compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [20]:
#train the model
model.fit(X,y, epochs=100, verbose=1)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.0000e+00 - loss: 3.5237
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2548 - loss: 3.4810 
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2548 - loss: 3.4541 
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2045 - loss: 3.4292 
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1768 - loss: 3.4008 
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0884 - loss: 3.3980 
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1074 - loss: 3.3754 
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1664 - loss: 3.3350 
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7ce9ff7e0af0>

predict_next_word: Function to predict the next num_words given some seed_text.
tokenizer.texts_to_sequences([seed_text])[0]: Converts the seed text into a sequence of integers. pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre"): Pads the token list to match the input length required by the model, model.predict(token_list): The model predicts probabilities for each word in the vocabulary. np.argmax(predicted): Retrieves the index of the word with the highest probability, for word, index in tokenizer.word_index.items():: Finds the word corresponding to the predicted index. seed_text +=**+ output word: Appends the predicted word to the seed text. return seed_text: Returns the seed text with predicted words appended. Testing the Model: python Copy code seed_text = "The prince" next_words = 5 print(predict_next_word(seed_text, next_words)) seed_text = "The prince": The seed text for which you want to generate the next few words. next words = 5: The number of words you want to predict. print(predict_next_word(seed_text, next_words)): Prints the result

In [21]:
#function to predict the next words
def predict_next_word(seed_text,num_words):
  for _ in range(num_words):
    token_list= tokenizer.texts_to_sequences([seed_text])[0]
    token_list=pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted=np.argmax(model.predict(token_list), axis=-1)
    output_word=""
    for word, index in tokenizer.word_index.items():
      if index==predicted:
        output_word=word
        break
    seed_text+=" "+ output_word

  return seed_text

In [25]:
seed_text="The Prince"
next_words=5
print(predict_next_word(seed_text,next_words))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
The Prince was brave strong and kind
