In [None]:
import numpy as np
from tensorflow.keras.layers import Dense, Embedding, Input, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# Simple training corpus for the next-word model.
# The misspelling "price" is corrected to "prince" so the corpus does not
# contain a spurious extra word; every other character (punctuation, spacing,
# line breaks) is left exactly as it was.
data="""
Once upon a time in a land far away.there lived a young prince,
The prince was brave ,strong and kind. One day , the prince set out
on an adventure to discover new lands and find hidden treasures."""


**Data Preparation**

The sample text is tokenized using Keras's `Tokenizer`. Input sequences are created with an increasing number of tokens so that the model can learn to predict the next word in each sequence. **tokenizer = Tokenizer():** Creates an instance of the Tokenizer, which will be used to convert the text into sequences of numbers. **tokenizer.fit_on_texts([data]):** Fits the tokenizer on the input text (data), creating a dictionary that maps each word to a unique integer.

**total_words = len(tokenizer.word_index) + 1:** `tokenizer.word_index` is a dictionary that maps words to indices.

We add 1 to the total number of words because word indices start from 1; index 0 is reserved (it is the value used for padding), so the model needs one extra output slot to account for it.

In [None]:
# Preprocess the text: fit a tokenizer on the corpus and size the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# One entry per distinct word in word_index, plus one extra slot
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)

Splits Text: The code splits data into sentences using ". " as a delimiter.
Tokenizes Sentences: Each sentence is converted into a list of tokens (integers) using tokenizer.texts_to_sequences().
Generates n-grams: For each sentence, it creates n-gram sequences (subsets of increasing length) from the tokens.
Stores Sequences: The n-gram sequences are stored in the input_sequences list for further processing.

In [None]:
# Convert the text into token sequences and expand each into its prefixes
input_sequences = []
for sentence in data.split(". "):
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    # Every prefix of length 2..len(encoded): the last token of each prefix
    # is later used as the label, the tokens before it as the input.
    input_sequences.extend(
        encoded[:end] for end in range(2, len(encoded) + 1)
    )

**max_sequence_len:** Finds the length of the longest sequence in input_sequences so that all sequences can be padded to the same length. **pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'):**
Pads shorter sequences with zeros at the beginning ("pre") so that all sequences are of the same length.

In [None]:
# Pad sequences for a consistent input size.
# max_sequence_len is the length of the longest n-gram sequence; shorter ones
# are left-padded ('pre') with zeros up to that length.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    # Fixed: the original passed the undefined name `ax_sequence_len` here,
    # which raised NameError on a fresh kernel.
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

**X = input_sequences[:, :-1]:** Takes all but the last word of each sequence as the input (features). This is what the model will use to predict the next word. **y = input_sequences[:, -1]:** The last word in the sequence is treated as the label (the word to be predicted). **y = np.eye(total_words)[y]:** Converts y into a one-hot encoded format, which is needed for classification.

If total_words is 100, the output will be a vector of size 100 where only one position (corresponding to the correct word) is 1, and the rest are 0.

In [None]:
# Create predictors and labels
# Features: every column except the last one of each padded sequence
X = input_sequences[:, :-1]
# Labels: the final token of each sequence, one-hot encoded by picking the
# matching rows of a total_words x total_words identity matrix
label_ids = input_sequences[:, -1]
y = np.eye(total_words)[label_ids]

**Model Architecture:**

An embedding layer to represent words in vectors. A simple RNN layer to learn the sequence of words. A dense layer with softmax  activation to predict the next word based on the input sequence.

In [None]:
# Build the RNN model.
# The input shape is declared with an explicit Input layer instead of the
# deprecated Embedding(input_length=...) argument, so the model is still
# built (and summary() shows shapes) before training.
model = Sequential([
    # Each sample is max_sequence_len-1 token ids (the n-gram minus its label)
    Input(shape=(max_sequence_len - 1,)),
    # Map each token id to a 10-dimensional vector
    Embedding(total_words, 10),
    # Only the final hidden state is needed to predict the next word
    SimpleRNN(150, return_sequences=False),
    # One probability per vocabulary index
    Dense(total_words, activation='softmax'),
])



In [None]:
# Configure training: cross-entropy over the one-hot labels, Adam optimizer,
# and accuracy reported per epoch; then print the layer summary.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the model for 100 passes over the n-gram sequences, logging each epoch
model.fit(
    X,
    y,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.0289 - loss: 3.5297
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1447 - loss: 3.4681
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2211 - loss: 3.4278
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2315 - loss: 3.3799
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.1736 - loss: 3.3335
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1921 - loss: 3.2744
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2025 - loss: 3.2082
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.1817 - loss: 3.1512
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7d48858f8190>

In [None]:
#function to predict next word
def predict_next_word(seed_text,num_words):
  """Extend ``seed_text`` with ``num_words`` predicted words.

  On each step the current text is tokenized with the notebook-level
  ``tokenizer``, left-padded to ``max_sequence_len-1`` tokens, and passed to
  the notebook-level ``model``; the highest-scoring index is mapped back to a
  word and appended to the text.

  Args:
    seed_text: Starting text.
    num_words: Number of words to append.

  Returns:
    ``seed_text`` followed by the predicted words, separated by single
    spaces. An index with no word (such as the padding index 0) appends an
    empty word.
  """
  for _ in range(num_words):
    token_list=tokenizer.texts_to_sequences([seed_text])[0]
    padded=pad_sequences([token_list],maxlen=max_sequence_len-1,padding='pre')
    # verbose=0 avoids printing one progress bar per generated word
    probabilities=model.predict(padded,verbose=0)
    # argmax over a batch of one yields a 1-element array; unwrap it to a
    # plain int so it can be used as a dictionary key
    predicted_index=int(np.argmax(probabilities,axis=-1)[0])
    # index_word is the tokenizer's index -> word mapping, which replaces the
    # linear scan over word_index.items()
    output_word=tokenizer.index_word.get(predicted_index,"")
    seed_text+=" "+output_word
  return seed_text

In [None]:
# Test the model: generate a short continuation from a seed phrase
seed_text = "The Prince"
next_words = 5
generated_text = predict_next_word(seed_text, next_words)
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
The Prince set prince on an adventure
