In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [14]:
# Sample training text: a tiny three-sentence corpus. Later cells fit the
# tokenizer on it and split it on '.' to build next-word training sequences.
data='''
once upon a time in a land far away, there lived a young prince.
The prince was brave, Strong and kind.One day, the prince set out
on an adventure to discover new lands and find hidden treasures.

'''

Data Preparation

The sample text is tokenized using Keras's `Tokenizer`. Input sequences are then created with an increasing number of tokens, so that the model learns to predict the next word of each sequence. `tokenizer = Tokenizer()`: creates an instance of the Tokenizer, which will be used to convert the text into sequences of numbers. `tokenizer.fit_on_texts([data])`: fits the tokenizer on the input text (`data`), creating a dictionary that maps each word to a unique integer. `total_words = len(tokenizer.word_index) + 1`: `tokenizer.word_index` is the dictionary that maps words to indices.

We add 1 to the total number of words because word indices start from 1, and we need to account for index 0, which is used as the padding token (if padding is used).

Text preprocessing

In [15]:
# Build the vocabulary: fit a Keras Tokenizer on the corpus so that every
# distinct word is mapped to an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# One extra slot on top of the number of distinct words, so that index 0
# (used for padding later on) also fits into the vocabulary size.
total_words = 1 + len(tokenizer.word_index)
print(total_words)

33


`tokenizer.texts_to_sequences([line])` takes the current sentence (`line`) and converts it into a list of tokens (integers). This tokenization step uses the `tokenizer` object fitted above (a Keras `Tokenizer`), which maps words to unique integers. Since `texts_to_sequences` returns a list of lists (because it processes batches of sentences), the `[0]` index is used to extract the token list for the current sentence. The inner `for` loop iterates over the tokenized sentence, starting from the second token (`i = 1`). In each iteration, it creates an n-gram sequence by taking the first `i+1` tokens from `token_list` using slicing: `token_list[:i+1]`. This gives subsequences of increasing length.
Each `n_gram_sequence` is added to the `input_seqence` list.

Convert the text into input sequences of tokens

In [16]:
# Turn every sentence into its growing prefixes: for tokens [t0, t1, t2, ...]
# emit [t0, t1], [t0, t1, t2], ... so the last token of each prefix can serve
# as the "next word" label for the tokens before it.
input_seqence=[]
for sentence_tokens in tokenizer.texts_to_sequences(data.split('.')):
    # Prefixes need at least two tokens (one input word plus one label word).
    input_seqence.extend(
        sentence_tokens[:end] for end in range(2,len(sentence_tokens)+1)
    )

`max_seqence_len`: finds the length of the longest input sequence so that all sequences can be padded to the same length. `pad_sequences(input_seqence, maxlen=max_seqence_len, padding='pre')`: pads the shorter sequences with zeros at the beginning (`'pre'`) so that all sequences have the same length.

In [18]:
# Left-pad every n-gram with zeros up to the length of the longest one, so the
# whole training set becomes one rectangular integer array.
max_seqence_len=max(map(len,input_seqence))
input_seqence=np.array(
    pad_sequences(input_seqence,padding='pre',maxlen=max_seqence_len)
)

`X = input_seqence[:, :-1]`: takes all but the last word of each sequence as the input (features). This is what the model will use to predict the next word.
`y = input_seqence[:, -1]`: the last word in each sequence is treated as the label (the word to be predicted).
`y = np.eye(total_words)[y]`: converts `y` into a one-hot encoded format, which is needed for classification. If `total_words` is 100, the output for each label will be a vector of size 100 where only one position (corresponding to the correct word) is 1 and the rest are zero.

In [20]:
# Split each padded sequence into predictors and label:
#   X -> every token except the last one (the model input)
#   y -> the last token, one-hot encoded by picking rows of an identity matrix
X=input_seqence[:,:-1]
y=np.eye(total_words)[input_seqence[:,-1]]

Model Architecture:

An embedding layer to represent words in vectors. A simple RNN layer to learn the sequence of words. A dense layer with softmax activation to predict the next word based on the input sequence.

In [21]:
#building the model
# Architecture: Embedding (10-dim word vectors) -> SimpleRNN (500 units, only
# the final state is returned) -> Dense softmax over the whole vocabulary.
model=Sequential()
# NOTE: the `input_length` argument of Embedding is deprecated in Keras 3 (it
# only triggers a warning there), so the sequence length is supplied through
# model.build() below instead.
model.add(Embedding(total_words,10))
model.add(SimpleRNN(500,return_sequences=False))
model.add(Dense(total_words,activation='softmax'))
# Build explicitly with (batch, sequence_length) so the weights are created
# up front and model.summary() can report output shapes and parameter counts.
model.build(input_shape=(None,max_seqence_len-1))



In [23]:
# Configure training: categorical cross-entropy pairs with the one-hot encoded
# labels, Adam is the optimizer, and accuracy is reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [24]:
# Train on the n-gram prefixes (X) against their one-hot next-word labels (y)
# for 100 epochs, logging progress for every epoch.
model.fit(x=X,y=y,epochs=100,verbose=1)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.0000e+00 - loss: 3.5020
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.3137 - loss: 3.3793 
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.2548 - loss: 3.2655 
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.1664 - loss: 3.1132
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.1473 - loss: 2.9376 
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.3241 - loss: 2.7432
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.2842 - loss: 2.7988 
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.2842 - loss: 2.5396 
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d7e4afd3b80>

`predict_next_word`: function to predict the next `num_words` words given some seed text.

`tokenizer.texts_to_sequences([seed_text])[0]`: converts the seed text into a sequence of integers.

`pad_sequences([token_list], maxlen=max_seqence_len-1, padding='pre')`: pads the token list to match the input length required by the model. `model.predict(token_list)`: the model predicts probabilities for each word in the vocabulary. `np.argmax(predicted)`: retrieves the index of the word with the highest probability. `for word, index in tokenizer.word_index.items()`: finds the word corresponding to the predicted index. `seed_text += " " + output_word`: appends the predicted word to the seed text. `print(seed_text)`: prints the seed text with the predicted words appended. Testing the model: `seed_text = "The prince"`, `next_words = 5`, `predict_next_word(next_words, seed_text)`. `seed_text = "The prince"`: the seed text for which you want to generate the next few words. `next_words = 5`: the number of words you want to predict. `predict_next_word(next_words, seed_text)`: generates the words and prints the result.

In [36]:
#function to predict next word
def predict_next_word(num_words, seed_text): # Add seed_text as an argument
    """Greedily extend ``seed_text`` by ``num_words`` predicted words and print it.

    On every step the current text is tokenized, left-padded (or truncated)
    to the model's input length, and the word with the highest predicted
    probability is appended.

    Args:
        num_words: Number of words to generate.
        seed_text: Starting text; the predicted words are appended to it.

    Returns:
        None. The extended text is printed.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``max_seqence_len`` defined in earlier cells.
    """
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seqence_len-1, padding='pre')
        # verbose=0 keeps Keras from printing a progress bar for every word.
        probabilities = model.predict(token_list, verbose=0)
        # The batch holds a single sequence: reduce its argmax to a plain int
        # instead of comparing a one-element array against each index.
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # Direct reverse lookup (index -> word). Falls back to "" when the
        # index has no word, e.g. the padding index 0.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word
    print(seed_text)

# Demo: extend a seed phrase taken from the training corpus by two words.
next_words=2
seed_text="once upon a time"
predict_next_word(num_words=next_words, seed_text=seed_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
once upon a time in a
