#### Import the libraries

In [1]:
import pandas as pd 
import numpy as np 
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re
import keras




### Define the source data

In [2]:
#Sample text data

text_data= [
    "hello how are you ",
    "hello how have you been",
    "hi there",
    "good morning",
    "good night",
    "have a nice day",
    "how is it going",
    "how have you been",
    "nice to meet you"
    "thank you"
]

#### Create the tokenizer

In [3]:
#prepare the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

In [4]:
total_words

21

In [5]:
tokenizer.word_index

{'how': 1,
 'you': 2,
 'have': 3,
 'hello': 4,
 'been': 5,
 'good': 6,
 'nice': 7,
 'are': 8,
 'hi': 9,
 'there': 10,
 'morning': 11,
 'night': 12,
 'a': 13,
 'day': 14,
 'is': 15,
 'it': 16,
 'going': 17,
 'to': 18,
 'meet': 19,
 'youthank': 20}

### Prepare the input sequences

In [6]:
#Create sequences of words
input_sequences =[]
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence =token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [7]:
input_sequences

[[4, 1],
 [4, 1, 8],
 [4, 1, 8, 2],
 [4, 1],
 [4, 1, 3],
 [4, 1, 3, 2],
 [4, 1, 3, 2, 5],
 [9, 10],
 [6, 11],
 [6, 12],
 [3, 13],
 [3, 13, 7],
 [3, 13, 7, 14],
 [1, 15],
 [1, 15, 16],
 [1, 15, 16, 17],
 [1, 3],
 [1, 3, 2],
 [1, 3, 2, 5],
 [7, 18],
 [7, 18, 19],
 [7, 18, 19, 20],
 [7, 18, 19, 20, 2]]

### padding of sequences

In [8]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences= pad_sequences(
    input_sequences, maxlen=max_sequence_len , padding ='pre')

In [9]:
input_sequences

array([[ 0,  0,  0,  4,  1],
       [ 0,  0,  4,  1,  8],
       [ 0,  4,  1,  8,  2],
       [ 0,  0,  0,  4,  1],
       [ 0,  0,  4,  1,  3],
       [ 0,  4,  1,  3,  2],
       [ 4,  1,  3,  2,  5],
       [ 0,  0,  0,  9, 10],
       [ 0,  0,  0,  6, 11],
       [ 0,  0,  0,  6, 12],
       [ 0,  0,  0,  3, 13],
       [ 0,  0,  3, 13,  7],
       [ 0,  3, 13,  7, 14],
       [ 0,  0,  0,  1, 15],
       [ 0,  0,  1, 15, 16],
       [ 0,  1, 15, 16, 17],
       [ 0,  0,  0,  1,  3],
       [ 0,  0,  1,  3,  2],
       [ 0,  1,  3,  2,  5],
       [ 0,  0,  0,  7, 18],
       [ 0,  0,  7, 18, 19],
       [ 0,  7, 18, 19, 20],
       [ 7, 18, 19, 20,  2]])

### Split into features and labelsm

In [11]:
input_sequences = np.array(input_sequences)
X,y = input_sequences[:,:-1], input_sequences[:,-1]
y = keras.utils.to_categorical(y, num_classes= total_words)

In [12]:
input_sequences[:,-1]

array([ 1,  8,  2,  1,  3,  2,  5, 10, 11, 12, 13,  7, 14, 15, 16, 17,  3,
        2,  5, 18, 19, 20,  2])

In [13]:
y

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0.,

#### Build the model

In [16]:
#build the RNN model
model =Sequential()
model.add(Embedding(total_words, 10, input_length= max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

In [17]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 10)             210       
                                                                 
 lstm (LSTM)                 (None, 4, 150)            96600     
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 21)                3171      
                                                                 
Total params: 280581 (1.07 MB)
Trainable params: 280581 (1.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#### Compile the model

In [18]:
model.compile(loss = 'categorical_crossentropy',
                optimizer='adam',
                metrics='accuracy')




### Train the model

In [19]:
model.fit(X,y, epochs=200 , batch_size=1)

Epoch 1/200


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 

<keras.src.callbacks.History at 0x1d3c8d09e90>

### Prediction on new data

In [20]:
new = 'nice'

In [23]:
def predict_new_word(model, tokenizer, new,max_sequence_len):


SyntaxError: incomplete input (1284590056.py, line 1)

In [21]:

next_word = predict_next_word(model, tokenizer, new,max_sequence_len)
print("Next word:",next_word)


NameError: name 'predict_next_word' is not defined