In [10]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

In [11]:
# Tiny in-memory corpus that the tokenizer is fitted on and the model is trained on
text_data = (
    "Hello, how are you? I am doing well. Thank you for asking."
)

In [12]:
# Fit a word-level tokenizer on the single-document corpus
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts([text_data])

# Vocabulary size is one more than the number of distinct words
word_to_id = tokenizer.word_index
total_words = 1 + len(word_to_id)
print(word_to_id)

{'you': 1, 'hello': 2, 'how': 3, 'are': 4, 'i': 5, 'am': 6, 'doing': 7, 'well': 8, 'thank': 9, 'for': 10, 'asking': 11}


In [13]:
# Build the training sequences: for each '.'-delimited segment of the corpus,
# keep every prefix of its token ids that is at least two tokens long
input_sequences = []
for segment in text_data.split('.'):
    token_list = tokenizer.texts_to_sequences([segment])[0]
    print('token_list', token_list)
    # Prefix ending at position `end` (exclusive); the shortest prefix has 2 tokens
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )
    print('input_sequences', input_sequences)
print('input_sequences', input_sequences)

token_list [2, 3, 4, 1, 5, 6, 7, 8]
input_sequences [[2, 3], [2, 3, 4], [2, 3, 4, 1], [2, 3, 4, 1, 5], [2, 3, 4, 1, 5, 6], [2, 3, 4, 1, 5, 6, 7], [2, 3, 4, 1, 5, 6, 7, 8]]
token_list [9, 1, 10, 11]
input_sequences [[2, 3], [2, 3, 4], [2, 3, 4, 1], [2, 3, 4, 1, 5], [2, 3, 4, 1, 5, 6], [2, 3, 4, 1, 5, 6, 7], [2, 3, 4, 1, 5, 6, 7, 8], [9, 1], [9, 1, 10], [9, 1, 10, 11]]
token_list []
input_sequences [[2, 3], [2, 3, 4], [2, 3, 4, 1], [2, 3, 4, 1, 5], [2, 3, 4, 1, 5, 6], [2, 3, 4, 1, 5, 6, 7], [2, 3, 4, 1, 5, 6, 7, 8], [9, 1], [9, 1, 10], [9, 1, 10, 11]]
input_sequences [[2, 3], [2, 3, 4], [2, 3, 4, 1], [2, 3, 4, 1, 5], [2, 3, 4, 1, 5, 6], [2, 3, 4, 1, 5, 6, 7], [2, 3, 4, 1, 5, 6, 7, 8], [9, 1], [9, 1, 10], [9, 1, 10, 11]]


In [14]:

# Left-pad every sequence with zeros up to the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)
print(max_sequence_len)
print('input_sequences', input_sequences)

8
input_sequences [[ 0  0  0  0  0  0  2  3]
 [ 0  0  0  0  0  2  3  4]
 [ 0  0  0  0  2  3  4  1]
 [ 0  0  0  2  3  4  1  5]
 [ 0  0  2  3  4  1  5  6]
 [ 0  2  3  4  1  5  6  7]
 [ 2  3  4  1  5  6  7  8]
 [ 0  0  0  0  0  0  9  1]
 [ 0  0  0  0  0  9  1 10]
 [ 0  0  0  0  9  1 10 11]]


In [15]:
# Create predictors and label: all columns but the last are the model input,
# the last column is the word id to predict
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

# Convert labels to one-hot encoding (required by categorical_crossentropy below)
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Seed the Python, NumPy and TensorFlow random generators so that a fresh
# Restart-and-Run-All starts from the same initial weights.
# Requires a TensorFlow/Keras version that provides set_random_seed.
tf.keras.utils.set_random_seed(42)

# Define the model.
# The Embedding layer is created without `input_length`: that argument is
# deprecated in Keras 3 (it only triggers a warning) and the sequence length
# is taken from the data when the model is first fitted.
model = Sequential()
model.add(Embedding(total_words, 64))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Fit the model
history = model.fit(xs, ys, epochs=100, verbose=1)

def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` one predicted word at a time.

    At each step the current text is encoded with the notebook-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1`` token ids, and passed
    to ``model.predict``; the word whose id has the highest score is appended.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append.
        model: Object whose ``predict`` returns, for each input row, one score
            per word id.
        max_sequence_len: Padded sequence length including the label position;
            the model input is one token shorter.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space. Generation stops early if the model predicts an id that
        has no word (such as the padding id 0).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded = tf.keras.preprocessing.sequence.pad_sequences(
            [token_list],
            maxlen=max_sequence_len - 1,
            padding='pre',
        )
        predicted_probs = model.predict(padded, verbose=0)[0]
        predicted_index = int(tf.argmax(predicted_probs, axis=-1).numpy())

        # index_word is the tokenizer's id -> word map, so no scan over
        # word_index is needed.
        output_word = tokenizer.index_word.get(predicted_index)
        if output_word is None:
            # No word for this id: appending nothing would leave the encoded
            # input unchanged, so every further step would repeat this result
            # and only add stray spaces. Stop instead.
            break
        seed_text += " " + output_word
    return seed_text

# Ask the trained model for a single word following "Hello"
print(generate_text("Hello", next_words=1, model=model, max_sequence_len=max_sequence_len))


Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.1000 - loss: 2.4874
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.1000 - loss: 2.4785
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.2000 - loss: 2.4696
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.2000 - loss: 2.4604
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.2000 - loss: 2.4509
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.2000 - loss: 2.4407
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.2000 - loss: 2.4296
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.2000 - loss: 2.4174
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [20]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` one predicted word at a time.

    At each step the current text is encoded with the notebook-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1`` token ids, and passed
    to ``model.predict``; the word whose id has the highest score is appended.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append.
        model: Object whose ``predict`` returns, for each input row, one score
            per word id.
        max_sequence_len: Padded sequence length including the label position;
            the model input is one token shorter.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space. Generation stops early if the model predicts an id that
        has no word (such as the padding id 0).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded = tf.keras.preprocessing.sequence.pad_sequences(
            [token_list],
            maxlen=max_sequence_len - 1,
            padding='pre',
        )
        predicted_probs = model.predict(padded, verbose=0)[0]
        predicted_index = int(tf.argmax(predicted_probs, axis=-1).numpy())

        # index_word is the tokenizer's id -> word map, so no scan over
        # word_index is needed.
        output_word = tokenizer.index_word.get(predicted_index)
        if output_word is None:
            # No word for this id: appending nothing would leave the encoded
            # input unchanged, so every further step would repeat this result
            # and only add stray spaces. Stop instead.
            break
        seed_text += " " + output_word
    return seed_text

# Try to regenerate the training text from its first word.
# Training text: "Hello, how are you? I am doing well. Thank you for asking."
print(generate_text("Hello", next_words=9, model=model, max_sequence_len=max_sequence_len))


Hello how are you i am doing well well well
