In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

In [None]:
# Toy training corpus: three sentences. Later cells fit the tokenizer on it and
# split it on '.' to build the next-word training samples.
text = "A short paragraph is about 100-200 words and focuses on one main topic that is very specific. A short paragraph has supporting details about the main idea and concludes in a way that further promotes the main idea. On the other hand, a long paragraph can be a specific topic or a broad one."

In [None]:
# Learn the word -> integer-id mapping from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One extra slot on top of the learned words, so the value can be used
# directly as the embedding input size and the softmax output size.
total_words = 1 + len(tokenizer.word_index)

In [None]:
# Number of entries in tokenizer.word_index plus one.
total_words

35

In [None]:
# Build the training samples: for every '.'-delimited sentence, each prefix of
# its token ids with length >= 2 becomes one sample (a later cell uses the last
# id of each prefix as the label and the preceding ids as the context).
input_sequences = []
for line in text.split('.'):
  token_list = tokenizer.texts_to_sequences([line])[0]
  # Prefix lengths start at 2, so a segment with fewer than two known tokens
  # (e.g. the empty tail after the final '.') contributes nothing.
  input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# Show a small sample instead of printing every token list and prefix.
input_sequences[:3]

[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9, 10, 3, 11, 12, 6, 19, 13]
[1, 5]
[1, 5, 2]
[1, 5, 2, 6]
[1, 5, 2, 6, 7]
[1, 5, 2, 6, 7, 15]
[1, 5, 2, 6, 7, 15, 16]
[1, 5, 2, 6, 7, 15, 16, 17]
[1, 5, 2, 6, 7, 15, 16, 17, 8]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9, 10]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9, 10, 3]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9, 10, 3, 11]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9, 10, 3, 11, 12]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9, 10, 3, 11, 12, 6]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9, 10, 3, 11, 12, 6, 19]
[1, 5, 2, 6, 7, 15, 16, 17, 8, 18, 9, 10, 3, 11, 12, 6, 19, 13]
[1, 5, 2, 20, 21, 22, 7, 4, 3, 14, 8, 23, 24, 1, 25, 12, 26, 27, 4, 3, 14]
[1, 5]
[1, 5, 2]
[1, 5, 2, 20]
[1, 5, 2, 20, 21]
[1, 5, 2, 20, 21, 22]
[1, 5, 2, 20, 21, 22, 7]
[1, 5, 2, 20, 21, 22, 7, 4]
[1, 5, 2, 20, 21, 22, 7, 4, 3]
[1, 5, 2, 20, 21, 22, 7, 4, 3, 14]
[1, 5, 2, 20, 21, 22, 7, 4, 3, 14, 8]
[1, 5, 2, 20, 21, 22, 7, 4, 3, 14, 8, 23]


In [None]:
# Left-pad every prefix with zeros up to the length of the longest prefix so
# the samples form a rectangular integer array.
max_sequence_length = len(max(input_sequences, key=len))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_length,
    padding='pre',
)
input_sequences

array([[ 0,  0,  0, ...,  0,  1,  5],
       [ 0,  0,  0, ...,  1,  5,  2],
       [ 0,  0,  0, ...,  5,  2,  6],
       ...,
       [ 0,  0,  0, ..., 11, 33,  1],
       [ 0,  0,  0, ..., 33,  1, 34],
       [ 0,  0,  0, ...,  1, 34, 10]], dtype=int32)

In [None]:
# Context = every column except the last; label = the last column, one-hot
# encoded over `total_words` classes.
x = input_sequences[:, :-1]
y = to_categorical(input_sequences[:, -1], num_classes=total_words)

In [None]:
# Sanity check of the one-hot label array: (number of samples, total_words).
y.shape

(52, 35)

In [None]:
# Seed the Python, NumPy and backend RNGs so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Next-word classifier: token ids -> 100-d embeddings -> LSTM(110) -> softmax
# over the `total_words` classes.
model = Sequential()
# Declare the input length explicitly. Embedding's `input_length` argument is
# deprecated in Keras 3 (ignored with a warning); an Input also builds the
# model immediately instead of waiting for the first fit call.
model.add(tf.keras.Input(shape=(max_sequence_length - 1,)))
model.add(Embedding(total_words, 100))
model.add(LSTM(110))
model.add(Dense(total_words, activation='softmax'))



In [None]:
# Labels are one-hot vectors, hence categorical cross-entropy; accuracy is
# reported alongside the loss during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs. Keep the returned History object so the per-epoch
# loss/accuracy stay available (history.history) and the cell no longer ends
# with an opaque `<History at 0x...>` repr.
history = model.fit(x, y, epochs=100, verbose=1)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 75ms/step - accuracy: 0.0232 - loss: 3.5564
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.1418 - loss: 3.5454 
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.1651 - loss: 3.5338
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.1546 - loss: 3.5221
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.1651 - loss: 3.5072 
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.1627 - loss: 3.4819 
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.1442 - loss: 3.4463
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.1082 - loss: 3.4034
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7aa914e2fed0>

In [None]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

In [None]:
def predict_next_word(model, tokenizer, input_text, max_len=None):
  """Return the word the model considers most likely to follow `input_text`.

  Args:
    model: object exposing `predict(batch, verbose=0)` that returns one row of
      class scores per input row.
    tokenizer: fitted tokenizer exposing `texts_to_sequences` and `index_word`.
    input_text: seed text; it is converted to ids with `texts_to_sequences`.
    max_len: length the id sequence is left-padded to. Defaults to the
      notebook-level `max_sequence_length - 1`, i.e. the context length used
      when the training arrays were built.

  Returns:
    The predicted next word as a string.
  """
  if max_len is None:
    # Fall back to the training context length defined earlier in the notebook.
    max_len = max_sequence_length - 1
  token_ids = tokenizer.texts_to_sequences([input_text])[0]
  padded = pad_sequences([token_ids], maxlen=max_len, padding='pre')
  scores = model.predict(padded, verbose=0)[0]
  # Class 0 is the padding id and has no entry in tokenizer.index_word, so a
  # plain argmax could raise KeyError. Choose the best class among real word
  # ids only; this is identical to the plain argmax whenever it is non-zero.
  best_id = int(np.argmax(scores[1:])) + 1
  return tokenizer.index_word[best_id]


In [None]:
# Demo: predict the word following a short seed phrase. Words that do not
# occur in the training text (here "LSTM") have no id in the fitted tokenizer.
input_text = "LSTM is a"
next_word = predict_next_word(model, tokenizer, input_text)
print(f"Next word prediction: {next_word}")

Next word prediction: short
