In [4]:
pip install tensorflow




In [5]:
# Read the entire training corpus into memory as one UTF-8 string.
with open("word.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()


In [6]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense



In [7]:
# Build the word -> integer-id vocabulary from the whole corpus.
# fit_on_texts expects a list of texts, so the corpus is passed as a
# one-element list.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])



In [8]:
# Number of output classes: one per vocabulary word, plus one extra slot so
# that index 0 (used as the padding value, never assigned to a word) is valid.
total_words = 1 + len(tokenizer.word_index)

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Expand every line of the corpus into its growing token-id prefixes:
# the first 2 ids, the first 3 ids, ..., up to the whole line.
input_sequences = []

for sentence in text.split("\n"):
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    # Lines with fewer than two tokens contribute no prefixes.
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))


In [10]:
# Spot-check one generated prefix; each entry is a list of integer token ids.
print(input_sequences[5])

[145, 4790, 1, 1020, 4, 128, 34]


In [11]:
# Left-pad every prefix to the length of the longest one so they can be
# stacked into a single 2-D array; the final column then always holds the
# last token of each prefix.
max_len = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_len,
)



In [12]:
# Features are all columns but the last; the target is the last token id of
# each padded prefix, one-hot encoded over the vocabulary.
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

In [13]:
# Next-word classifier: embed the token ids, summarize the prefix with an
# LSTM, then emit a probability for every vocabulary index.
# The input length is declared through an explicit Input layer instead of
# Embedding(input_length=...), which Keras 3 deprecates and ignores.
model = Sequential([
    Input(shape=(max_len - 1,)),  # one padded prefix of max_len - 1 token ids
    Embedding(input_dim=total_words, output_dim=10),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])



In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on the full set of prefixes; per-epoch progress is printed.
model.fit(X, y, verbose=1, epochs=100)

Epoch 1/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 40ms/step - accuracy: 0.0570 - loss: 6.6773
Epoch 2/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 40ms/step - accuracy: 0.0718 - loss: 6.0198
Epoch 3/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 40ms/step - accuracy: 0.1082 - loss: 5.6390
Epoch 4/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 40ms/step - accuracy: 0.1291 - loss: 5.3207
Epoch 5/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 40ms/step - accuracy: 0.1403 - loss: 5.0933
Epoch 6/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 40ms/step - accuracy: 0.1512 - loss: 4.8805
Epoch 7/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 40ms/step - accuracy: 0.1620 - loss: 4.6888
Epoch 8/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 40ms/step - accuracy: 0.1706 - loss:

In [None]:
def predict_next_word(seed_text, next_words=1):
    """Extend ``seed_text`` with the model's most likely next word(s).

    On each step the current text is tokenized, left-padded to the
    ``max_len - 1`` ids the model takes as input, and the word whose
    vocabulary index has the highest predicted probability is appended
    (greedy decoding).

    Args:
        seed_text: Text to continue.
        next_words: Number of words to try to append.

    Returns:
        ``seed_text`` followed by the predicted words, separated by single
        spaces. Fewer than ``next_words`` words are appended if the model
        predicts an index that maps to no word (such as the padding index 0).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        # The batch holds a single sample, so take row 0 of the per-row argmax.
        predicted = int(np.argmax(predicted_probs, axis=-1)[0])

        # index_word is the tokenizer's id -> word mapping; a direct lookup
        # replaces scanning the whole word_index on every step.
        word = tokenizer.index_word.get(predicted)
        if word is None:
            # The text would not change, so every later step would repeat
            # this same prediction; stop instead of looping pointlessly.
            break
        seed_text += ' ' + word
    return seed_text

In [None]:
# Try the model on two example prompts, predicting one word for each.
for seed in ("I love", "machine learning"):
    print(predict_next_word(seed, 1))

In [None]:
# Keras 3 rejects weight file names that do not end in ".weights.h5"
# (save_weights raises ValueError); this name is also accepted by older
# Keras versions. Anything that loads the weights must use the same file name.
model.save_weights("next_word.weights.h5")


In [None]:
import pickle

# Persist the fitted tokenizer so the same word -> id mapping can be reused
# when the saved weights are loaded for inference.
with open("tokenizer.pkl", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
