In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [6]:
# Read the raw corpus into memory, one list entry per line (newlines kept)
file_path = "/content/franklin.txt"  # Change this to wherever your dataset lives
with open(file_path, mode="r", encoding="utf8") as corpus_file:
    lines = list(corpus_file)

In [9]:
# Flatten the corpus into a single whitespace-normalised string.
# One-pass character table: line breaks become spaces, the BOM and curly
# double quotes are dropped.
cleanup_table = str.maketrans({'\n': ' ', '\r': ' ', '\ufeff': None, '“': None, '”': None})
data = ' '.join(line.strip() for line in lines).translate(cleanup_table)
data = ' '.join(data.split())  # Collapse any run of whitespace to one space

In [10]:
# Fit a word-level tokenizer on the cleaned corpus (treated as one document)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer so the same word -> index mapping can be
# reloaded at prediction time
with open("tokenizer.pkl", mode="wb") as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out)

In [12]:
# Encode the whole corpus as one flat list of word indices
[sequence_data] = tokenizer.texts_to_sequences([data])
# One extra slot on top of the number of distinct words in the tokenizer
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 8307


In [13]:
# Slide a 4-token window over the corpus: 3 context words + 1 target word
sequences = np.array(
    [sequence_data[start:start + 4] for start in range(len(sequence_data) - 3)]
)

# First three columns feed the model; the last column is the word to predict,
# one-hot encoded over the vocabulary
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [14]:
# Define the model: a next-word classifier over the whole vocabulary.
# The 3-token context window is declared with an explicit Input layer instead
# of the Embedding `input_length` argument, which Keras 3 deprecates and
# ignores; with the Input layer the model is built (and inspectable via
# model.summary()) before training starts.
model = Sequential([
    tf.keras.Input(shape=(3,), dtype="int32"),  # three word indices of context
    Embedding(vocab_size, 10),                  # 10-dimensional word vectors
    LSTM(1000, return_sequences=True),          # pass the full sequence to the next LSTM
    LSTM(1000),                                 # keep only the final state
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),    # probability for every vocabulary index
])

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy
model.compile(loss="categorical_crossentropy", metrics=['accuracy'], optimizer=Adam(learning_rate=0.001))



In [15]:
# Fit on every context/target pair: 70 passes over the data, 64 pairs per step
model.fit(
    X,
    y,
    batch_size=64,
    epochs=70,
)

Epoch 1/70
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 25ms/step - accuracy: 0.0511 - loss: 7.0029
Epoch 2/70
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 25ms/step - accuracy: 0.0868 - loss: 6.2570
Epoch 3/70
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.1065 - loss: 5.8937
Epoch 4/70
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 25ms/step - accuracy: 0.1206 - loss: 5.6202
Epoch 5/70
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 25ms/step - accuracy: 0.1347 - loss: 5.3543
Epoch 6/70
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 25ms/step - accuracy: 0.1441 - loss: 5.1306
Epoch 7/70
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 25ms/step - accuracy: 0.1525 - loss: 4.9008
Epoch 8/70
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 25ms/step - accuracy: 0.1619 - loss: 4.6374
Epoch 9/

<keras.src.callbacks.history.History at 0x7b292955d050>

In [16]:
# Save model
# Writes the trained network to "text_prediction_model.h5" (path relative to
# the current working directory).
model.save("text_prediction_model.h5")



In [17]:
# Load model and tokenizer
def load_model_and_tokenizer(model_path="text_prediction_model.h5",
                             tokenizer_path="tokenizer.pkl"):
    """Load a saved model and its pickled tokenizer from disk.

    Args:
        model_path: File handed to ``load_model``. Defaults to
            ``"text_prediction_model.h5"``.
        tokenizer_path: Pickle file containing the tokenizer. Defaults to
            ``"tokenizer.pkl"``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        OSError: If ``tokenizer_path`` cannot be opened. Errors raised by
            ``load_model`` propagate unchanged.
    """
    model = load_model(model_path)
    # SECURITY: pickle.load can execute arbitrary code from the file it reads.
    # Only point tokenizer_path at pickles you created yourself.
    with open(tokenizer_path, "rb") as token_file:
        tokenizer = pickle.load(token_file)
    return model, tokenizer

In [18]:
# Predict function
def predict_next_word(seed_text, model, tokenizer, num_words=1, context_size=3):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    On each step the current text is tokenized, its last ``context_size``
    known word indices are fed to ``model.predict``, and the word whose index
    has the highest predicted score is appended (preceded by one space).

    Args:
        seed_text: Text to continue.
        model: Object exposing ``predict(array, verbose=0)`` that returns
            scores indexed by word index.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``.
        num_words: How many words to append. Defaults to 1.
        context_size: Number of trailing tokens passed to the model.
            Defaults to 3. If the text holds fewer known tokens than this,
            the shorter window is passed through as-is.

    Returns:
        ``seed_text`` with the predicted words appended. If a predicted index
        has no entry in ``tokenizer.word_index`` an empty word is appended
        (only the separating space is added).

    Raises:
        ValueError: If ``context_size`` is smaller than 1, or if the text
            contains no word known to the tokenizer.
    """
    if context_size < 1:
        # A slice of [-0:] would silently select the whole token list.
        raise ValueError("context_size must be at least 1")

    # Build the index -> word table once instead of scanning word_index for
    # every generated word. setdefault keeps the first word seen for an index,
    # matching a first-match linear scan.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        if not token_list:
            # Without this the model would receive an empty (1, 0) array.
            raise ValueError("seed_text contains no words known to the tokenizer")
        context = np.array(token_list[-context_size:]).reshape(1, -1)
        predicted_probs = model.predict(context, verbose=0)
        predicted_index = int(np.argmax(predicted_probs))
        seed_text += " " + index_to_word.get(predicted_index, "")
    return seed_text


In [19]:
# Demo: reload the saved artefacts and continue a three-word prompt by five words
if __name__ == "__main__":
    seed_text = "the power of"
    model, tokenizer = load_model_and_tokenizer()
    completion = predict_next_word(seed_text, model, tokenizer, num_words=5)
    print(completion)




the power of credit and all finding his


In [21]:
# Demo: same pipeline with a longer prompt; only its trailing tokens reach the model
if __name__ == "__main__":
    seed_text = "When I saw one"
    model, tokenizer = load_model_and_tokenizer()
    completion = predict_next_word(seed_text, model, tokenizer, num_words=5)
    print(completion)



When I saw one too ambitious of court favor
