In [14]:
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [15]:
with open("1661-0.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [35]:
if "*** START OF" in text:
    text = text.split("*** START OF")[1]
text = re.sub(r"[^a-zA-Z\s.,!?']", "", text)  # keep punctuation
text = text.lower()

In [36]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

In [37]:
input_sequences = []
tokens = tokenizer.texts_to_sequences([text])[0]
sequence_length = 10  # longer sequence for more context
for i in range(sequence_length, len(tokens)):
    seq = tokens[i-sequence_length:i+1]
    input_sequences.append(seq)

input_sequences = np.array(input_sequences)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]  # sparse labels

In [38]:
model = Sequential()
model.add(Embedding(total_words, 50, input_length=sequence_length-1))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation="softmax"))

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
print(model.summary())




None


In [40]:
# ---------------------------
history = model.fit(
    X, y,
    epochs=20,          # more epochs
    batch_size=128,
    validation_split=0.1,
    verbose=1
)

# ---------------------------
# Perplexity
# ---------------------------
loss = history.history['loss'][-1]
perplexity = np.exp(loss)
print(f"Training Perplexity: {perplexity:.2f}")

Epoch 1/20
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.1409 - loss: 5.0374 - val_accuracy: 0.1127 - val_loss: 6.9653
Epoch 2/20
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.1455 - loss: 4.9540 - val_accuracy: 0.1151 - val_loss: 7.0371
Epoch 3/20
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.1450 - loss: 4.8874 - val_accuracy: 0.1168 - val_loss: 7.1205
Epoch 4/20
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.1510 - loss: 4.8231 - val_accuracy: 0.1140 - val_loss: 7.1933
Epoch 5/20
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.1527 - loss: 4.7657 - val_accuracy: 0.1178 - val_loss: 7.2653
Epoch 6/20
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.1587 - loss: 4.7024 - val_accuracy: 0.1188 - val_loss: 7.3592
Epoch 7/20
[1m755/755

In [41]:
def predict_next_word_sample(seed_text, next_words=3, temperature=1.0):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list],
                             maxlen=sequence_length-1, padding="pre")
        preds = model.predict(token_list, verbose=0)[0]
        preds = np.log(preds + 1e-8) / temperature  # apply temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        next_index = np.random.choice(len(preds), p=preds)
        output_word = tokenizer.index_word[next_index]
        seed_text += " " + output_word
    return seed_text


In [47]:
print(predict_next_word("sherlock holmes"))
print(predict_next_word("I love"))

sherlock holmes as i have
I love to be a
