In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding   
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import tensorflow as tf
import logging


tf.get_logger().setLevel(logging.ERROR)

EPOCHS = 32
BATCH_SIZE = 256
INPUT_FILE_NAME = "../data/frankenstein.txt"
WINDOW_LENGHT = 40
WINDOW_STEP = 3
PREDICT_LENGTH = 8
MAX_WORDS = 11
EMBEDDING_WIDTH = 100

2025-07-22 11:20:47.277813: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-22 11:20:47.278937: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-22 11:20:47.285594: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-22 11:20:47.303022: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753201247.335762  379880 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753201247.34

In [21]:
file = open(INPUT_FILE_NAME, "r", encoding="utf-8-sig")
text = file.read()
file.close()

text = text_to_word_sequence(text)

fragments = []
targets = []
for i in range(0, len(text) - WINDOW_LENGHT, WINDOW_STEP):
    fragments.append(text[i : i + WINDOW_STEP])
    targets.append(text[i + WINDOW_LENGHT])

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="UNK")
tokenizer.fit_on_texts(text)
fragments_indexed = tokenizer.texts_to_sequences(fragments)
targets_indexed = tokenizer.texts_to_sequences(targets)

X = np.array(fragments_indexed, dtype=np.int64)
y = np.zeros((len(targets_indexed), MAX_WORDS))
for i, target_index in enumerate(targets_indexed):
    y[i, target_index] = 1



In [None]:
training_model = Sequential()
training_model.add(Embedding(output_dim=EMBEDDING_WIDTH, input_dim=MAX_WORDS, mask_zero=True, input_length=None))
training_model.add(LSTM(128,return_sequences=True, dropout= 0.2, recurrent_dropout=0.2))
training_model.add(LSTM(128, dropout= 0.2, recurrent_dropout=0.2))
training_model.add(Dense(128,activation='relu'))
training_model.add(Dense(MAX_WORDS,activation='softmax'))
training_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
training_model.summary()
history = training_model.fit(X,y, validation_split = 0.05, batch_size=BATCH_SIZE, epochs = EPOCHS, verbose = 2, shuffle = True)

Epoch 1/32
97/97 - 7s - 71ms/step - loss: 1.3018 - val_loss: 0.9274
Epoch 2/32
97/97 - 2s - 18ms/step - loss: 1.1429 - val_loss: 0.9051
Epoch 3/32
97/97 - 2s - 18ms/step - loss: 1.1418 - val_loss: 0.9084
Epoch 4/32
97/97 - 2s - 20ms/step - loss: 1.1412 - val_loss: 0.9102
Epoch 5/32
97/97 - 2s - 19ms/step - loss: 1.1434 - val_loss: 0.8995
Epoch 6/32
97/97 - 2s - 18ms/step - loss: 1.1423 - val_loss: 0.8993
Epoch 7/32


In [None]:
from tensorflow.keras.layers import Input

In [None]:
inference_model = Sequential()
inference_model.add(Input(batch_shape=(1, 1)))
inference_model.add(Embedding(output_dim=EMBEDDING_WIDTH, input_dim=MAX_WORDS, mask_zero=True))
inference_model.add(LSTM(128,return_sequences=True, dropout= 0.2, recurrent_dropout=0.2, stateful=True))
inference_model.add(LSTM(128, dropout= 0.2, recurrent_dropout=0.2))
inference_model.add(Dense(128,activation='relu'))
inference_model.add(Dense(MAX_WORDS,activation='softmax'))
weights = training_model.get_weights()
inference_model.set_weights(weights)


In [None]:
first_words = ['i', 'saw']

first_words_indexed = tokenizer.texts_to_sequences(first_words)
for layer in inference_model.layers:
    if hasattr(layer, 'reset_states'):
        layer.reset_states()
predicted_string = ''
for i ,word_index in enumerate(first_words_indexed):
    x = np.zeros((1,1), dtype=np.int64)
    x[0][0] = word_index[0]
    predicted_string+= first_words[i]
    predicted_string += ' '
    y_predict = inference_model.predict(x,verbose=0)[0]
for i in range(PREDICT_LENGTH):
    new_word_index = np.argmax(y_predict)
    word = tokenizer.sequences_to_texts([[new_word_index]])
    x[0][0] = new_word_index
    predicted_string += word[0]
    predicted_string += ' '
    y_predict = inference_model.predict(x,verbose=0)[0]
print(predicted_string)

i saw UNK UNK UNK UNK UNK UNK UNK UNK 
