In [7]:
import os 
# Configure TensorFlow's native-side log filtering through its environment variable.
# This is set here, ahead of the `import tensorflow` in the next cell, so the value
# is already present in the environment when TensorFlow is loaded.
# NOTE(review): '3' is presumably the most restrictive filter level — confirm against the TF docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  

In [8]:
import tensorflow as tf
import numpy as np

In [9]:
# DATA LOADING AND PREPROCESSING
def load_data(file_path):
    """Return the full contents of a UTF-8 text file, converted to lower case.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file's text as a single lower-cased string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.lower()

# Load the whole corpus as one lower-cased string. The path is relative, so it is
# resolved against the directory the notebook kernel is running in.
text_data = load_data('data/1661-0.txt')

In [10]:
# TOKENIZATION
# Fit the word -> integer-id vocabulary on the entire corpus, passed as a single document.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts([text_data])
# Vocabulary size; used further down as the Embedding input size and the softmax width.
# NOTE(review): the +1 presumably leaves room for id 0, which the Tokenizer does not
# assign to any word and which padding uses as filler — confirm against the Keras docs.
total_words = len(tokenizer.word_index) + 1

In [11]:
# SEQUENCE GENERATION
# Each corpus line is encoded on its own; every prefix of that line holding at
# least two tokens becomes one sample (context tokens followed by the token to predict).
input_sequences = []
for row in text_data.split('\n'):
    encoded = tokenizer.texts_to_sequences([row])[0]
    input_sequences.extend(encoded[:stop] for stop in range(2, len(encoded) + 1))

# Pad every sample at the front up to the length of the longest one, so that all
# samples share one length and can be held in a single 2-D array.
max_sequence_len = max(len(sample) for sample in input_sequences)
input_sequences = np.array(
    tf.keras.preprocessing.sequence.pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)


In [12]:
import pickle

# Persist what inference needs besides the model itself: the fitted tokenizer and
# the padded sequence length the model input was sized from.
# The output directory is created first so a clean run does not fail with
# FileNotFoundError when "models/" is absent (`os` is imported in the first cell).
os.makedirs("models", exist_ok=True)

with open("models/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("models/metadata.pkl", "wb") as f:
    pickle.dump({"max_sequence_len": max_sequence_len}, f)


In [13]:
# FEATURE AND LABEL SPLITTING
# Every padded row is [context tokens..., target token]: the final column is the
# label, and all columns before it form the model input.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [14]:
# MODEL ARCHITECTURE
# Next-word classifier: embedded context tokens -> bidirectional LSTM -> LSTM ->
# dense softmax over the vocabulary.
model = tf.keras.models.Sequential([
    # One context window per sample: max_sequence_len - 1 token ids (the last token
    # of each padded row is split off as the label).
    tf.keras.layers.Input(shape=(max_sequence_len - 1,)),
    # No `input_length` here: the Input layer above already fixes the sequence
    # length, and current Keras treats that argument as deprecated (warning only).
    tf.keras.layers.Embedding(total_words, 128),

    # return_sequences=True keeps the per-timestep outputs so the next LSTM can consume them.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)),
    tf.keras.layers.Dropout(0.2),

    tf.keras.layers.LSTM(100),
    tf.keras.layers.BatchNormalization(),

    # Output layer has one unit per vocabulary id; softmax turns them into a
    # probability distribution over the next word.
    tf.keras.layers.Dense(total_words // 2, activation='relu'),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

I0000 00:00:1770994236.684467    2071 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3582 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [15]:
# Show the layer-by-layer overview of the model built in the previous cell.
model.summary()

In [16]:
# Adam with an explicit 0.001 starting learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# The labels are integer word ids taken from the last column of the padded
# sequences (not one-hot vectors), hence the sparse variant of cross-entropy.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Training callbacks. Both watch the *training* loss, because `fit` below is given
# no validation data.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=2,
    min_lr=0.0001,
)

# Train for at most 50 epochs in mini-batches of 64; the callbacks may cut the run
# short or lower the learning rate along the way.
history = model.fit(
    X,
    y,
    batch_size=64,
    epochs=50,
    verbose=1,
    callbacks=[early_stop, reduce_lr],
)

Epoch 1/50


I0000 00:00:1770994188.313816    8275 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m -198069us/step - accuracy: 0.0283 - loss: 6.5815 - learning_rate: 0.0010
Epoch 2/50
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step - accuracy: 0.0519 - loss: 5.8665 - learning_rate: 0.0010
Epoch 3/50
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.1007 - loss: 5.3394 - learning_rate: 0.0010
Epoch 4/50
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.1416 - loss: 4.8545 - learning_rate: 0.0010
Epoch 5/50
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - accuracy: 0.1754 - loss: 4.4599 - learning_rate: 0.0010
Epoch 6/50
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 249ms/step - accuracy: 0.2091 - loss: 4.0457 - learning_rate: 0.0010
Epoch 7/50
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m-46s[0m -194801us/step - accuracy: 0.2513 - loss: 3.5905 - lear

In [18]:
# Write the trained model to disk. The generation cell reloads it from this exact
# path, so the two must be changed together if the filename or format is changed.
model.save('models/lstm_model.h5')



In [19]:
# Reload the saved model from disk, rebinding `model`, so the generation code below
# runs against the persisted file rather than the in-memory object left by training.
model = tf.keras.models.load_model('models/lstm_model.h5')

def predict(seed_text, words_to_generate):
    """Greedily extend ``seed_text`` by up to ``words_to_generate`` words.

    Each step encodes the text so far with the module-level ``tokenizer``, pads it
    at the front to the ``max_sequence_len - 1`` tokens the model takes as input,
    and appends the word whose class the module-level ``model`` scores highest.

    Args:
        seed_text: Starting text. May be empty, in which case the result begins
            directly with the first generated word (no leading space).
        words_to_generate: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by single spaces.
        Generation stops early if the model's top class has no word in the
        tokenizer's vocabulary (for example the padding id 0).
    """
    for _ in range(words_to_generate):
        # 1. Convert the text so far into token ids.
        tokens = tokenizer.texts_to_sequences([seed_text])[0]

        # 2. Bring the ids to the fixed input length the model expects.
        tokens = tf.keras.preprocessing.sequence.pad_sequences(
            [tokens], maxlen=max_sequence_len - 1, padding='pre'
        )

        # 3. Score the next word. The batch holds a single sample, so take row 0
        #    explicitly instead of running argmax over the flattened batch.
        prediction_probs = model.predict(tokens, verbose=0)
        winner_index = int(np.argmax(prediction_probs[0]))

        # 4. Map the id back to a word. An id without a word cannot extend the text,
        #    and greedy decoding would pick the same id again on the unchanged
        #    input, so stop instead of appending bare spaces.
        next_word = tokenizer.index_word.get(winner_index)
        if not next_word:
            break

        # 5. Append the word, without a leading space when starting from empty text.
        seed_text = f"{seed_text} {next_word}" if seed_text else next_word

    return seed_text



In [None]:
# Generate five words after a short seed. A non-empty seed is used because every
# training sample contains at least one real context token, so an all-padding
# input (empty seed) is something the model never saw during training.
print(predict("good morning", 5))

good morning and mary at the end
