In [None]:
import os 
# Ask TensorFlow's native (C++) layer to log as little as possible.
# This is set here, in its own cell, so that it is already in the environment
# when `import tensorflow` runs in the next cell.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
import tensorflow as tf
import numpy as np

E0000 00:00:1770919059.940280   30098 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770919059.962598   30098 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770919060.163474   30098 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770919060.163596   30098 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770919060.163599   30098 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770919060.163601   30098 computation_placer.cc:177] computation placer already registered. Please check linka

In [3]:
# DATA LOADING AND PREPROCESSING
def load_data(file_path):
    """Read a UTF-8 text file and return its whole content in lower case.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The complete file content as one lower-cased string.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return raw_text.lower()

# Load the lower-cased corpus; the path is relative to the notebook's working directory.
text_data = load_data('data/1661-0.txt')

In [4]:
# TOKENIZATION
# Build the word -> integer index vocabulary from the whole corpus at once.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts([text_data])
# +1 because the Keras tokenizer numbers words starting at 1; index 0 is left
# free and is what pad_sequences fills with by default.
total_words = len(tokenizer.word_index) + 1

In [5]:
# SEQUENCE GENERATION
# Every line of the corpus contributes all of its prefixes of length >= 2
# ([w1, w2], [w1, w2, w3], ...); the last token of each prefix is later used
# as the word to predict.
input_sequences = []
for line in text_data.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:prefix_len] for prefix_len in range(2, len(token_list) + 1)
    )

# Left-pad every prefix to the length of the longest one so they stack into a
# single 2-D array.
max_sequence_len = max(len(sequence) for sequence in input_sequences)
input_sequences = np.array(
    tf.keras.preprocessing.sequence.pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)


In [6]:
# FEATURE AND LABEL SPLITTING
# Every column except the last is the model input; the last column (the final
# word index of each n-gram) is the integer label to predict.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [21]:
# MODEL ARCHITECTURE
# Next-word classifier: token ids -> embedding -> stacked recurrent layers ->
# softmax over the whole vocabulary.
model = tf.keras.models.Sequential([
    # One padded n-gram prefix per sample (all columns except the label).
    tf.keras.layers.Input(shape=(max_sequence_len - 1,)),
    # The sequence length is already fixed by the Input layer above, so the
    # deprecated `input_length` argument is not passed to Embedding.
    tf.keras.layers.Embedding(total_words, 128),

    # return_sequences=True keeps the per-timestep outputs so the following
    # LSTM still receives a sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)),
    tf.keras.layers.Dropout(0.2),

    tf.keras.layers.LSTM(100),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Dense(total_words // 2, activation='relu'),
    # One output unit per vocabulary index (total_words includes index 0).
    tf.keras.layers.Dense(total_words, activation='softmax')
])



In [22]:
# Print the layer-by-layer overview of the model defined above.
model.summary()

In [10]:
# Adam optimizer with an explicit starting learning rate of 0.001.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(
    # "Sparse" variant: the labels in y are integer word indices, not one-hot vectors.
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

In [None]:
# Setup the safety nets
# No validation data is passed to fit() below, so both callbacks monitor the
# training loss ('loss') rather than a held-out metric.
# early_stop: patience of 3 epochs, with restore_best_weights=True requested.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
# reduce_lr: scale the learning rate by 0.2 after 2 epochs without improvement,
# with a floor of 0.0001.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=2, min_lr=0.0001)

# Fit the model with callbacks
# NOTE(review): the output saved with this cell reports "Epoch 1/100" while the
# code requests epochs=50 — the output looks like it comes from an earlier
# version of the cell; re-run to confirm.
history = model.fit(
    X, y,
    epochs=50,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

Epoch 1/100


I0000 00:00:1770919134.726925   30227 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.0343 - loss: 6.7264 - learning_rate: 0.0010
Epoch 2/100
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.0533 - loss: 5.8588 - learning_rate: 0.0010
Epoch 3/100
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.0727 - loss: 5.5638 - learning_rate: 0.0010
Epoch 4/100
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.1023 - loss: 5.1649 - learning_rate: 0.0010
Epoch 5/100
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.1370 - loss: 4.7762 - learning_rate: 0.0010
Epoch 6/100
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.1914 - loss: 4.3708 - learning_rate: 0.0010
Epoch 7/100
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.2460 - loss: 3.8780 - learning_rat

In [23]:
# Persist the model to an HDF5 (.h5) file.
# NOTE(review): only the model is written here; the `tokenizer` and
# `max_sequence_len` that predict() relies on are not saved by this cell —
# confirm they are persisted elsewhere if the model is reloaded in a new session.
model.save('models/lstm_model.h5')



In [12]:
def predict(seed_text, words_to_generate):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    Uses the notebook-level ``tokenizer``, ``model`` and ``max_sequence_len``.

    Args:
        seed_text: Text to continue.
        words_to_generate: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early when the model's top prediction is an
        index with no word attached (for example the padding index 0), so no
        empty words or trailing spaces are appended.
    """
    # Repeat the process for as many words as requested
    for _ in range(words_to_generate):

        # 1. Convert the text generated so far into word indices (tokens)
        tokens = tokenizer.texts_to_sequences([seed_text])[0]

        # 2. Left-pad (or truncate) to the input length the model was built with
        padded_tokens = tf.keras.preprocessing.sequence.pad_sequences(
            [tokens], maxlen=max_sequence_len - 1, padding='pre'
        )

        # 3. Ask the model for the most likely next word; the batch holds a
        #    single sample, so take the argmax over the vocabulary axis of row 0
        prediction_probs = model.predict(padded_tokens, verbose=0)
        winner_index = int(np.argmax(prediction_probs, axis=-1)[0])

        # 4. Turn that index back into a word
        next_word = tokenizer.index_word.get(winner_index)
        if not next_word:
            # No word for this index: the input would not change, so every
            # further iteration would produce the same result. Stop here
            # instead of appending blanks.
            break

        # 5. Add that word to the sentence and repeat
        seed_text += " " + next_word

    return seed_text

In [25]:
# Smoke test: continue a short seed phrase by 5 words using the current in-memory model.
print(predict("sherlock holmes is", 5))

sherlock holmes is found signal amiable bijou bijou
