In [1]:
import tensorflow as tf
import numpy as np
import requests

# Check if GPU is available (This confirms Colab is set up right)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [2]:
# --- Data Collection ---
# We will download 'The Adventures of Sherlock Holmes' from Project Gutenberg
url = "https://www.gutenberg.org/files/1661/1661-0.txt"

response = requests.get(url)
text_data = response.text

# Quick verification
print("------------------------------------------------")
print(f"Data Downloaded Successfully!")
print(f"Total characters in dataset: {len(text_data)}")
print("------------------------------------------------")
print("First 500 characters preview:\n")
print(text_data[:500])

------------------------------------------------
Data Downloaded Successfully!
Total characters in dataset: 593731
------------------------------------------------
First 500 characters preview:

﻿The Project Gutenberg eBook of The Adventures of Sherlock Holmes,
by Arthur Conan Doyle

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not located in the United States, you
will have to check the laws of the country where 


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

In [5]:
# --- 1. Text Cleaning ---
# Remove the Gutenberg Header/Footer to get the actual story
# We look for the start of the first story roughly
start_marker = "SCANDAL IN BOHEMIA"
start_index = text_data.find(start_marker)

# If found, slice the text. If not, just use the whole thing.
if start_index != -1:
    corpus = text_data[start_index:]
else:
    corpus = text_data

# Split the huge text into a list of sentences/lines
corpus = corpus.lower().split("\n")

In [6]:
# --- 2. Tokenization ---
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus) # Creates the vocabulary

total_words = len(tokenizer.word_index) + 1 # +1 is required for padding later

print(f"Total unique words (Vocabulary Size): {total_words}")
print(f"Total lines of text to process: {len(corpus)}")

Total unique words (Vocabulary Size): 10229
Total lines of text to process: 12254


In [7]:
# --- 3. Create N-gram Sequences (The Logic) ---
input_sequences = []

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    # We only care about lines that have at least 2 words
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [8]:
print(f"Total Input Sequences created: {len(input_sequences)}")
print("Example sequence (numbers):", input_sequences[0])
print("Example sequence (words):", [tokenizer.index_word[idx] for idx in input_sequences[0]])

Total Input Sequences created: 102787
Example sequence (numbers): [982, 8]
Example sequence (words): ['scandal', 'in']


In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [10]:
# --- 1. Padding ---
# We need to find the longest sentence to ensure all inputs match that size
max_sequence_len = max([len(x) for x in input_sequences])
print(f"Longest sequence in the text: {max_sequence_len} words")

# Pad sequences so they are all the same length
# 'pre' padding means adding zeros at the START of short sentences
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

Longest sequence in the text: 20 words


In [11]:
# --- 2. Create Predictors (X) and Label (y) ---
# X = All words EXCEPT the last one
# y = The LAST word (the one we want to predict)
X, y = input_sequences[:,:-1], input_sequences[:,-1]

# Check the shapes to ensure memory is handling it well
print("------------------------------------------------")
print(f"Shape of X (Inputs): {X.shape}")
print(f"Shape of y (Targets): {y.shape}")
print("------------------------------------------------")
# Verify the split with an example
print("Original padded sequence:", input_sequences[0])
print("Input (X):", X[0])
print("Target (y):", y[0])

------------------------------------------------
Shape of X (Inputs): (102787, 19)
Shape of y (Targets): (102787,)
------------------------------------------------
Original padded sequence: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 982   8]
Input (X): [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 982]
Target (y): 8


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [13]:
# --- Retrieve necessary variables from previous steps ---
# input_len is the length of your input sequences (X.shape[1] = 19)
input_len = X.shape[1]
# total_words is the size of your vocabulary (from Step 2)
# Make sure the variable total_words is defined in your Colab session!
# total_words = len(tokenizer.word_index) + 1

# --- Model Definition ---
model = Sequential()

# 1. Embedding Layer: Turns word indexes into dense vectors (100 dimensions)
# input_dim: Vocabulary size (total_words)
# output_dim: Size of the dense vector for each word (100)
# input_length: Length of the sequence (19)
model.add(Embedding(input_dim=total_words, output_dim=100, input_length=input_len))



In [14]:
# 2. LSTM Layer: The core of the sequence prediction
# 150 units is a good starting point for complexity
model.add(LSTM(150))
model.add(Dropout(0.2)) # Dropout helps prevent overfitting

In [15]:
# 3. Output Layer: Predicts the next word
# units: Must equal the vocabulary size (one neuron for every possible word)
# activation='softmax': Converts raw predictions into probabilities
model.add(Dense(total_words, activation='softmax'))

# --- Compilation (The Crucial Anti-Crash Step) ---
# We use 'sparse_categorical_crossentropy' because our labels (y) are integers,
# NOT one-hot encoded vectors, saving massive amounts of RAM.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model structure
model.summary()

In [16]:
# Use a large batch size for GPU efficiency
BATCH_SIZE = 256
# 50 epochs is a solid goal, but we can stop early if accuracy is high
EPOCHS = 50

print("--- Starting Model Training (Using T4 GPU) ---")

# The model will 'build' itself right before the first epoch
history = model.fit(
    X,
    y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)

print("\n--- Training Complete! ---")

--- Starting Model Training (Using T4 GPU) ---
Epoch 1/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.0400 - loss: 7.1810
Epoch 2/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.0559 - loss: 6.3610
Epoch 3/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.0745 - loss: 6.1023
Epoch 4/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.1012 - loss: 5.8040
Epoch 5/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.1124 - loss: 5.5895
Epoch 6/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.1262 - loss: 5.4276
Epoch 7/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.1343 - loss: 5.2914
Epoch 8/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.1391

In [17]:
# Use the high-level Keras saving function
model.save("next_word_predictor.h5")
print("Model saved as next_word_predictor.h5")



Model saved as next_word_predictor.h5


In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def predict_next_word(seed_text, n_words=1):
    """
    Predicts the next n_words given a seed text.
    """
    # X.shape[1] is the input length (19 in your case)
    input_len = X.shape[1]

    for _ in range(n_words):
        # 1. Prepare the seed text (using the same tokenizer)
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # 2. Pad the sequence to match the model's input length (19)
        padded_token_list = pad_sequences([token_list], maxlen=input_len, padding='pre')

        # 3. Predict the word probabilities
        # We use model.predict() which outputs an array of probabilities for all ~9k words
        predicted_probs = model.predict(padded_token_list, verbose=0)

        # 4. Get the index of the word with the highest probability
        predicted_index = np.argmax(predicted_probs, axis=-1)[0]

        # 5. Convert index back to word
        output_word = ""
        if predicted_index in tokenizer.index_word:
            output_word = tokenizer.index_word[predicted_index]

        # 6. Append the predicted word to the seed text for the next prediction loop
        seed_text += " " + output_word

    return seed_text.title()

# --- Test Cases ---

print("\n--- Model Predictions (Accuracy 37.56%) ---")

# Test 1: Common phrase in the book
seed1 = "holmes was sitting upon"
print(f"Input: '{seed1}'\nPrediction: {predict_next_word(seed1, 3)}\n")

# Test 2: Phrase to test grammar and context
seed2 = "the dog runs after the"
print(f"Input: '{seed2}'\nPrediction: {predict_next_word(seed2, 2)}\n")

# Test 3: Phrase to test common names
seed3 = "my dear watson"
print(f"Input: '{seed3}'\nPrediction: {predict_next_word(seed3, 5)}\n")


--- Model Predictions (Accuracy 37.56%) ---
Input: 'holmes was sitting upon'
Prediction: Holmes Was Sitting Upon The Door And

Input: 'the dog runs after the'
Prediction: The Dog Runs After The Other Side

Input: 'my dear watson'
Prediction: My Dear Watson ” Said Holmes “This Is



In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
# Import these again just in case the Colab session lost them
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# --- Define the NEW Bi-LSTM Model ---
# We use a new variable name (new_model) to avoid overwriting the old 'model'.
new_model = Sequential()

# 1. Embedding Layer: Increased output_dim to 150 (slightly higher quality vectors)
# We use X.shape[1] (19) for input_length
new_model.add(Embedding(input_dim=total_words, output_dim=150, input_length=X.shape[1]))

# 2. Bi-LSTM Layer (The Major Improvement)
# Bi-LSTM processes the sequence both forward and backward, enhancing context.
# We use 256 units for increased model capacity.
new_model.add(Bidirectional(LSTM(256)))
new_model.add(Dropout(0.3)) # Slightly increased dropout for regularization

# 3. Output Layer (Same as before)
new_model.add(Dense(total_words, activation='softmax'))

# --- Compilation (Anti-Crash Strategy Maintained) ---
# Must use 'sparse_categorical_crossentropy' since 'y' is integers.
new_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("--- New Bi-LSTM Model Architecture ---")
new_model.summary()

--- New Bi-LSTM Model Architecture ---


In [20]:
# Train for 100 epochs (Monitor this and stop if accuracy plateaus)
BATCH_SIZE = 256
NEW_EPOCHS = 100

print("--- Starting Bi-LSTM Model Training (100 Epochs) ---")

history_new = new_model.fit(
    X,
    y,
    epochs=NEW_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)

print("\n--- Bi-LSTM Training Complete! ---")

--- Starting Bi-LSTM Model Training (100 Epochs) ---
Epoch 1/100
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - accuracy: 0.0462 - loss: 7.0147
Epoch 2/100
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.0755 - loss: 6.1579
Epoch 3/100
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.1083 - loss: 5.6878
Epoch 4/100
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.1272 - loss: 5.4082
Epoch 5/100
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.1403 - loss: 5.1525
Epoch 6/100
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.1535 - loss: 4.9306
Epoch 7/100
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.1641 - loss: 4.7156
Epoch 8/100
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - ac

In [22]:
# The prediction function needs to be updated to use the new_model
def predict_next_word_improved(seed_text, n_words=1):
    """
    Predicts the next n_words given a seed text using the IMPROVED Bi-LSTM model.
    """
    input_len = X.shape[1]

    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded_token_list = pad_sequences([token_list], maxlen=input_len, padding='pre')

        # *** IMPORTANT: Use new_model here ***
        predicted_probs = new_model.predict(padded_token_list, verbose=0)

        predicted_index = np.argmax(predicted_probs, axis=-1)[0]

        output_word = ""
        if predicted_index in tokenizer.index_word:
            output_word = tokenizer.index_word[predicted_index]

        seed_text += " " + output_word

    return seed_text.title()

# --- Test Cases ---

print("\n--- Bi-LSTM Predictions (Accuracy 87.37%) ---")

# Test 1: Re-run the problematic phrase to check improvement
seed1 = "the dog runs after the"
print(f"Input: '{seed1}'\nPrediction: {predict_next_word_improved(seed1, 3)}\n")

# Test 2: Dialogue test
seed2 = "my dear watson"
print(f"Input: '{seed2}'\nPrediction: {predict_next_word_improved(seed2, 5)}\n")

# Test 3: Longer generation
seed3 = "i could see the smoke"
print(f"Input: '{seed3}'\nPrediction: {predict_next_word_improved(seed3, 10)}\n")


--- Bi-LSTM Predictions (Accuracy 87.37%) ---
Input: 'the dog runs after the'
Prediction: The Dog Runs After The Letter Is I

Input: 'my dear watson'
Prediction: My Dear Watson I See That You Was

Input: 'i could see the smoke'
 That It

