In [2]:
# ============================================
# LSTM Text Generation - Complete Implementation
# ============================================

import numpy as np
import string
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split


In [3]:
# -----------------------------
# 1. Load Dataset
# -----------------------------
# Use any large .txt file (example: Shakespeare works)
with open("shakespeare.txt", "r", encoding="utf-8") as file:
    text = file.read()


In [4]:
# -----------------------------
# 2. Data Preprocessing
# -----------------------------

# Convert to lowercase
text = text.lower()

# Remove punctuation and special characters
text = text.translate(str.maketrans("", "", string.punctuation))

# Tokenization (WORD-LEVEL)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

total_words = len(tokenizer.word_index) + 1

# Convert text to token sequence
token_list = tokenizer.texts_to_sequences([text])[0]

In [7]:
# -----------------------------
# 3. Create Input-Output Pairs
# -----------------------------
sequence_length = 20
input_sequences = []

for i in range(sequence_length, len(token_list)):
    input_sequences.append(token_list[i-sequence_length:i+1])

input_sequences = np.array(input_sequences)

# Split into inputs (X) and labels (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]   # keep labels as integers



In [8]:
# -----------------------------
# 4. Train-Validation Split
# -----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [12]:
# -----------------------------
# 5. Model Design (LSTM)
# -----------------------------
model = Sequential([
    Embedding(total_words, 100, input_length=sequence_length),
    LSTM(150, return_sequences=True),
    LSTM(100),
    Dense(total_words, activation="softmax")
])

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)
model.summary()

In [15]:
# -----------------------------
# 6. Model Training (FAST VERSION)
# -----------------------------

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

checkpoint = ModelCheckpoint(
    "best_model.keras",   # modern format
    monitor="val_loss",
    save_best_only=True
)

history = model.fit(
    X_train,
    y_train,
    epochs=3,              
    batch_size=256,        
    validation_data=(X_val, y_val),
    callbacks=[early_stop, checkpoint],
    verbose=1
)


Epoch 1/3
[1m2546/2546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 206ms/step - accuracy: 0.0992 - loss: 5.8789 - val_accuracy: 0.0966 - val_loss: 6.1446
Epoch 2/3
[1m2546/2546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m538s[0m 211ms/step - accuracy: 0.1049 - loss: 5.7446 - val_accuracy: 0.0999 - val_loss: 6.1224
Epoch 3/3
[1m2546/2546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m524s[0m 206ms/step - accuracy: 0.1100 - loss: 5.6224 - val_accuracy: 0.1023 - val_loss: 6.1282


In [16]:
# -----------------------------
# 7. Text Generation Function
# -----------------------------
def generate_text(seed_text, next_words=30):
    for _ in range(next_words):
        tokenized = tokenizer.texts_to_sequences([seed_text])[0]
        tokenized = pad_sequences(
            [tokenized], maxlen=sequence_length, padding="pre"
        )

        predicted = np.argmax(model.predict(tokenized, verbose=0))
        output_word = ""

        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        seed_text += " " + output_word

    return seed_text

In [17]:
# -----------------------------
# 8. Generate Sample Outputs
# -----------------------------
print("\n--- Generated Text Samples ---\n")

seed_1 = "to be or not to be"
seed_2 = "love looks not with the eyes"
seed_3 = "all the worlds a stage"

print("Seed 1:\n", generate_text(seed_1, 40))
print("\nSeed 2:\n", generate_text(seed_2, 40))
print("\nSeed 3:\n", generate_text(seed_3, 40))



--- Generated Text Samples ---

Seed 1:
 to be or not to be a man and the king of the world and the king is not a man and i will not be a man and i will not be a man and i will not be a man and i will not

Seed 2:
 love looks not with the eyes of the king and the king and the king and the king is not the king of the king and the king of the world and the king is not a man and the king of the world and the

Seed 3:
 all the worlds a stage and the king and the king and the king of the world and the king is not a man and the king of the world and the king is not a man and i have been a man to be
