In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

# Load and preprocess data
filename = "wizard_of_us.txt"
with open(filename, "r", encoding="utf-8", errors="ignore") as file:
    raw_text = file.read().lower()

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.strip().lower()  # Convert to lowercase
    return text

raw_text = clean_text(raw_text)

# Tokenization
tokenizer = Tokenizer(num_words=10000)  # Increase vocab size
tokenizer.fit_on_texts([raw_text])
sequences = tokenizer.texts_to_sequences([raw_text])[0]

vocab_size = len(tokenizer.word_index) + 1
seq_length = 150

# Create input-output pairs
X = []
y = []
for i in range(0, len(sequences) - seq_length):
    X.append(sequences[i:i + seq_length])
    y.append(sequences[i + seq_length])

X = pad_sequences(X, maxlen=seq_length, padding='pre')
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# Model definition
model = Sequential([
    Embedding(vocab_size, 256, input_length=seq_length),
    Bidirectional(LSTM(512, return_sequences=True)),
    Dropout(0.4),
    BatchNormalization(),
    Bidirectional(LSTM(512, return_sequences=True)),
    Dropout(0.4),
    BatchNormalization(),
    Bidirectional(LSTM(256)),
    Dropout(0.4),
    Dense(vocab_size, activation="softmax")
])

# Compile model
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])


NameError: name 'Adam' is not defined

In [None]:
checkpoint = ModelCheckpoint("weights-best2.hdf5", monitor="loss", save_best_only=True, verbose=1)
early_stopping = EarlyStopping(monitor="loss", patience=5, restore_best_weights=True)
callbacks = [checkpoint, early_stopping]

# Train the model
model.fit(X, y, epochs=100, batch_size=128, callbacks=callbacks)

In [None]:
# Generate text
def generate_text(seed_sequence, length=1000, temperature=0.7):
    output = []
    for _ in range(length):
        pred_input = np.reshape(seed_sequence, (1, len(seed_sequence)))
        pred_probs = model.predict(pred_input, verbose=0)[0]
        next_idx = sample_with_temperature(pred_probs, temperature)
        output.append(tokenizer.index_word[next_idx])
        seed_sequence = np.append(seed_sequence[1:], next_idx)
    return " ".join(output)

seed_idx = np.random.randint(0, len(X) - 1)
seed_sequence = X[seed_idx]
print("Generated Text:")
print(generate_text(seed_sequence))