<a href="https://colab.research.google.com/github/Enkrumah14/mannyNkrumahGenAi/blob/main/Problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:

#author : Manny Nkrumah
#file : Problem1.ipynb
#assignment # : #5
#date : 11/22/24
#description :Text generation with LSTM
#############################################################################################################################
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import requests
import re
import os
import gc
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Model parameters
# Number of epochs requested from model.fit.
EPOCHS = 5
# Sequences per batch for both the training and validation datasets.
BATCH_SIZE = 128
# Characters per training window; also the fixed input length of the model
# and the length prompts are padded to in generate_text.
SEQ_LENGTH = 150
# Size of the shuffle buffer applied to the training dataset.
BUFFER_SIZE = 10000
# Output dimension of the character Embedding layer.
EMBEDDING_DIM = 512
# Sampling temperature passed to generate_text for the sample prompts.
TEMPERATURE = 0.7

def clean_text(text):
    """Reduce raw text to a small character set separated by single spaces.

    Any character other than ASCII letters, digits, the punctuation
    . , ! ? ' " , a space or a newline is replaced by a space. Runs of
    whitespace (newlines included) are then collapsed to one space and
    leading/trailing whitespace is removed.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9.,!?\'" \n]')
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', disallowed.sub(' ', text)).strip()

def download_and_combine_texts(urls, timeout=30):
    """Download every URL, clean the text, and concatenate the results.

    For each document the "*** START OF ...***" and "*** END OF ...***"
    marker lines are removed (only the markers themselves; the text before
    and after them is kept), then the text is passed through clean_text.

    Args:
        urls: Iterable of URLs pointing at plain-text documents.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A single string with every cleaned document followed by two
        newline characters. An empty iterable yields an empty string.

    Raises:
        requests.RequestException: If a request times out, cannot connect,
            or the server answers with an HTTP error status.
    """
    start_marker = re.compile(r'\*\*\* START OF .+?\*\*\*')
    end_marker = re.compile(r'\*\*\* END OF .+?\*\*\*')

    documents = []
    for url in urls:
        # Without a timeout requests can block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of feeding an HTML error page into the corpus.
        response.raise_for_status()
        text = start_marker.sub('', response.text)
        text = end_marker.sub('', text)
        documents.append(clean_text(text))

    # Join once rather than growing a string inside the loop.
    return "".join(f"{doc}\n\n" for doc in documents)

def create_sequences(text, seq_length, vocab=None):
    """Build overlapping (input, target) index windows for next-char training.

    Window i covers text[i:i + seq_length]; its target is the same window
    shifted one character to the right. Windows advance one character at a
    time, producing len(text) - seq_length pairs.

    Args:
        text: Source string to slice into windows.
        seq_length: Number of characters per window; must be at least 1.
        vocab: Optional mapping from character to integer index. When
            omitted, the module-level char_to_idx mapping is used, which
            keeps the original two-argument call working.

    Returns:
        Tuple (inputs, targets) of int64 arrays, each of shape
        (len(text) - seq_length, seq_length). If the text is not longer
        than seq_length, both arrays have shape (0, seq_length).

    Raises:
        ValueError: If seq_length is smaller than 1.
        KeyError: If text contains a character missing from the mapping.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    mapping = char_to_idx if vocab is None else vocab

    if len(text) <= seq_length:
        empty = np.empty((0, seq_length), dtype=np.int64)
        return empty, empty.copy()

    # Encode each character exactly once instead of once per window.
    encoded = np.fromiter(
        (mapping[ch] for ch in text), dtype=np.int64, count=len(text)
    )

    # All len(text) - seq_length + 1 windows as a zero-copy view; inputs drop
    # the last window and targets drop the first, giving the one-step shift.
    windows = np.lib.stride_tricks.sliding_window_view(encoded, seq_length)
    # .copy() turns the read-only strided views into ordinary writable arrays.
    return windows[:-1].copy(), windows[1:].copy()

def create_model(vocab_size):
    """Build and compile the stacked-LSTM character model.

    The network takes SEQ_LENGTH character indices, embeds them into
    EMBEDDING_DIM dimensions, passes them through three LSTM(1024) layers
    (each returning the full sequence and followed by Dropout(0.3)), and
    ends in a softmax Dense layer over the vocabulary at every timestep.

    Args:
        vocab_size: Number of distinct characters; used for both the
            embedding input size and the output layer width.

    Returns:
        The compiled Keras model (Adam with lr 0.001 and clipnorm 1.0,
        sparse categorical cross-entropy loss, accuracy metric).
    """
    inputs = layers.Input(shape=(SEQ_LENGTH,))
    hidden = layers.Embedding(vocab_size, EMBEDDING_DIM)(inputs)

    # Three identical recurrent stages, each regularised with dropout.
    for _ in range(3):
        hidden = layers.LSTM(1024, return_sequences=True)(hidden)
        hidden = layers.Dropout(0.3)(hidden)

    outputs = layers.Dense(vocab_size, activation='softmax')(hidden)

    model = models.Model(inputs, outputs)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

def generate_text(model, start_string, char_to_idx, idx_to_char, num_generate=1000, temperature=0.7):
    """Sample characters from the model one at a time, starting from a prompt.

    The prompt is encoded, left-padded (or left-truncated) to SEQ_LENGTH,
    and fed to the model. After each step the sampled character is appended
    and the oldest position is dropped so the window stays SEQ_LENGTH long.

    Args:
        model: Callable returning per-timestep probabilities of shape
            (1, SEQ_LENGTH, vocab), as the softmax head built by
            create_model does.
        start_string: Prompt text; every character must be in char_to_idx.
        char_to_idx: Mapping from character to integer index.
        idx_to_char: Mapping from integer index back to character.
        num_generate: Number of characters to sample.
        temperature: Sampling temperature; values below 1 sharpen the
            distribution, values above 1 flatten it. Must be positive.

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: If temperature is not positive.
        KeyError: If start_string contains a character not in char_to_idx.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_eval = [char_to_idx[s] for s in start_string]
    # NOTE: short prompts are left-padded with index 0 (the pad_sequences
    # default), which is also a regular character index in this vocabulary.
    input_eval = tf.keras.preprocessing.sequence.pad_sequences(
        [input_eval], maxlen=SEQ_LENGTH, padding='pre'
    )

    text_generated = []

    for _ in range(num_generate):
        # Only the distribution at the final timestep predicts the next char.
        probabilities = model(input_eval, training=False)[0, -1, :]
        # tf.random.categorical expects unnormalised log-probabilities, but the
        # model emits softmax probabilities: take the log before applying the
        # temperature. The epsilon guards against log(0).
        logits = tf.math.log(probabilities + 1e-9) / temperature
        sample = tf.random.categorical(tf.expand_dims(logits, 0), num_samples=1)
        predicted_id = int(sample[0, 0].numpy())
        text_generated.append(idx_to_char[predicted_id])

        # Slide the window: drop the oldest index, append the new one, and
        # keep the dtype stable so every step sees the same input type.
        next_column = np.array([[predicted_id]], dtype=input_eval.dtype)
        input_eval = np.concatenate([input_eval[:, 1:], next_column], axis=1)

    return start_string + ''.join(text_generated)

# Main execution
if __name__ == "__main__":
    # Download the three plays and build the character vocabulary.
    urls = [
        "https://www.gutenberg.org/files/1041/1041-0.txt",  # Hamlet
        "https://www.gutenberg.org/files/152/152-0.txt",    # Macbeth
        "https://www.gutenberg.org/files/1112/1112-0.txt"   # Othello
    ]

    text = download_and_combine_texts(urls)
    chars = sorted(set(text))
    # char_to_idx must stay a module-level name: create_sequences reads it.
    char_to_idx = {char: idx for idx, char in enumerate(chars)}
    idx_to_char = {idx: char for idx, char in enumerate(chars)}
    VOCAB_SIZE = len(chars)

    # Prepare dataset: first 90% of the windows train, the last 10% validate.
    x, y = create_sequences(text, SEQ_LENGTH)
    split_index = int(0.9 * len(x))  # 90-10 split
    x_train, x_val = x[:split_index], x[split_index:]
    y_train, y_val = y[:split_index], y[split_index:]

    train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
    val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

    # Create and train model
    model = create_model(VOCAB_SIZE)

    # checkpoint_prefix was referenced but never defined in this file, so a
    # fresh run failed with NameError; define it (and its directory) here.
    checkpoint_dir = "./training_checkpoints"
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

    callbacks = [
        # Save the weights at the end of every epoch.
        ModelCheckpoint(
            filepath=checkpoint_prefix + ".weights.h5",
            save_weights_only=True,
            save_freq='epoch'
        ),
        # Stop once validation loss has not improved for 3 epochs and roll
        # back to the best weights seen.
        EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        ),
        # Halve the learning rate after 2 epochs without improvement.
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=2,
            min_lr=0.0001
        )
    ]

    try:
        history = model.fit(
            train_dataset,
            epochs=EPOCHS,
            validation_data=val_dataset,
            callbacks=callbacks
        )
    except Exception as e:
        print(f"Training error: {e}")
        # Keras 3 only accepts weight files ending in ".weights.h5"; the old
        # ".h5" name would raise here and hide the real training error.
        model.save_weights('emergency_backup.weights.h5')
        # Bare raise keeps the original traceback intact.
        raise

    # Generate sample texts
    prompts = [
        "To be, or not to be",
        "Shall I compare thee to a summer's day?",
        "All the world's a stage"
    ]

    for prompt in prompts:
        print(f"\nPrompt: {prompt}")
        # Best effort: a failure on one prompt should not skip the others.
        try:
            generated_text = generate_text(
                model,
                prompt,
                char_to_idx,
                idx_to_char,
                num_generate=500,
                temperature=TEMPERATURE
            )
            print(generated_text)
        except Exception as e:
            print(f"Generation error: {e}")

    # Cleanup: release Keras/TensorFlow state and collect garbage.
    tf.keras.backend.clear_session()
    gc.collect()

Epoch 1/5
1627/1627 ━━━━━━━━━━━━━━━━━━━━ 110s 66ms/step - accuracy: 0.6144 - loss: 1.4185 - val_accuracy: 0.4756 - val_loss: 2.6866 - learning_rate: 0.0010
Epoch 2/5
1627/1627 ━━━━━━━━━━━━━━━━━━━━ 107s 66ms/step - accuracy: 0.8937 - loss: 0.3842 - val_accuracy: 0.4864 - val_loss: 2.5007 - learning_rate: 0.0010
Epoch 3/5
1627/1627 ━━━━━━━━━━━━━━━━━━━━ 107s 66ms/step - accuracy: 0.9376 - loss: 0.2129 - val_accuracy: 0.4808 - val_loss: 2.9035 - learning_rate: 0.0010
Epoch 4/5
1627/1627 ━━━━━━━━━━━━━━━━━━━━ 107s 66ms/step - accuracy: 0.9549 - loss: 0.1488 - val_accuracy: 0.4838 - val_loss: 3.1516 - learning_rate: 0.0010
Epoch 5/5
1627/1627 ━━━━━━━━━━━━━━━━━━━━ 107s 66ms/step - accuracy: 0.9594 - loss: 0.1322 - val_accuracy: 0.4824 - val_loss: 3.5358 - learning_rate: 5.0000e-04

Prompt: To be, or not to be
To be, or not to 