<a href="https://colab.research.google.com/github/Enkrumah14/mannyNkrumahGenAi/blob/main/Problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import requests
import re
import os
import gc
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Text preprocessing functions
def clean_text(text):
    # Remove special characters and standardize spacing
    text = re.sub(r'[^a-zA-Z0-9.,!?\'\" \n]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def download_and_combine_texts(urls):
    combined_text = ""
    for url in urls:
        response = requests.get(url)
        text = response.text
        # Remove Project Gutenberg header and footer
        text = re.sub(r'\*\*\* START OF .+?\*\*\*', '', text)
        text = re.sub(r'\*\*\* END OF .+?\*\*\*', '', text)
        text = clean_text(text)
        combined_text += text + "\n\n"
    return combined_text

# Download Shakespeare texts
urls = [
    "https://www.gutenberg.org/files/1041/1041-0.txt",  # Hamlet
    "https://www.gutenberg.org/files/152/152-0.txt",    # Macbeth
    "https://www.gutenberg.org/files/1112/1112-0.txt"   # Othello
]

# Get and preprocess text
text = download_and_combine_texts(urls)
print(f"Total characters: {len(text)}")

# Create character mappings
chars = sorted(list(set(text)))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

# Model parameters
EPOCHS = 5
BATCH_SIZE = 128
SEQ_LENGTH = 150
BUFFER_SIZE = 10000
EMBEDDING_DIM = 512
VOCAB_SIZE = len(chars)
TEMPERATURE = 0.7

def create_sequences(text, seq_length):
    input_sequences = []
    target_sequences = []

    for i in range(0, len(text) - seq_length):
        seq = text[i:i + seq_length]
        target = text[i + 1:i + seq_length + 1]
        input_sequences.append([char_to_idx[char] for char in seq])
        target_sequences.append([char_to_idx[char] for char in target])

    x = np.array(input_sequences)
    y = np.array(target_sequences)
    return x, y

# Prepare dataset
x, y = create_sequences(text, SEQ_LENGTH)

# Split data into training and validation sets
split_index = int(0.8 * len(x))  # 80% for training, 20% for validation
x_train, x_val = x[:split_index], x[split_index:]
y_train, y_val = y[:split_index], y[split_index:]

# Create tf.data.Dataset objects for training and validation
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)


def create_model():
    inputs = layers.Input(shape=(SEQ_LENGTH,))
    x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
    # Add more LSTM layers with higher units
    x = layers.LSTM(1024, return_sequences=True)(x) #return_sequences should be True for all LSTMs except the last one
    x = layers.Dropout(0.2)(x)
    x = layers.LSTM(1024, return_sequences=True)(x) #return_sequences should be True for all LSTMs except the last one
    x = layers.Dropout(0.2)(x)
    x = layers.LSTM(1024, return_sequences=True)(x) #return_sequences added here to match target shape
    x = layers.Dropout(0.2)(x)
    outputs = layers.Dense(VOCAB_SIZE, activation='softmax')(x)

    model = models.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

def generate_text(model, start_string, num_generate=1000, temperature=0.7):
    # Convert start_string to indices
    input_eval = [char_to_idx[s] for s in start_string]

    # Pad the input to match SEQ_LENGTH
    input_eval = tf.keras.preprocessing.sequence.pad_sequences(
        [input_eval], maxlen=SEQ_LENGTH, padding='pre'
    )

    text_generated = []

    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) / temperature

        # Sample from the probability distribution
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Append the predicted character
        text_generated.append(idx_to_char[predicted_id])

        # Update input for the next prediction
        input_eval = tf.concat([input_eval[:, 1:], tf.expand_dims([predicted_id], 0)], axis=-1)

    return start_string + ''.join(text_generated)


# Create and train the model
model = create_model()
model.summary()

# Setup training callbacks
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_prefix + ".weights.h5",
    save_weights_only=True,
    save_freq='epoch'
)

early_stopping = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True
)

lr_schedule = ReduceLROnPlateau(
    monitor='loss',
    factor=0.5,
    patience=2,
    min_lr=0.0001
)

# Train the model
try:
    history = model.fit(
        train_dataset,
        epochs=EPOCHS,
        validation_data=val_dataset,  # Use validation_data instead of validation_split
        batch_size=BATCH_SIZE,
        callbacks=[
            checkpoint_callback,
            early_stopping,
            lr_schedule
        ]
    )
except Exception as e:
    print(f"Training error occurred: {str(e)}")
    try:
        model.save_weights('emergency_backup_weights.h5')
    except:
        print("Could not save emergency backup")
    raise e


# Generate sample texts
prompts = [
    "To be, or not to be",
    "Shall I compare thee to a summer's day?",
    "All the world's a stage"
]

for prompt in prompts:
    print(f"\nPrompt: {prompt}")
    try:
        generated_text = generate_text(
            model,
            prompt,
            num_generate=500,
            temperature=TEMPERATURE
        )
        print(generated_text)
    except Exception as e:
        print(f"Error generating text: {str(e)}")

# Cleanup
tf.keras.backend.clear_session()
gc.collect()

Total characters: 231644


Epoch 1/5
[1m1446/1446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 69ms/step - accuracy: 0.5779 - loss: 1.5438 - val_accuracy: 0.4684 - val_loss: 2.6883 - learning_rate: 0.0010
Epoch 2/5
[1m1446/1446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 69ms/step - accuracy: 0.9091 - loss: 0.3409 - val_accuracy: 0.4759 - val_loss: 2.8547 - learning_rate: 0.0010
Epoch 3/5
[1m1446/1446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 69ms/step - accuracy: 0.9487 - loss: 0.1763 - val_accuracy: 0.4751 - val_loss: 3.2858 - learning_rate: 0.0010
Epoch 4/5
[1m1446/1446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 69ms/step - accuracy: 0.9635 - loss: 0.1202 - val_accuracy: 0.4783 - val_loss: 3.5825 - learning_rate: 0.0010
Epoch 5/5
[1m1446/1446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 69ms/step - accuracy: 0.9677 - loss: 0.1049 - val_accuracy: 0.4778 - val_loss: 3.7131 - learning_rate: 0.0010

Prompt: To be, or not to be
To be, or not to beFo

0