<a href="https://colab.research.google.com/github/DavidSalmon13/GenAI/blob/main/HW5/Problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Name: David Schwartzman
# Date: 11/18/2025
# Description: This notebook builds an LSTM language model that is trained on the text of
#              three books and then generates new sentences in a similar style.

In [None]:
import requests
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import re

In [None]:

# 1. Data Collection from Project Gutenberg
# NOTE(review): the generated samples at the end of this notebook contain words
# such as "rom" and "iule", which suggests these IDs may not be the plays named
# in the trailing comments - confirm the Gutenberg book numbers.
urls = [
    "https://www.gutenberg.org/files/1041/1041-0.txt",  # Hamlet
    "https://www.gutenberg.org/files/152/152-0.txt",   # Macbeth
    "https://www.gutenberg.org/files/1112/1112-0.txt"  # Othello
]

# Download each book. A timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops an HTTP error page from silently
# ending up in the training corpus.
downloaded_texts = []
for url in urls:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    downloaded_texts.append(response.text)

# Combined corpus: every book followed by a blank-line separator
all_text = "".join(book + "\n\n" for book in downloaded_texts)

# Save combined text to a single file
with open("combined_shakespeare.txt", "w", encoding="utf-8") as file:
    file.write(all_text)


In [None]:
# 2. Text Preprocessing (Cleaning, Tokenization)
def clean_text(text):
    """Reduce raw text to lowercase ASCII words separated by single spaces.

    Args:
        text: The raw text to normalise.

    Returns:
        A string in which every character other than a-z/A-Z and whitespace
        has been deleted (so "summer's" becomes "summers"), every run of
        whitespace (newlines, carriage returns, tabs, repeated spaces) has
        been collapsed to one space, letters are lowercased and leading and
        trailing whitespace is removed.
    """
    # Delete digits, punctuation and any other non-letter; whitespace is kept
    # for now so that word boundaries survive.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Collapse whitespace runs so that splitting on " " and str.split() see
    # exactly the same word boundaries.
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()

# Normalise the downloaded corpus with clean_text
cleaned_text = clean_text(all_text)

# Fit a word-level tokenizer on the cleaned corpus; this builds
# tokenizer.word_index, the word -> integer id vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_text])
total_words = len(tokenizer.word_index) + 1  # +1: ids start at 1, index 0 is left for the padding token

# Containers for the training pairs; they are filled by the
# sequence-building loop in a later cell
input_sequences = []
output_words = []

# Number of consecutive words used as context when predicting the next word
sequence_length = 50

In [None]:
# Display the first 10 token:word mappings.
# tokenizer.word_index maps word -> integer token, so each item unpacks as
# (word, token); printing them in "token: word" order matches the heading.
token_word_mappings = list(tokenizer.word_index.items())[:10]
print("Token: Word Mappings (First 10):")
for word, token in token_word_mappings:
    print(f"{token}: {word}")

Token: Word Mappings (First 10):
and: 1
the: 2
to: 3
i: 4
of: 5
my: 6
that: 7
a: 8
in: 9
thou: 10


In [None]:
# Peek at the first training pair: a context window of token ids and the id
# of the word that follows it.
# The lists are only populated by the sequence-building cell, so guard against
# this cell being run first (which would otherwise raise IndexError).
if input_sequences and output_words:
    print("Input:", input_sequences[0])
    print("Output:", output_words[0])
else:
    print("No training sequences yet - run the sequence-building cell first.")


Input: [884, 5, 2, 605, 537, 885, 2, 2695, 28, 2696, 2697, 4, 36, 755, 1315, 84, 390, 1047, 7, 1765, 296, 673, 179, 337, 164, 19, 24, 2, 1766, 66, 28, 69, 1316, 25, 360, 1317, 179, 421, 25, 538, 19, 10, 1767, 3, 123, 184, 322, 75, 2698, 12]
Output: 1048


In [None]:

# 3. Build the (context window -> next word) training pairs

# Split the corpus once. Re-splitting the whole text twice per loop iteration
# made this step quadratic in the corpus length.
corpus_words = cleaned_text.split()

# Passing a list of words makes the tokenizer look up each element as a single
# token, so one call converts the entire corpus to integer ids.
corpus_ids = tokenizer.texts_to_sequences([corpus_words])[0]

# Words missing from the vocabulary are dropped by the tokenizer, which would
# silently shift every window; fail loudly instead.
assert len(corpus_ids) == len(corpus_words), (
    "tokenizer vocabulary does not cover every word in cleaned_text"
)

# Rebuild both lists from scratch so re-running this cell does not append
# duplicate samples to the previous run's data.
input_sequences = [
    corpus_ids[end - sequence_length:end]
    for end in range(sequence_length, len(corpus_ids))
]
output_words = corpus_ids[sequence_length:]

# Convert input sequences and output words to numpy arrays
X = np.array(input_sequences)
y = np.array(output_words)

# One-hot encode the output labels
y = to_categorical(y, num_classes=total_words)


In [None]:
def create_model(vocab_size, sequence_length, lstm_units=128, num_layers=1, embedding_dim=None):
    """Build and compile a stacked-LSTM next-word classifier.

    Architecture: Embedding -> ``num_layers`` LSTM layers -> Dense softmax
    over the vocabulary.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        sequence_length: Number of token ids in each input window.
        lstm_units: Units in every LSTM layer.
        num_layers: Number of stacked LSTM layers; must be at least 1.
        embedding_dim: Width of the embedding vectors. When None, the
            module-level ``EMBEDDING_DIM`` is used, which keeps existing
            calls that rely on that global working unchanged.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError(f"num_layers must be at least 1, got {num_layers}")
    if embedding_dim is None:
        # Backward-compatible fallback to the notebook-level constant
        embedding_dim = EMBEDDING_DIM

    model = models.Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length))

    # Every LSTM except the last returns the full sequence so the next LSTM
    # receives one vector per time step.
    for _ in range(num_layers - 1):
        model.add(layers.LSTM(lstm_units, return_sequences=True))
    # The final LSTM returns only its last output, which feeds the classifier
    model.add(layers.LSTM(lstm_units))

    # Output layer: one probability per vocabulary entry
    model.add(layers.Dense(vocab_size, activation="softmax"))

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


In [None]:

# 4. Training the Model
VOCAB_SIZE = total_words
MAX_LEN = sequence_length
BATCH_SIZE = 64
EPOCHS = 50
EMBEDDING_DIM = 100
SEED = 42

# Seed the random number generators before any weights are created so that a
# fresh Restart & Run All starts from the same initialisation.
tf.keras.utils.set_random_seed(SEED)

# Create the model
lstm_model = create_model(VOCAB_SIZE, MAX_LEN, lstm_units=128, num_layers=3)

lstm_model.summary()

# Train the model. Keeping the returned History object makes the per-epoch
# metrics available afterwards and stops its repr from being the cell output.
history = lstm_model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           644000    
                                                                 
 lstm (LSTM)                 (None, 50, 128)           117248    
                                                                 
 lstm_1 (LSTM)               (None, 50, 128)           131584    
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 6440)              830760    
                                                                 
Total params: 1855176 (7.08 MB)
Trainable params: 1855176 (7.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 2

<keras.src.callbacks.History at 0x7e83180c6890>

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

class TextGenerator:
    """Generate text word by word from a trained next-word model.

    Attributes are set from the constructor arguments:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-token probabilities for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` lookup.
        max_len: Number of token ids in the window fed to the model.
    """

    def __init__(self, model, tokenizer, max_len=50):
        self.model = model
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _sample_index(probabilities, temperature):
        """Draw one token id from ``probabilities`` rescaled by ``temperature``."""
        scaled = np.log(np.asarray(probabilities, dtype="float64") + 1e-7) / temperature
        # Index 0 is the padding id and has no entry in tokenizer.index_word,
        # so it must never be sampled.
        scaled[0] = -np.inf
        # Shift by the maximum before exponentiating so that very low
        # temperatures cannot underflow every weight to zero.
        scaled -= np.max(scaled)
        weights = np.exp(scaled)
        weights /= np.sum(weights)
        return int(np.random.choice(len(weights), p=weights))

    def generate(self, seed_text, max_tokens=50, temperature=1.0):
        """Extend ``seed_text`` with ``max_tokens`` sampled words.

        Args:
            seed_text: Prompt to continue; it is lowercased before use.
            max_tokens: Number of words to append.
            temperature: Positive scaling factor applied to the log
                probabilities before sampling.

        Returns:
            The lowercased seed followed by the generated words, separated
            by single spaces.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        generated_text = seed_text.lower()

        # Tokenize the seed once; sampled ids are appended directly afterwards
        # instead of re-tokenizing the whole growing text on every step.
        token_ids = list(self.tokenizer.texts_to_sequences([generated_text])[0])

        for _ in range(max_tokens):
            # Keep the most recent max_len ids and left-pad with zeros
            window = pad_sequences(
                [token_ids[-self.max_len:]],
                maxlen=self.max_len,
                padding='pre',
                truncating='pre',
            )

            # Predict the distribution of the next word and sample from it
            probabilities = self.model.predict(window, verbose=0)[0]
            next_index = self._sample_index(probabilities, temperature)

            generated_text += " " + self.tokenizer.index_word[next_index]
            token_ids.append(next_index)

        return generated_text


In [None]:

# Wrap the trained model and the fitted tokenizer; max_len is the window
# length that generate() pads or truncates the context to
text_generator = TextGenerator(lstm_model, tokenizer, max_len=MAX_LEN)

# Seed prompts to continue (generate() lowercases them before tokenizing)
# NOTE(review): clean_text removes apostrophes from the training text, so words
# like "summer's" and "world's" are presumably not in the vocabulary - verify
seed_prompts = [
    "to be or not to be",
    "shall I compare thee to a summer's day",
    "all the world's a stage"
]

# For every prompt, sample a 20-word continuation at each temperature.
# generate() divides the log-probabilities by the temperature before sampling
# with np.random.choice, so the text varies between runs unless NumPy's
# random state is seeded.
for prompt in seed_prompts:
    print(f"Seed prompt: {prompt}")
    for temp in [0.1,0.5,1.0]:
        print(f"Temperature: {temp}")
        generated_text = text_generator.generate(prompt, max_tokens=20, temperature=temp)
        print(f"Generated text: {generated_text}\n")

Seed prompt: to be or not to be
Temperature: 0.1
Generated text: to be or not to be short than all the world rom comes me and go by me i am fel i not the letters of

Temperature: 0.5
Generated text: to be or not to be bright at recompense shall soone to arme from cupids arrow calme burne distilld with heat must tongues and siren vile

Temperature: 1.0
Generated text: to be or not to be old without those bound now you on the huswife of the instant grace your white app compare the very eye

Seed prompt: shall I compare thee to a summer's day
Temperature: 0.1
Generated text: shall i compare thee to a summer's day to be peruerse and mayst not proofe for all away speakst thou not iule to moue me to thy wounds

Temperature: 0.5
Generated text: shall i compare thee to a summer's day to be peruerse and say thee not mercy she mer on the churchyard thrice thing and yet not added like

Temperature: 1.0
Generated text: shall i compare thee to a summer's day with feather thou knowst i do defy see f