<a href="https://colab.research.google.com/github/1bensupriatma/GenerativeAI/blob/main/HW5/Problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

In [8]:
VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

In [2]:
import requests

# List of URLs for additional texts (e.g., different Shakespeare plays)
urls = [
      "https://www.gutenberg.org/files/766/766-0.txt",  # David Copperfield
      "https://gutenberg.org/files/24022/24022-0.txt",   # A Christmas Carol
      "https://www.gutenberg.org/files/564/564-0.txt"   # The Mystery of Edwin Drood
      ]

# Initialize an empty string to hold all text
all_text = ""

      # Download each text file and append to all_text
for url in urls:
  response = requests.get(url)
  text = response.text
  all_text += text + "\n\n"  # Separate texts by newlines

# Save combined text to a single file
with open("combined_dickens.txt", "w", encoding="utf-8") as file:
  file.write(all_text)

In [3]:
with open("combined_dickens.txt", "r", encoding="utf-8") as file:
    all_text = file.read()

# Split the text into sentences or lines (adjust as needed)
text_data = all_text.split("\n")  # Split by newline

filtered_data = [
    "Text: " + line
    for line in text_data
    if line.strip()
]

In [6]:
example = filtered_data[15043]
print(example)

Text: on intoxication, employed by a broker. That individual is in legal


In [10]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


text_data = [pad_punctuation(x) for x in filtered_data]

In [11]:
example_data = text_data[15043]
print(example_data)

Text : on intoxication , employed by a broker . That individual is in legal


In [12]:
# Convert to a Tensorflow Dataset
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [13]:
# Create a vectorisation layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [14]:
# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [15]:
# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: ,
3: :
4: text
5: .
6: the
7: and
8: i
9: to


In [16]:
# Display the same example converted to ints
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[   4    3   34 5906    2 1563   45   11 5405    5   15 1726   40   12
 2136    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [17]:
# Create the training set of recipes and the same text shifted by one word
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


train_ds = text_ds.map(prepare_inputs)

Task 2: Initial LSTM Model Training

In [18]:
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

In [19]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [20]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  # <1>

    def sample_from(self, probs, temperature):  # <2>
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]  # <3>
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:  # <4>
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  # <5>
            sample_token, probs = self.sample_from(y[0][-1], temperature)  # <6>
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)  # <7>
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("It was a rainy day. ", max_tokens=100, temperature=1.0)

In [21]:
# Tokenize starting prompt
text_generator = TextGenerator(vocab)
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/25
[1m1338/1339[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 41ms/step - loss: 1.1081
generated text:
It was a rainy day.  tinker starving ‘o “em’ly character nooks footstool articulate thorough problem blossom groups penitence tat bells oftener detail squeeze engage kind rackham handy obligation so fast , it my 

[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 42ms/step - loss: 1.1073
Epoch 2/25
[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 0.3898
generated text:
It was a rainy day.  u conciliatory defenceless opposite applicable ma unbounded dutifully waiter instruction wardrobes thousands deferential advertisement watchfully solution wheel frankness fortnight shipwrecked spine micawbers poker sheffield bodgers puffy casks chinese worship fears native substantial breezy circumstance manifest landscape tempt brothers mother’s stretched colour stationary joints robbers music aired airing pollis cage child’s 

<keras.src.callbacks.history.History at 0x797968402050>

In [22]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [30]:
info = text_generator.generate(
    "On the night of", max_tokens=10, temperature=1.0
)


generated text:
On the night of unfolded forwards girl’s ‘confound root repel



In [31]:
print_probs(info, vocab)


PROMPT: On the night of
dilber:   	0.72%
forwards:   	0.45%
forged:   	0.43%
mentally:   	0.43%
—for:   	0.42%
--------


PROMPT: On the night of unfolded
’old:   	1.57%
stakes:   	0.71%
sarcasm:   	0.54%
packets:   	0.42%
riches:   	0.39%
--------


PROMPT: On the night of unfolded forwards
selfishly:   	0.41%
brush:   	0.33%
wretches:   	0.29%
nurse’s:   	0.29%
quarrelled:   	0.27%
--------


PROMPT: On the night of unfolded forwards girl’s
—for:   	1.1%
staggering:   	0.74%
—but:   	0.55%
peregrine:   	0.51%
“peggotty:   	0.49%
--------


PROMPT: On the night of unfolded forwards girl’s ‘confound
unclean:   	0.8%
unfit:   	0.71%
dissipation:   	0.67%
diminished:   	0.64%
tape:   	0.64%
--------


PROMPT: On the night of unfolded forwards girl’s ‘confound root
tm:   	0.3%
toes:   	0.29%
denying:   	0.27%
resumption:   	0.25%
referee:   	0.25%
--------



In [34]:
info2 = text_generator.generate(
    "On the night of", max_tokens=10, temperature=0.2
)


generated text:
On the night of forwards stakes insulted stakes warmer tape



In [36]:
print_probs(info2, vocab)


PROMPT: On the night of
dilber:   	62.54%
forwards:   	5.74%
forged:   	5.02%
mentally:   	4.97%
—for:   	3.99%
--------


PROMPT: On the night of forwards
stakes:   	45.85%
indistinctly:   	11.48%
waistcoats:   	8.38%
grinby’s:   	4.13%
beau:   	3.46%
--------


PROMPT: On the night of forwards stakes
’old:   	61.73%
insulted:   	14.93%
warmer:   	6.44%
blas:   	3.54%
tales:   	1.97%
--------


PROMPT: On the night of forwards stakes insulted
stakes:   	26.77%
selfishly:   	15.35%
harvest:   	14.85%
valedictory:   	10.21%
forwards:   	5.98%
--------


PROMPT: On the night of forwards stakes insulted stakes
warmer:   	54.84%
’old:   	30.94%
blas:   	8.8%
split:   	1.11%
limitation:   	0.86%
--------


PROMPT: On the night of forwards stakes insulted stakes warmer
tape:   	41.95%
depend:   	22.24%
denying:   	17.11%
dial:   	7.39%
warmer:   	5.65%
--------



In [40]:
info3 = text_generator.generate(
    "On the night of", max_tokens=12, temperature=0.7
)


generated text:
On the night of forwards waistcoats crazy twist playfellow mite interfere grinby’s



In [41]:
print_probs(info3, vocab)


PROMPT: On the night of
dilber:   	2.53%
forwards:   	1.28%
forged:   	1.23%
mentally:   	1.23%
—for:   	1.15%
--------


PROMPT: On the night of forwards
stakes:   	1.71%
indistinctly:   	1.15%
waistcoats:   	1.05%
grinby’s:   	0.86%
beau:   	0.82%
--------


PROMPT: On the night of forwards waistcoats
wholesale:   	2.44%
declaring:   	1.93%
artifice:   	1.91%
’old:   	1.48%
prisons:   	1.33%
--------


PROMPT: On the night of forwards waistcoats crazy
whereat:   	2.5%
“uncle:   	2.06%
—for:   	1.77%
overhung:   	0.98%
whatsoever:   	0.9%
--------


PROMPT: On the night of forwards waistcoats crazy twist
’old:   	1.87%
earliest:   	1.64%
voted:   	1.2%
wrists:   	1.09%
lowered:   	0.99%
--------


PROMPT: On the night of forwards waistcoats crazy twist playfellow
unshaded:   	1.7%
’old:   	1.17%
aggravating:   	1.17%
stakes:   	1.16%
selfishly:   	1.08%
--------


PROMPT: On the night of forwards waistcoats crazy twist playfellow mite
testamentary:   	2.15%
’old:   	2.09%
grieve:   	

Task 3: Experiment with Model Complexity

In [42]:
def lstm2(num_layers=2, num_units=256, dropout_rate=0.2):
    inputs = layers.Input(shape=(None,), dtype="int32")
    x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
    for _ in range(num_layers):
        x = layers.LSTM(num_units, return_sequences=True)(x)
        x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
    lstm_model = models.Model(inputs, outputs)
    return lstm_model

model2 = lstm2()
model2.summary()

In [43]:
lstm_2 = lstm2(num_layers=2, num_units=256)
lstm_2.compile("adam", loss_fn)

lstm_2.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/25
[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - loss: 0.8811
generated text:
It was a rainy day.  dealing prophetic prophetic disparity geese plunge nat’ral him and have time it the exclaiming , [UNK] . you in curling . my , , and 

[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 80ms/step - loss: 0.8808
Epoch 2/25
[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - loss: 0.3992
generated text:
It was a rainy day.  complete , 

[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 81ms/step - loss: 0.3992
Epoch 3/25
[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 0.3683
generated text:
It was a rainy day.  win if miss murdstone are 

[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 81ms/step - loss: 0.3683
Epoch 4/25
[1m1339/1339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 0.3511
generated te

<keras.src.callbacks.history.History at 0x79786068c8e0>

Task 4

"Every traveler has a"

In [58]:
info = text_generator.generate(
    "Every traveler has a", max_tokens=10, temperature=1.0
)


generated text:
Every traveler has a poor piece 



In [63]:
info = text_generator.generate(
    "Every traveler has a", max_tokens=10, temperature=0.1
)


generated text:
Every traveler has a 



In [61]:
info = text_generator.generate(
    "Every traveler has a", max_tokens=10, temperature=0.5
)


generated text:
Every traveler has a smile , that i was not



In [64]:
info = text_generator.generate(
    "Every traveler has a home of his own", max_tokens=10, temperature=2.0
)


generated text:
Every traveler has a home of his own hope !



"whatever I have tried to do"

In [65]:
info = text_generator.generate(
    "whatever I have tried to do", max_tokens=10, temperature=1.0
)


generated text:
whatever I have tried to do - - well .



In [66]:
info = text_generator.generate(
    "whatever I have tried to do", max_tokens=10, temperature=0.5
)


generated text:
whatever I have tried to do it . ’ 



In [67]:
info = text_generator.generate(
    "whatever I have tried to do", max_tokens=10, temperature=0.1
)


generated text:
whatever I have tried to do , and i am



In [68]:
info = text_generator.generate(
    "whatever I have tried to do", max_tokens=10, temperature=2.0
)


generated text:
whatever I have tried to do but regret christian pen



"You are more beautiful in"

In [70]:
info = text_generator.generate(
    "You are more beautiful in", max_tokens=10, temperature=1.0
)


generated text:
You are more beautiful in me to offer the 



In [71]:
info = text_generator.generate(
    "You are more beautiful in", max_tokens=10, temperature=0.5
)


generated text:
You are more beautiful in the world . ’ 



In [72]:
info = text_generator.generate(
    "You are more beautiful in", max_tokens=10, temperature=0.1
)


generated text:
You are more beautiful in my thoughts as i 



In [73]:
info = text_generator.generate(
    "You are more beautiful in", max_tokens=10, temperature=2.0
)


generated text:
You are more beautiful in ! warn’t 



I have observed that more creative responses come from the higher temeratures, specifically 1.0 and 2.0. On the lower temperatures, the sentences are less coherent but still make sense. There is a tradeoff of increasing and descreasing the temperature between the creativity and coherence of the sentences.

TASK 5:

coherence, relevance, stylistic accuracy
The coherence of the sentences were very incoherent in the one layer lstm. Most of the sentences seemed like one run on sentence of random words from the selected pieces of text. The 2 layer LSTM has shorter sentences, but more coherent sentences.
The relevance of the generated text was in the point of making a scene of the story. This made all the sentences a scene builder, painting the background of a story.
The stylistic accuarcy picked up on the Dickens style of writing, but with a more complex model we could have a better style of writing comparing to Dickens.